diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index a400f5cd0d..f0f99a2665 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -26,6 +26,7 @@ permissions: jobs: dependency-review: + if: github.repository == 'GoogleCloudPlatform/hpc-toolkit' runs-on: ubuntu-latest steps: - name: 'Checkout Repository' diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index 3d7667c615..dda59b7161 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -32,6 +32,7 @@ on: jobs: pr-label-validation: + if: github.repository == 'GoogleCloudPlatform/hpc-toolkit' runs-on: ubuntu-latest permissions: pull-requests: read diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml new file mode 100644 index 0000000000..23ffc30cf7 --- /dev/null +++ b/.github/workflows/pr-precommit.yml @@ -0,0 +1,55 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +name: 'Use pre-commit to validate Pull Request' + +# yamllint disable-line rule:truthy +on: + pull_request: + types: + - edited + - opened + - labeled + - synchronize + branches: + - develop + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + check-latest: true + cache: 'pip' + - uses: actions/setup-go@v5 + with: + go-version: '1.22' + check-latest: true + - run: make install-dev-deps + - uses: terraform-linters/setup-tflint@v4 + with: + tflint_version: v0.49.0 + - uses: pre-commit/action@v3.0.1 + - uses: pre-commit-ci/lite-action@v1.0.2 + # this if statement looks funny but it ensures that this step runs + # only if: user has applied "pre-commit-autofix" label + # even if: job has failed + # not if: job is canceled + if: | + (success() || failure()) && + contains(github.event.pull_request.labels.*.name, 'pre-commit-autofix') diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f7e9acd152..523e20fd6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ --- repos: - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.82.0 + rev: v1.86.0 hooks: - id: terraform_fmt - id: terraform_tflint @@ -82,16 +82,16 @@ repos: # hooks: # - id: go-critic - repo: https://github.com/Bahjat/pre-commit-golang - rev: v1.0.2 + rev: v1.0.3 hooks: - id: go-static-check - repo: https://github.com/adrienverge/yamllint - rev: v1.32.0 + rev: v1.34.0 hooks: - id: yamllint - args: [-c=.yamllint] + args: [-c=.yamllint, --no-warnings] - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.12 + rev: v0.9.17 hooks: - id: pymarkdown # Rules at https://github.com/jackdewinter/pymarkdown/tree/main/docs/rules @@ -101,7 +101,8 @@ repos: # MD034 - Bare URL used # MD041 - First line in file should be a top level header # MD046 - Code block style - args: [--disable-rules, 
"MD013,MD022,MD033,MD034,MD041,MD046", scan] + # MD024 - Multiple headings cannot contain the same content. + args: [--disable-rules, "MD013,MD022,MD033,MD034,MD041,MD046,MD024", scan] - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: "3.0.0" hooks: @@ -110,7 +111,7 @@ repos: - id: shfmt exclude: ".*tpl" - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: end-of-file-fixer - repo: https://github.com/codespell-project/codespell diff --git a/Makefile b/Makefile index 4efab7a3ac..d452cddf93 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ ifneq (, $(shell which git)) ifneq (,$(wildcard .git)) ## GIT DIRECTORY EXISTS GIT_TAG_VERSION=$(shell git tag --points-at HEAD) -GIT_BRANCH=$(shell git branch --show-current) +GIT_BRANCH=$(shell $(SHELL) -c 'git branch --show-current || git rev-parse --abbrev-ref HEAD' 2>/dev/null) GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long --always) GIT_COMMIT_HASH=$(shell git rev-parse HEAD) GIT_INITIAL_HASH=$(shell git rev-list --max-parents=0 HEAD) diff --git a/cmd/color.go b/cmd/color.go index 9eab8ef6f2..171c5767c4 100644 --- a/cmd/color.go +++ b/cmd/color.go @@ -30,7 +30,7 @@ func init() { } func addColorFlag(flagset *pflag.FlagSet) { - flagset.BoolVar(&noColorFlag, "no-color", true, "Disable colorized output.") + flagset.BoolVar(&noColorFlag, "no-color", false, "Disable colorized output.") } func initColor() { diff --git a/cmd/create.go b/cmd/create.go index 21ff70dbbe..ccf5dc3124 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -52,6 +52,10 @@ func init() { "Note: Terraform state IS preserved. \n"+ "Note: Terraform workspaces are NOT supported (behavior undefined). \n"+ "Note: Packer is NOT supported.") + createCmd.Flags().BoolVar(&forceOverwrite, "force", false, + "Forces overwrite of existing deployment directory. \n"+ + "If set, --overwrite-deployment is implied. 
\n"+ + "No validation is performed on the existing deployment directory.") rootCmd.AddCommand(createCmd) } @@ -62,6 +66,7 @@ var ( cliBEConfigVars []string overwriteDeployment bool + forceOverwrite bool validationLevel string validationLevelDesc = "Set validation level to one of (\"ERROR\", \"WARNING\", \"IGNORE\")" validatorsToSkip []string @@ -80,7 +85,7 @@ var ( func runCreateCmd(cmd *cobra.Command, args []string) { dc := expandOrDie(args[0]) deplDir := filepath.Join(outputDir, dc.Config.DeploymentName()) - checkErr(checkOverwriteAllowed(deplDir, dc.Config, overwriteDeployment)) + checkErr(checkOverwriteAllowed(deplDir, dc.Config, overwriteDeployment, forceOverwrite)) checkErr(modulewriter.WriteDeployment(dc, deplDir)) logging.Info("To deploy your infrastructure please run:") @@ -110,7 +115,7 @@ func expandOrDie(path string) config.DeploymentConfig { logging.Fatal("Failed to set the backend config at CLI: %v", err) } checkErr(setValidationLevel(&dc.Config, validationLevel)) - checkErr(skipValidators(&dc)) + skipValidators(&dc) if dc.Config.GhpcVersion != "" { logging.Info("ghpc_version setting is ignored.") @@ -159,58 +164,6 @@ func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { } -func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) { - pos, ok := ctx.Pos(path) - for !ok && path.Parent() != nil { - path = path.Parent() - pos, ok = ctx.Pos(path) - } - return pos, ok -} - -func renderError(err error, ctx config.YamlCtx) string { - switch te := err.(type) { - case config.Errors: - var sb strings.Builder - for _, e := range te.Errors { - sb.WriteString(renderError(e, ctx)) - sb.WriteString("\n") - } - return sb.String() - case validators.ValidatorError: - title := boldRed(fmt.Sprintf("validator %q failed:", te.Validator)) - return fmt.Sprintf("%s\n%v\n", title, renderError(te.Err, ctx)) - case config.BpError: - if pos, ok := findPos(te.Path, ctx); ok { - return renderRichError(te.Err, pos, ctx) - } - return renderError(te.Err, ctx) - default: - return err.Error() - } -} - -func renderRichError(err error, pos config.Pos, ctx config.YamlCtx) string { - line := pos.Line - 1 - if line < 0 { - line = 0 - } - if line >= len(ctx.Lines) { - line = len(ctx.Lines) - 1 - } - - pref := fmt.Sprintf("%d: ", pos.Line) - arrow := " " - if pos.Column > 0 { - spaces := strings.Repeat(" ", len(pref)+pos.Column-1) - arrow = spaces + "^" - } - - return fmt.Sprintf(`%s: %s -%s%s -%s`, boldRed("Error"), err, pref, ctx.Lines[line], arrow) -} - func setCLIVariables(bp *config.Blueprint, s []string) error { for _, cliVar := range s { arr := strings.SplitN(cliVar, "=", 2) @@ -268,16 +221,10 @@ func setValidationLevel(bp *config.Blueprint, s string) error { return nil } -func skipValidators(dc *config.DeploymentConfig) error { - if validatorsToSkip == nil { - return nil - } +func skipValidators(dc *config.DeploymentConfig) { for _, v := range validatorsToSkip { - if err := dc.SkipValidator(v); err != nil { - return err - } + dc.SkipValidator(v) } - return nil } func filterYaml(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { @@ -287,29 +234,37 @@ func filterYaml(cmd *cobra.Command, args []string, toComplete string) ([]string, return []string{"yaml", "yml"}, cobra.ShellCompDirectiveFilterFileExt } +func forceErr(err error) error { + return config.HintError{ + Err: err, + Hint: "Use `--force` to overwrite the deployment anyway. 
Proceed at your own risk."} +} + // Determines if overwrite is allowed -func checkOverwriteAllowed(depDir string, bp config.Blueprint, overwriteFlag bool) error { - if _, err := os.Stat(depDir); os.IsNotExist(err) { +func checkOverwriteAllowed(depDir string, bp config.Blueprint, overwriteFlag bool, forceFlag bool) error { + if _, err := os.Stat(depDir); os.IsNotExist(err) || forceFlag { return nil // all good, no previous deployment } if _, err := os.Stat(modulewriter.HiddenGhpcDir(depDir)); os.IsNotExist(err) { // hidden ghpc dir does not exist - return fmt.Errorf("folder %q already exists, and it is not a valid GHPC deployment folder", depDir) + return forceErr(fmt.Errorf("folder %q already exists, and it is not a valid GHPC deployment folder", depDir)) } // try to get previous deployment expPath := filepath.Join(modulewriter.ArtifactsDir(depDir), modulewriter.ExpandedBlueprintName) if _, err := os.Stat(expPath); os.IsNotExist(err) { - return fmt.Errorf("expanded blueprint file %q is missing, this could be a result of changing GHPC version between consecutive deployments", expPath) + return forceErr(fmt.Errorf("expanded blueprint file %q is missing, this could be a result of changing GHPC version between consecutive deployments", expPath)) } prev, _, err := config.NewDeploymentConfig(expPath) if err != nil { - return err + return forceErr(err) } if prev.Config.GhpcVersion != bp.GhpcVersion { - logging.Info("WARNING: ghpc_version has changed from %q to %q, using different versions of GHPC to update a live deployment is not officially supported. Proceed at your own risk", prev.Config.GhpcVersion, bp.GhpcVersion) + return forceErr(fmt.Errorf( + "ghpc_version has changed from %q to %q, using different versions of GHPC to update a live deployment is not officially supported", + prev.Config.GhpcVersion, bp.GhpcVersion)) } if !overwriteFlag { @@ -323,7 +278,7 @@ func checkOverwriteAllowed(depDir string, bp config.Blueprint, overwriteFlag boo for _, g := range prev.Config.DeploymentGroups { if !newGroups[g.Name] { - return fmt.Errorf("you are attempting to remove a deployment group %q, which is not supported", g.Name) + return forceErr(fmt.Errorf("you are attempting to remove a deployment group %q, which is not supported", g.Name)) } } diff --git a/cmd/create_test.go b/cmd/create_test.go index 94454fe26a..891c1680df 100644 --- a/cmd/create_test.go +++ b/cmd/create_test.go @@ -15,7 +15,6 @@ package cmd import ( - "errors" "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/modulewriter" "os" @@ -75,9 +74,12 @@ func (s *MySuite) TestSetCLIVariables(c *C) { // Failure: Variable without '=' bp = config.Blueprint{} inv := []string{"project_idcli_test_project_id"} + c.Check(setCLIVariables(&bp, inv), ErrorMatches, "invalid format: .*") - c.Assert(setCLIVariables(&bp, inv), ErrorMatches, "invalid format: .*") - c.Check(bp.Vars, DeepEquals, config.Dict{}) + // Failure: Unmarshalable value + bp = config.Blueprint{} + inv = []string{"pyrite={gold"} + c.Check(setCLIVariables(&bp, inv), ErrorMatches, ".*unable to convert.*pyrite.*gold.*") } func (s *MySuite) TestSetBackendConfig(c *C) { @@ -133,32 +135,6 @@ func (s *MySuite) TestValidationLevels(c *C) { c.Check(setValidationLevel(&bp, "INVALID"), NotNil) } -func (s *MySuite) TestRenderError(c *C) { - { // simple - err := errors.New("arbuz") - got := renderError(err, config.YamlCtx{}) - c.Check(got, Equals, "arbuz") - } - { // has pos, but context doesn't contain it - ctx, _ := config.NewYamlCtx([]byte(``)) - pth := config.Root.Vars.Dot("kale") - err := 
config.BpError{Path: pth, Err: errors.New("arbuz")} - got := renderError(err, ctx) - c.Check(got, Equals, "arbuz") - } - { // has pos, has context - ctx, _ := config.NewYamlCtx([]byte(` -vars: - kale: dos`)) - pth := config.Root.Vars.Dot("kale") - err := config.BpError{Path: pth, Err: errors.New("arbuz")} - got := renderError(err, ctx) - c.Check(got, Equals, `Error: arbuz -3: kale: dos - ^`) - } -} - func (s *MySuite) TestValidateMaybeDie(c *C) { bp := config.Blueprint{ Validators: []config.Validator{{Validator: "invalid"}}, @@ -173,16 +149,20 @@ func (s *MySuite) TestIsOverwriteAllowed_Absent(c *C) { depDir := filepath.Join(testDir, "casper") bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), IsNil) - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), IsNil) + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), IsNil) + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_NotGHPC(c *C) { depDir := c.MkDir() // empty deployment folder considered malformed bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), ErrorMatches, ".* not a valid GHPC deployment folder") - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), ErrorMatches, ".* not a valid GHPC deployment folder") + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* not a valid GHPC deployment folder.*") + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* not a valid GHPC deployment folder.*") + + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, true /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_NoExpanded(c *C) { @@ -192,8 +172,12 @@ func (s *MySuite) TestIsOverwriteAllowed_NoExpanded(c *C) { } bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), ErrorMatches, ".* changing GHPC version.*") - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), ErrorMatches, ".* changing GHPC version.*") + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* changing GHPC version.*") + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* changing GHPC version.*") + + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, true /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_Malformed(c *C) { @@ -207,36 +191,57 @@ func (s *MySuite) TestIsOverwriteAllowed_Malformed(c *C) { } bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), NotNil) - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), NotNil) + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), NotNil) + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), NotNil) + // force + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, true /*forceOverwrite*/), IsNil) + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, true /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_Present(c *C) { - depDir := c.MkDir() - artDir := modulewriter.ArtifactsDir(depDir) + p := c.MkDir() + artDir := 
modulewriter.ArtifactsDir(p) if err := os.MkdirAll(artDir, 0755); err != nil { c.Fatal(err) } prev := config.DeploymentConfig{ Config: config.Blueprint{ - GhpcVersion: "TaleOdBygoneYears", + GhpcVersion: "TaleOfBygoneYears", DeploymentGroups: []config.DeploymentGroup{ {Name: "isildur"}}}} if err := prev.ExportBlueprint(filepath.Join(artDir, "expanded_blueprint.yaml")); err != nil { c.Fatal(err) } + noW, yesW, noForce, yesForce := false, true, false, true - super := config.Blueprint{ - DeploymentGroups: []config.DeploymentGroup{ - {Name: "isildur"}, - {Name: "elendil"}}} - c.Check(checkOverwriteAllowed(depDir, super, false /*overwriteFlag*/), ErrorMatches, ".* already exists, use -w to overwrite") - c.Check(checkOverwriteAllowed(depDir, super, true /*overwriteFlag*/), IsNil) - - sub := config.Blueprint{ - DeploymentGroups: []config.DeploymentGroup{ - {Name: "aragorn"}}} - c.Check(checkOverwriteAllowed(depDir, sub, false /*overwriteFlag*/), ErrorMatches, `.* already exists, use -w to overwrite`) - c.Check(checkOverwriteAllowed(depDir, sub, true /*overwriteFlag*/), ErrorMatches, `.*remove a deployment group "isildur".*`) + { // Superset + bp := config.Blueprint{ + GhpcVersion: "TaleOfBygoneYears", + DeploymentGroups: []config.DeploymentGroup{ + {Name: "isildur"}, + {Name: "elendil"}}} + c.Check(checkOverwriteAllowed(p, bp, noW, noForce), ErrorMatches, ".* already exists, use -w to overwrite") + c.Check(checkOverwriteAllowed(p, bp, yesW, noForce), IsNil) + } + + { // Version mismatch + bp := config.Blueprint{ + GhpcVersion: "TheAlloyOfLaw", + DeploymentGroups: []config.DeploymentGroup{ + {Name: "isildur"}}} + c.Check(checkOverwriteAllowed(p, bp, noW, noForce), ErrorMatches, ".*ghpc_version has changed.*") + c.Check(checkOverwriteAllowed(p, bp, yesW, noForce), ErrorMatches, ".*ghpc_version has changed.*") + c.Check(checkOverwriteAllowed(p, bp, noW, yesForce), IsNil) + } + + { // Subset + bp := config.Blueprint{ + GhpcVersion: "TaleOfBygoneYears", + DeploymentGroups: []config.DeploymentGroup{ + {Name: "aragorn"}}} + c.Check(checkOverwriteAllowed(p, bp, noW, noForce), ErrorMatches, `.* already exists, use -w to overwrite`) + c.Check(checkOverwriteAllowed(p, bp, yesW, noForce), ErrorMatches, `.*remove a deployment group "isildur".*`) + c.Check(checkOverwriteAllowed(p, bp, noW, yesForce), IsNil) + } } diff --git a/cmd/render.go b/cmd/render.go new file mode 100644 index 0000000000..769f975dd5 --- /dev/null +++ b/cmd/render.go @@ -0,0 +1,91 @@ +// Copyright 2024 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "fmt" + "hpc-toolkit/pkg/config" + "hpc-toolkit/pkg/validators" + "strings" +) + +func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) { + pos, ok := ctx.Pos(path) + for !ok && path.Parent() != nil { + path = path.Parent() + pos, ok = ctx.Pos(path) + } + return pos, ok +} + +func renderError(err error, ctx config.YamlCtx) string { + switch te := err.(type) { + case config.Errors: + return renderMultiError(te, ctx) + case validators.ValidatorError: + return renderValidatorError(te, ctx) + case config.HintError: + return renderHintError(te, ctx) + case config.BpError: + return renderBpError(te, ctx) + case config.PosError: + return renderPosError(te, ctx) + default: + return fmt.Sprintf("%s: %s", boldRed("Error"), err) + } +} + +func renderMultiError(errs config.Errors, ctx config.YamlCtx) string { + var sb strings.Builder + for _, e := range errs.Errors { + sb.WriteString(renderError(e, ctx)) + sb.WriteString("\n") + } + return sb.String() +} + +func renderValidatorError(err validators.ValidatorError, ctx config.YamlCtx) string { + title := boldRed(fmt.Sprintf("validator %q failed:", err.Validator)) + return fmt.Sprintf("%s\n%v\n", title, renderError(err.Err, ctx)) +} + +func renderHintError(err config.HintError, ctx config.YamlCtx) string { + return fmt.Sprintf("%s\n%s: %s", renderError(err.Err, ctx), boldYellow("Hint"), err.Hint) +} + +func renderBpError(err config.BpError, ctx config.YamlCtx) string { + if pos, ok := findPos(err.Path, ctx); ok { + posErr := config.PosError{Pos: pos, Err: err.Err} + return renderPosError(posErr, ctx) + } + return renderError(err.Err, ctx) +} + +func renderPosError(err config.PosError, ctx config.YamlCtx) string { + pos := err.Pos + line := pos.Line - 1 + if line < 0 || line >= len(ctx.Lines) { + return renderError(err, ctx) + } + + pref := fmt.Sprintf("%d: ", pos.Line) + arrow := " " + if pos.Column > 0 { + spaces := strings.Repeat(" ", len(pref)+pos.Column-1) + arrow = spaces + "^" + } + + return fmt.Sprintf("%s\n%s%s\n%s", renderError(err.Err, ctx), pref, ctx.Lines[line], arrow) +} diff --git a/cmd/render_test.go b/cmd/render_test.go new file mode 100644 index 0000000000..a1ad7bded4 --- /dev/null +++ b/cmd/render_test.go @@ -0,0 +1,80 @@ +// Copyright 2024 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "errors" + "hpc-toolkit/pkg/config" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func makeCtx(yml string, t *testing.T) config.YamlCtx { + ctx, err := config.NewYamlCtx([]byte(yml)) + if err != nil { + t.Fatal(err, yml) + } + return ctx +} + +func TestRenderError(t *testing.T) { + type test struct { + err error + ctx config.YamlCtx + want string + } + tests := []test{ + {errors.New("arbuz"), makeCtx("", t), "Error: arbuz"}, + { // has pos, but context doesn't contain it + err: config.BpError{Path: config.Root.Vars.Dot("kale"), Err: errors.New("arbuz")}, + ctx: makeCtx("", t), + want: "Error: arbuz"}, + { // has pos, has context + err: config.BpError{Path: config.Root.Vars.Dot("kale"), Err: errors.New("arbuz")}, + ctx: makeCtx(` +vars: + kale: dos`, t), + want: `Error: arbuz +3: kale: dos + ^`}, + { + err: config.HintError{Hint: "did you mean 'kale'?", Err: errors.New("arbuz")}, + ctx: makeCtx("", t), + want: `Error: arbuz +Hint: did you mean 'kale'?`}, + { // has pos, has context + err: config.BpError{ + Path: config.Root.Vars.Dot("kale"), + Err: config.HintError{ + Hint: "did you mean 'kale'?", + Err: errors.New("arbuz")}}, + ctx: makeCtx(` +vars: + kale: dos`, t), + want: `Error: arbuz +Hint: did you mean 'kale'? +3: kale: dos + ^`}, + } + for _, tc := range tests { + t.Run(tc.want, func(t *testing.T) { + got := renderError(tc.err, tc.ctx) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} diff --git a/cmd/root.go b/cmd/root.go index 9583af298f..ba077f2750 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.27.0", + Version: "v1.28.1", Annotations: annotation, } ) @@ -250,7 +250,6 @@ func execPath() string { // NOTE: this function uses empty YamlCtx, so if you have one, use renderError directly. func checkErr(err error) { if err != nil { - msg := fmt.Sprintf("%s: %s", boldRed("Error"), renderError(err, config.YamlCtx{})) - logging.Fatal(msg) + logging.Fatal(renderError(err, config.YamlCtx{})) } } diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 4f68b4de41..3a52e74d42 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -171,7 +171,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public - id: low_cost_node_group diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml new file mode 100644 index 0000000000..b8a1cf8888 --- /dev/null +++ b/community/examples/hpc-build-slurm-image.yaml @@ -0,0 +1,119 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +blueprint_name: hpc-build-slurm-image + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: build-slurm-1 + region: us-central1 + zone: us-central1-a + + image_build_machine_type: n2d-standard-16 + build_from_image_family: hpc-rocky-linux-8 + build_from_image_project: cloud-hpc-image-public + built_image_family: my-custom-slurm + built_instance_image: + family: $(vars.built_image_family) + project: $(vars.project_id) + instance_image_custom: true + +deployment_groups: +- group: setup + modules: + - id: network + source: modules/network/vpc + + - id: slurm-build-script + source: modules/scripts/startup-script + settings: + # Do not create Ansible virtual env; Install system wide Ansible below. + install_ansible: false + runners: + - type: shell + destination: prep-for-slurm-build.sh + content: | + #!/bin/bash + set -e -o pipefail + # Slurm build on Rocky8 will upgrade to python38 as part of build + # This is not compatible with ansible-local runner + dnf install -y python38 + alternatives --set python3 /usr/bin/python3.8 + python3 -m pip install pip --upgrade + python3 -m pip install ansible==6.7.0 + python3 -m pip install selinux + export PATH=/usr/local/bin:$PATH + ansible --version + ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents + - type: data + destination: /var/tmp/slurm_vars.json + content: | + { + "reboot": false, + "slurm_version": "23.02.5", + "install_cuda": false, + "nvidia_version": "latest", + "install_ompi": true, + "install_lustre": false, + "install_gcsfuse": true + } + - type: shell + destination: install_slurm.sh + content: | + #!/bin/bash + set -e -o pipefail + ansible-pull \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C master \ + -i localhost, --limit localhost --connection=local \ + -e @/var/tmp/slurm_vars.json \ + ansible/playbook.yml + +- group: build-slurm + modules: + - id: slurm-custom-image + source: modules/packer/custom-image + kind: packer + settings: + machine_type: $(vars.image_build_machine_type) + source_image_family: $(vars.build_from_image_family) + source_image_project_id: [$(vars.build_from_image_project)] + image_family: $(vars.built_image_family) + use: + - network + - slurm-build-script + +- group: demo-cluster + modules: + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + machine_type: n2d-standard-2 + instance_image: $(vars.built_instance_image) + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset] + settings: + partition_name: debug + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - debug_partition + settings: + machine_type: n2d-standard-4 + instance_image: $(vars.built_instance_image) diff --git a/community/examples/hpc-slurm-chromedesktop.yaml b/community/examples/hpc-slurm-chromedesktop.yaml index 8e6b816cb2..c6a31d9337 100644 --- a/community/examples/hpc-slurm-chromedesktop.yaml +++ b/community/examples/hpc-slurm-chromedesktop.yaml @@ -22,10 +22,10 @@ vars: region: us-central1 zone: us-central1-c instance_image_crd: - family: slurm-gcp-5-9-debian-11 + family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/community/examples/hpc-slurm-gromacs.yaml 
b/community/examples/hpc-slurm-gromacs.yaml index 9a45afc97d..c4f8780250 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -29,7 +29,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc ## Filesystems - id: appsfs @@ -49,6 +49,7 @@ deployment_groups: source: community/modules/scripts/spack-setup settings: install_dir: /sw/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -68,11 +69,11 @@ deployment_groups: projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' commands: | - # Un-comment and update mirror_url to install from spack cache - # if ! spack mirror list | grep -q gcs_cache; then - # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket - # fi - # spack buildcache keys --install --trust + ## Un-comment and update mirror_url to install from spack cache + ## if ! spack mirror list | grep -q gcs_cache; then + ## spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + ## fi + ## spack buildcache keys --install --trust spack config --scope defaults add config:build_stage:/sw/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml @@ -82,38 +83,52 @@ deployment_groups: spack compiler find --scope site spack install intel-mpi@2018.4.274%gcc@10.3.0 - spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 + spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@8.5.0 - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - appsfs + - id: script + source: modules/scripts/startup-script settings: - partition_name: compute - max_node_count: 20 + runners: + # remove lustre client temporary to avoid startup failure due to known + # issue. 
+ - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo + - $(spack-execute.spack_runner) + + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - appsfs - - compute_partition + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset, homefs, appsfs] settings: - login_node_count: 1 + partition_name: compute + is_default: true - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 + - compute_partition + - slurm_login - homefs - appsfs - - slurm_controller - - spack-execute settings: - login_machine_type: c2-standard-4 - login_scopes: - - https://www.googleapis.com/auth/cloud-platform + disable_controller_public_ips: false + login_startup_script: $(script.startup_script) + login_startup_scripts_timeout: 21600 diff --git a/community/examples/hpc-slurm-legacy.yaml b/community/examples/hpc-slurm-legacy.yaml deleted file mode 100644 index d8eafeb4f5..0000000000 --- a/community/examples/hpc-slurm-legacy.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-legacy - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-high-io - region: us-west4 - zone: us-west4-c - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: projectsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /projects - - # This file system has an associated license cost. 
- # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - - id: low_cost_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - settings: - partition_name: low_cost - max_node_count: 10 - enable_placement: false - exclusive: false - machine_type: n2-standard-4 - bandwidth_tier: gvnic_enabled - - # This compute_partition is far more performant than low_cost_partition. - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - settings: - max_node_count: 200 - partition_name: compute - bandwidth_tier: gvnic_enabled - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - scratchfs - - projectsfs - - low_cost_partition # low cost partition will be default as it is listed first - - compute_partition - settings: - controller_machine_type: c2-standard-8 - suspend_time: 60 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - scratchfs - - projectsfs - - slurm_controller - settings: - login_machine_type: n2-standard-4 - - - id: hpc_dashboard - source: modules/monitoring/dashboard - outputs: [instructions] diff --git a/community/examples/hpc-slurm-legacy-sharedvpc.yaml b/community/examples/hpc-slurm-sharedvpc.yaml similarity index 68% rename from community/examples/hpc-slurm-legacy-sharedvpc.yaml rename to community/examples/hpc-slurm-sharedvpc.yaml index 9db8f115dd..d44d333140 100644 --- a/community/examples/hpc-slurm-legacy-sharedvpc.yaml +++ b/community/examples/hpc-slurm-sharedvpc.yaml @@ -55,46 +55,50 @@ deployment_groups: local_mount: /home connect_mode: PRIVATE_SERVICE_ACCESS - # This debug_partition will work out of the box without requesting additional GCP quota. + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + enable_placement: false # the default is: true + - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset, homefs] settings: partition_name: debug - max_node_count: 4 - enable_placement: false - exclusive: false - machine_type: n2-standard-2 + exclusive: false # allows nodes to stay up after jobs are done + is_default: true - # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. 
- - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] settings: - partition_name: compute - max_node_count: 20 + node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - debug_partition # debug partition will be default as it is listed first - - compute_partition + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset, homefs] settings: - login_node_count: 1 - shared_vpc_host_project: $(vars.host_project_id) + partition_name: compute - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 + - debug_partition + - compute_partition + - slurm_login - homefs - - slurm_controller settings: - shared_vpc_host_project: $(vars.host_project_id) + disable_controller_public_ips: false diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 261376e816..637a167602 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -23,8 +23,8 @@ vars: zone: us-west4-c instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-5-9-ubuntu-2004-lts + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + family: slurm-gcp-5-10-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/hpc-slurm6-tpu.yaml b/community/examples/hpc-slurm6-tpu.yaml index 4a5dd3c3c3..ed3ccd449e 100644 --- a/community/examples/hpc-slurm6-tpu.yaml +++ b/community/examples/hpc-slurm6-tpu.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: tpu_nodeset source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu @@ -35,11 +35,10 @@ deployment_groups: name: v2x8 node_type: v2-8 tf_version: 2.10.0 - disable_public_ips: false - # To specify if TPU nodes are preemptible. The nodes will be shut down if - # it requires additional resources. + # Preemptible TPUs cost much less than non-preemptible TPUs. + # The Cloud TPU service might preempt (shut down) these TPUs at any time. # https://cloud.google.com/tpu/docs/preemptible - preemptible: true + preemptible: false # Specify whether to preserve TPU on suspend. # If set to true, suspended VM will be stopped. # If set to false, suspended VM will be deleted. diff --git a/community/examples/hpc-slurm6.yaml b/community/examples/hpc-slurm6.yaml deleted file mode 100644 index 923691eda4..0000000000 --- a/community/examples/hpc-slurm6.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm6 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-gcp-v6 - region: us-west4 - zone: us-west4-c - instance_image: - family: slurm-gcp-6-1-hpc-rocky-linux-8 - project: schedmd-slurm-public - -deployment_groups: -- group: primary - modules: - - id: network - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network] - settings: - local_mount: /home - - - id: debug_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - name: ns1 - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - enable_placement: false # the default is: true - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [debug_nodeset, homefs] - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - is_default: true - - - id: compute_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - name: ns2 - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [compute_nodeset, homefs] - settings: - partition_name: compute - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login - use: [network] - settings: - name_prefix: login - machine_type: n2-standard-4 - disable_login_public_ips: false - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - use: - - network - - debug_partition - - compute_partition - - slurm_login - - homefs - settings: - disable_controller_public_ips: false diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 554448b115..25abc6c1f0 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -17,7 +17,7 @@ # This blueprint provisions a cluster using the Slurm scheduler configured to # efficiently run many short duration, loosely-coupled (non-MPI) jobs. 
See also: -# https://github.com/SchedMD/slurm-gcp/blob/master/docs/htc.md +# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md # https://slurm.schedmd.com/high_throughput.html blueprint_name: htc-slurm @@ -146,8 +146,8 @@ deployment_groups: settings: machine_type: c2-standard-8 disable_controller_public_ips: $(vars.disable_public_ips) - slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurm.conf.tpl - slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurmdbd.conf.tpl + slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl + slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 7172673363..0679b5fb01 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -4,12 +4,6 @@ - [Intel Solutions for the HPC Toolkit](#intel-solutions-for-the-hpc-toolkit) - - [Intel-Optimized Slurm Cluster](#intel-optimized-slurm-cluster) - - [Initial Setup for the Intel-Optimized Slurm Cluster](#initial-setup-for-the-intel-optimized-slurm-cluster) - - [Deploy the Slurm Cluster](#deploy-the-slurm-cluster) - - [Connect to the login node](#connect-to-the-login-node) - - [Access the cluster and provision an example job](#access-the-cluster-and-provision-an-example-job) - - [Delete the infrastructure when not in use](#delete-the-infrastructure-when-not-in-use) - [DAOS Cluster](#daos-cluster) - [Initial Setup for DAOS Cluster](#initial-setup-for-daos-cluster) - [Deploy the DAOS Cluster](#deploy-the-daos-cluster) @@ -17,7 +11,7 @@ - [Verify the DAOS storage system](#verify-the-daos-storage-system) - [Create a DAOS Pool and Container](#create-a-daos-pool-and-container) - [About the DAOS Command Line Tools](#about-the-daos-command-line-tools) - - [Determine Free Space](#determine-free-space) + - [View Free Space](#view-free-space) - [Create a Pool](#create-a-pool) - [Create a Container](#create-a-container) - [Mount the DAOS Container](#mount-the-daos-container) @@ -33,155 +27,6 @@ - [Unmount the Container](#unmount-the-container) - [Delete the DAOS/Slurm Cluster infrastructure when not in use](#delete-the-daosslurm-cluster-infrastructure-when-not-in-use) -## Intel-Optimized Slurm Cluster - -This document is adapted from a [Cloud Shell tutorial][tutorial] developed to -demonstrate Intel Select Solutions within the Toolkit. It expands upon that -tutorial by building custom images that save provisioning time and improve -reliability when scaling up compute nodes. - -The Google Cloud [HPC VM Image][hpcvmimage] has a built-in feature enabling it -to install a Google Cloud-tested release of Intel compilers and libraries that -are known to achieve optimal performance on Google Cloud. - -[tutorial]: ../../../docs/tutorials/intel-select/intel-select.md -[hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm - -Identify a project to work in and substitute its unique id wherever you see -`<>` in the instructions below. - -### Initial Setup for the Intel-Optimized Slurm Cluster - -Before provisioning any infrastructure in this project you should follow the -Toolkit guidance to enable [APIs][apis] and establish minimum resource -[quotas][quotas]. 
In particular, the following APIs should be enabled - -- [file.googleapis.com](https://cloud.google.com/filestore/docs/reference/rest) (Cloud Filestore) -- [compute.googleapis.com](https://cloud.google.com/compute/docs/reference/rest/v1#service:-compute.googleapis.com) (Google Compute Engine) - -[apis]: ../../../README.md#enable-gcp-apis -[quotas]: ../../../README.md#gcp-quotas - -And the following available quota is required in the region used by the cluster: - -- Filestore: 2560GB -- C2 CPUs: 4 (login node) -- C2 CPUs: up to 6000 (fully-scaled "compute" partition) - - This quota is not necessary at initial deployment, but will be required to - successfully scale the partition to its maximum size - -### Deploy the Slurm Cluster - -Use `ghpc` to provision the blueprint, supplying your project ID - -```text -ghpc create --vars project_id=<> community/examples/intel/hpc-intel-select-slurm.yaml -``` - -This will create a set of directories containing Terraform modules and Packer -templates. **Please ignore the printed instructions** in favor of the following: - -1. Provision the network and startup scripts that install Intel software - - ```shell - terraform -chdir=hpc-intel-select/primary init - terraform -chdir=hpc-intel-select/primary validate - terraform -chdir=hpc-intel-select/primary apply - ``` - -2. Capture the startup scripts to files that will be used by Packer to build the - images - - ```shell - terraform -chdir=hpc-intel-select/primary output \ - -raw startup_script_startup_controller > \ - hpc-intel-select/build1/controller-image/startup_script.sh - - terraform -chdir=hpc-intel-select/primary output \ - -raw startup_script_startup_compute > \ - hpc-intel-select/build2/compute-image/startup_script.sh - ``` - -3. Build the custom Slurm controller image. While this step is executing, you - may begin the next step in parallel. - - ```shell - cd hpc-intel-select/build1/controller-image - packer init . - packer validate . - packer build -var startup_script_file=startup_script.sh . - ``` - -4. Build the custom Slurm image for login and compute nodes - - ```shell - cd - - cd hpc-intel-select/build2/compute-image - packer init . - packer validate . - packer build -var startup_script_file=startup_script.sh . - ``` - -5. Provision the Slurm cluster - - ```shell - cd - - terraform -chdir=hpc-intel-select/cluster init - terraform -chdir=hpc-intel-select/cluster validate - terraform -chdir=hpc-intel-select/cluster apply - ``` - -### Connect to the login node - -Once the startup script has completed and Slurm reports readiness, connect to the login node. - -1. Open the following URL in a new tab. - - https://console.cloud.google.com/compute - - This will take you to **Compute Engine > VM instances** in the Google Cloud Console - - Ensure that you select the project in which you are provisioning the cluster. - -2. Click on the **SSH** button associated with the `slurm-hpc-intel-select-login0` - instance. - - This will open a separate pop up window with a terminal into our newly created - Slurm login VM. - -### Access the cluster and provision an example job - - **The commands below should be run on the login node.** - -1. Create a default ssh key to be able to ssh between nodes - - ```shell - ssh-keygen -q -N '' -f ~/.ssh/id_rsa - cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys - chmod 0600 ~/.ssh/authorized_keys - ``` - -1. Submit an example job - - ```shell - cp /var/tmp/dgemm_job.sh . 
- sbatch dgemm_job.sh - ``` - -### Delete the infrastructure when not in use - -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. - -Open your browser to the VM instances page and ensure that nodes named "compute" -have been shutdown and deleted by the Slurm autoscaler. Delete the remaining -infrastructure in reverse order of creation: - -```shell -terraform -chdir=hpc-intel-select/cluster destroy -terraform -chdir=hpc-intel-select/primary destroy -``` - ## DAOS Cluster The [pfs-daos.yaml](pfs-daos.yaml) blueprint describes an environment with @@ -196,16 +41,22 @@ for general information on building custom images using the Toolkit. Identify a project to work in and substitute its unique id wherever you see `<>` in the instructions below. +[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos +[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md +[DAOS Yum Repository]: https://packages.daos.io + ### Initial Setup for DAOS Cluster Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide]. Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [pfs-daos.yaml](pfs-daos.yaml) blueprint will build the images as part of the deployment. -The Pre-deployment Guide provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project. - -[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos -[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md +The Pre-deployment Guide provides instructions for: +- installing the Google Cloud CLI +- enabling service accounts +- enabling APIs +- establishing minimum resource quotas +- creating a Cloud NAT to allow instances without public IPs to access the [DAOS Yum Repository] repository. ### Deploy the DAOS Cluster @@ -247,7 +98,7 @@ ghpc deploy pfs-daos --auto-approve The `community/examples/intel/pfs-daos.yaml` blueprint does not contain configuration for DAOS pools and containers. Therefore, pools and containers will need to be created manually. -Before pools and containers can be created the storage system must be formatted. Formatting the storage is done automatically by the startup script that runs on the *daos-server-0001* instance. The startup script will run the [dmg storage format](https://docs.daos.io/v2.2/admin/deployment/?h=dmg+storage#storage-formatting) command. It may take a few minutes for all daos server instances to join. +Before pools and containers can be created the storage system must be formatted. Formatting the storage is done automatically by the startup script that runs on the *daos-server-0001* instance. The startup script will run the [dmg storage format](https://docs.daos.io/v2.4/admin/deployment/?h=dmg+storage#storage-formatting) command. It may take a few minutes for all daos server instances to join. Verify that the storage system has been formatted and that the daos-server instances have joined. @@ -272,35 +123,24 @@ Both daos-server instances should show a state of *Joined*. 
#### About the DAOS Command Line Tools -The DAOS Management tool `dmg` is used by System Administrators to manage the DAOS storage [system](https://docs.daos.io/v2.2/overview/architecture/#daos-system) and DAOS [pools](https://docs.daos.io/v2.2/overview/storage/#daos-pool). Therefore, `sudo` must be used when running `dmg`. +The DAOS Management tool `dmg` is used by System Administrators to manage the DAOS storage [system](https://docs.daos.io/v2.4/overview/architecture/#daos-system) and DAOS [pools](https://docs.daos.io/v2.4/overview/storage/#daos-pool). Therefore, `sudo` must be used when running `dmg`. -The DAOS CLI `daos` is used by both users and System Administrators to create and manage [containers](https://docs.daos.io/v2.2/overview/storage/#daos-container). It is not necessary to use `sudo` with the `daos` command. +The DAOS CLI `daos` is used by both users and System Administrators to create and manage [containers](https://docs.daos.io/v2.4/overview/storage/#daos-container). It is not necessary to use `sudo` with the `daos` command. -#### Determine Free Space +#### View Free Space -Determine how much free space is available. +View how much free space is available. ```bash sudo dmg storage query usage ``` -The result will look similar to - -```text -Hosts SCM-Total SCM-Free SCM-Used NVMe-Total NVMe-Free NVMe-Used ------ --------- -------- -------- ---------- --------- --------- -daos-server-0001 215 GB 215 GB 0 % 6.4 TB 6.4 TB 0 % -daos-server-0002 215 GB 215 GB 0 % 6.4 TB 6.4 TB 0 % -``` - -In the example output above we see that there is a total of 12.8TB NVME-Free. - #### Create a Pool -Create a single pool owned by root which uses all available free space. +Create a single pool owned by root which uses 100% of the available free space. ```bash -sudo dmg pool create -z 12.8TB -t 3 -u root --label=pool1 +sudo dmg pool create --size=100% --user=root pool1 ``` Set ACLs to allow any user to create a container in *pool1*. @@ -309,7 +149,7 @@ Set ACLs to allow any user to create a container in *pool1*. sudo dmg pool update-acl -e A::EVERYONE@:rcta pool1 ``` -See the [Pool Operations](https://docs.daos.io/v2.2/admin/pool_operations) section of the of the DAOS Administration Guide for more information about creating pools. +See the [Pool Operations](https://docs.daos.io/v2.4/admin/pool_operations) section of the DAOS Administration Guide for more information about creating pools. #### Create a Container @@ -319,24 +159,18 @@ and how it will be used. The ACLs will need to be set properly to allow users an For the purpose of this demo create the container without specifying ACLs. The container will be owned by your user account and you will have full access to the container. ```bash -daos cont create pool1 \ - --label cont1 \ - --type POSIX \ - --properties rf:0 +daos container create --type=POSIX --properties=rf:0 pool1 cont1 ``` -See the [Container Management](https://docs.daos.io/v2.2/user/container) section of the of the DAOS User Guide for more information about creating containers. +See the [Container Management](https://docs.daos.io/v2.4/user/container) section of the DAOS User Guide for more information about creating containers. 
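Optionally, before moving on to mounting, you can confirm that the pool and container were created. This is a minimal check, assuming the `dmg pool list` and `daos container list` subcommands shipped with the DAOS 2.4 tools referenced above; adjust if your client version differs.

```bash
# Confirm the pool and container exist before mounting with dfuse.
sudo dmg pool list            # pool1 should appear with its size and free space
daos container list pool1     # cont1 should appear in the listing
```
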
#### Mount the DAOS Container Mount the container with dfuse (DAOS Fuse) ```bash -mkdir -p ${HOME}/daos/cont1 -dfuse --singlethread \ - --pool=pool1 \ - --container=cont1 \ - --mountpoint=${HOME}/daos/cont1 +mkdir -p "${HOME}/daos/cont1" +dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" ``` Verify that the container is mounted @@ -356,51 +190,71 @@ time LD_PRELOAD=/usr/lib64/libioil.so \ dd if=/dev/zero of="${HOME}/daos/cont1/test20GiB.img" iflag=fullblock bs=1G count=20 ``` -See the [File System](https://docs.daos.io/v2.2/user/filesystem/) section of the DAOS User Guide for more information about DFuse. +**Known Issue:** -### Unmount the DAOS Container +When you run `ls -lh "${HOME}/daos/cont1"` you may see that the `test20GiB.img` file shows a size of 0 bytes. -The container will need to by unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again. +If you unmount the container and mount it again, the file size will show as 20G. ```bash -fusermount3 -u ${HOME}/daos/cont1 +fusermount3 -u "${HOME}/daos/cont1" +dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" +ls -lh "${HOME}/daos/cont1" +``` + +A work-around for this issue to disable caching when mounting the container. + +```bash +dfuse --singlethread --disable-caching --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" ``` +See the [File System](https://docs.daos.io/v2.4/user/filesystem/) section of the DAOS User Guide for more information about DFuse. + +### Unmount the DAOS Container + +The container will need to be unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again. + Verify that the container is unmounted ```bash df -h -t fuse.daos ``` -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.2/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. +Logout of the DAOS client instance. + +```bash +logout +``` + +See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. ### Delete the DAOS infrastructure when not in use -> **_NOTE:_** All the DAOS data will be permanently lost after cluster deletion. +> **_NOTE:_** Data stored in the DAOS container will be permanently lost after cluster deletion. Delete the remaining infrastructure -```shell +```bash ghpc destroy pfs-daos --auto-approve ``` ## DAOS Server with Slurm cluster -The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint describes an environment with a Slurm cluster and four DAOS server instances. The compute nodes are configured as DAOS clients and have the ability to use the DAOS filesystem on the DAOS server instances. +The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint can be used to deploy a Slurm cluster and four DAOS server instances. The Slurm compute instances are configured as DAOS clients. 
The blueprint uses modules from - [google-cloud-daos][google-cloud-daos] -- [community/modules/scheduler/SchedMD-slurm-on-gcp-controller][SchedMD-slurm-on-gcp-controller] -- [community/modules/scheduler/SchedMD-slurm-on-gcp-login-node][SchedMD-slurm-on-gcp-login-node] -- [community/modules/compute/SchedMD-slurm-on-gcp-partition][SchedMD-slurm-on-gcp-partition] +- [community/modules/compute/schedmd-slurm-gcp-v6-nodeset][schedmd-slurm-gcp-v6-nodeset] +- [community/modules/compute/schedmd-slurm-gcp-v6-partition][schedmd-slurm-gcp-v6-partition] +- [community/modules/scheduler/schedmd-slurm-gcp-v6-login][schedmd-slurm-gcp-v6-login] +- [community/modules/scheduler/schedmd-slurm-gcp-v6-controller][schedmd-slurm-gcp-v6-controller] The blueprint also uses a Packer template from the [Google Cloud DAOS][google-cloud-daos] repository. Please review the [introduction to image building](../../../docs/image-building.md) for general information on building custom images using the Toolkit. -Identify a project to work in and substitute its unique id wherever you see -`<>` in the instructions below. +Substitute your project ID wherever you see `<>` in the instructions below. ### Initial Setup for the DAOS/Slurm cluster @@ -408,16 +262,16 @@ Before provisioning the DAOS cluster you must follow the steps listed in the [Go Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint will build the DAOS server image as part of the deployment. -The Pre-deployment Guide provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project for DAOS server deployment. +The [Pre-deployment Guide][pre-deployment_guide] provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project for DAOS server deployment. [google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos [pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md - [packer-template]: https://github.com/daos-stack/google-cloud-daos/blob/main/images/daos.pkr.hcl [apis]: ../../../README.md#enable-gcp-apis -[SchedMD-slurm-on-gcp-controller]: ../../modules/scheduler/SchedMD-slurm-on-gcp-controller -[SchedMD-slurm-on-gcp-login-node]: ../../modules/scheduler/SchedMD-slurm-on-gcp-login-node -[SchedMD-slurm-on-gcp-partition]: ../../modules/compute/SchedMD-slurm-on-gcp-partition +[schedmd-slurm-gcp-v6-nodeset]: ../../modules/compute/schedmd-slurm-gcp-v6-nodeset +[schedmd-slurm-gcp-v6-partition]: ../../modules/compute/schedmd-slurm-gcp-v6-partition +[schedmd-slurm-gcp-v6-controller]: ../../modules/scheduler/schedmd-slurm-gcp-v6-controller +[schedmd-slurm-gcp-v6-login]: ../../modules/scheduler/schedmd-slurm-gcp-v6-login Follow the Toolkit guidance to enable [APIs][apis] and establish minimum resource [quotas][quotas] for Slurm. @@ -450,7 +304,7 @@ The `--backend-config` option is not required but recommended. It will save the Follow `ghpc` instructions to deploy the environment ```text -ghpc deploy daos-slurm --auto-approve +ghpc deploy hpc-slurm-daos --auto-approve ``` [backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state @@ -468,7 +322,7 @@ Once the startup script has completed and Slurm reports readiness, connect to th Select the project in which the cluster will be provisionsd. -2. 
Click on the `SSH` button associated with the `slurm-daos-slurm-login0` +2. Click on the `SSH` button associated with the `hpcslurmda-login-login-001` instance. This will open a separate pop up window with a terminal into our newly created @@ -483,10 +337,12 @@ You will need to create your own DAOS container in the pool that can be used by While logged into the login node create a container named `cont1` in the `pool1` pool: ```bash -daos cont create --type=POSIX --properties=rf:0 --label=cont1 pool1 +daos cont create --type=POSIX --properties=rf:0 pool1 cont1 ``` -Since the `cont1` container is owned by your account, your Slurm jobs will need to run as your user account in order to access the container. +NOTE: If you encounter an error `daos: command not found`, it's likely that the startup scripts have not finished running yet. Wait a few minutes and try again. + +Since the `cont1` container is owned by your account, your Slurm jobs will need to run as your user account to access the container. Create a mount point for the container and mount it with dfuse (DAOS Fuse) @@ -538,6 +394,7 @@ echo "Job ${SLURM_JOB_ID} running on ${JOB_HOSTNAME}" | tee "${MOUNT_DIR}/${TIME echo "${JOB_HOSTNAME} : Unmounting dfuse" fusermount3 -u "${MOUNT_DIR}" + ``` Run the `daos_job.sh` script in an interactive Slurm job on 4 nodes @@ -575,21 +432,20 @@ Verify that the container is unmounted df -h -t fuse.daos ``` -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.2/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. +See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. ### Delete the DAOS/Slurm Cluster infrastructure when not in use -> **_NOTE:_** All the DAOS data will be permanently lost after cluster deletion. - - +> **_NOTE:_** +> +> - Data on the DAOS file system will be permanently lost after cluster deletion. +> - If the Slurm controller is shut down before the auto-scale instances are destroyed, those compute instances will be left running. -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. +Open your browser to the VM instances page and ensure that instances named "compute" +have been shutdown and deleted by the Slurm autoscaler. -Open your browser to the VM instances page and ensure that nodes named "compute" -have been shutdown and deleted by the Slurm autoscaler. Delete the remaining -infrastructure with `terraform`: +Delete the remaining infrastructure: -```shell -ghpc destroy daos-slurm --auto-approve +```bash +ghpc destroy hpc-slurm-daos --auto-approve ``` diff --git a/community/examples/intel/hpc-intel-select-slurm.yaml b/community/examples/intel/hpc-intel-select-slurm.yaml deleted file mode 100644 index bb197ac533..0000000000 --- a/community/examples/intel/hpc-intel-select-slurm.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-intel-select-slurm - -vars: - deployment_name: hpc-intel-select - region: us-central1 - zone: us-central1-c - controller_image: - family: slurm-intel-hpc-controller - project: $(vars.project_id) - compute_image: - family: slurm-intel-hpc-compute - project: $(vars.project_id) - network_name: intel-select-net - subnetwork_name: intel-select-primary-subnet - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: startup_controller - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: /var/tmp/install_intel_controller.sh - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --prefix /apps --intel_compliance - outputs: - - startup_script - - - id: startup_compute - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: /var/tmp/install_intel_compute.sh - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --intel_comp_meta - - type: data - destination: /var/tmp/dgemm_job.sh - content: | - #!/bin/bash - #SBATCH --nodes=4 - #SBATCH --ntasks-per-node=30 - #SBATCH --time=01:00:00 - #SBATCH --job-name=clckjob - #SBATCH --output=job_%j.log - #SBATCH --partition=compute - . /apps/clck/2019.10/env/vars.sh - export CLCK_SHARED_TEMP_DIR=$HOME - cd $SLURM_SUBMIT_DIR - # select_solutions_sim_mod_user_base_2018.0 | select_solutions_sim_mod_user_plus_2018.0 - FWD=select_solutions_sim_mod_user_base_2018.0 - clck -D ${FWD}.db -F ${FWD} -l debug - outputs: - - startup_script - -- group: build1 - modules: - - id: controller-image - source: modules/packer/custom-image - kind: packer - settings: - disk_size: 20 - source_image_project_id: [schedmd-slurm-public] - source_image_family: schedmd-slurm-21-08-8-hpc-centos-7 - image_family: $(vars.controller_image.family) - -- group: build2 - modules: - - id: compute-image - source: modules/packer/custom-image - kind: packer - settings: - disk_size: 20 - source_image_project_id: [schedmd-slurm-public] - source_image_family: schedmd-slurm-21-08-8-hpc-centos-7 - image_family: $(vars.compute_image.family) - -- group: cluster - modules: - - id: cluster-network - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: - - cluster-network - settings: - local_mount: /home - - # This debug_partition will work out of the box without requesting additional GCP quota. 
- - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - cluster-network - - homefs - settings: - partition_name: debug - max_node_count: 4 - enable_placement: false - exclusive: false - machine_type: n2-standard-4 - instance_image: $(vars.compute_image) - - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - cluster-network - - homefs - settings: - partition_name: compute - instance_image: $(vars.compute_image) - max_node_count: 100 - machine_type: c2-standard-60 - bandwidth_tier: gvnic_enabled - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - cluster-network - - compute_partition - - homefs - settings: - login_node_count: 1 - instance_image: $(vars.controller_image) - controller_machine_type: c2-standard-4 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - cluster-network - - slurm_controller - - homefs - settings: - instance_image: $(vars.compute_image) - login_machine_type: c2-standard-4 diff --git a/community/examples/intel/hpc-slurm-daos.yaml b/community/examples/intel/hpc-slurm-daos.yaml index cd79bdc203..acc99c9050 100644 --- a/community/examples/intel/hpc-slurm-daos.yaml +++ b/community/examples/intel/hpc-slurm-daos.yaml @@ -1,4 +1,4 @@ -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,38 +18,42 @@ blueprint_name: hpc-slurm-daos vars: project_id: ## Set GCP Project ID Here ## - deployment_name: daos-slurm + deployment_name: hpc-slurm-daos region: us-central1 zone: us-central1-c - server_image_family: daos-server-hpc-rocky-8 - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + daos_server_image_family: daos-server-hpc-rocky-8 + daos_version: "2.4" + tags: [] # Note: this blueprint assumes the existence of a default global network and # subnetwork in the region chosen above +validators: +- validator: test_module_not_used + inputs: {} + skip: true + deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: homefs source: modules/file-system/filestore use: [network1] settings: - local_mount: "/home" + local_mount: /home - group: daos-server-image modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/images + # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - id: daos-server-image - source: github.com/daos-stack/google-cloud-daos//images?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" kind: packer settings: - daos_version: 2.2.0 - daos_repo_base_url: https://packages.daos.io + daos_version: $(vars.daos_version) + daos_repo_base_url: https://packages.daos.io/ daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo use_iap: true enable_oslogin: false @@ -63,26 +67,25 @@ deployment_groups: use_internal_ip: true omit_external_ip: false daos_install_type: server - image_family: $(vars.server_image_family) + image_family: $(vars.daos_server_image_family) - group: cluster modules: # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - id: daos - source: 
github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" use: [network1] settings: labels: {ghpc_role: file-system} - # The default DAOS settings are optimized for TCO - # The following will tune this system for best perf machine_type: "n2-standard-16" - os_family: $(vars.server_image_family) + os_family: $(vars.daos_server_image_family) daos_disk_count: 4 - daos_scm_size: 45 + tags: $(vars.tags) pools: - name: "pool1" - size: "6.4TB" - tier_ratio: 3 + size: "100%" + # Do not set value for scm_size when size=100% + daos_scm_size: user: "root@" group: "root@" acls: @@ -98,67 +101,102 @@ deployment_groups: settings: runners: - type: shell - content: $(daos.daos_client_install_script) - destination: /tmp/daos_client_install.sh + destination: remove_lustre_client_repo.sh + content: | + #!/bin/bash + rm -f /etc/yum.repos.d/lustre-client.repo + dnf clean all --verbose + rm -rf /var/cache/dnf/* + dnf makecache - type: data content: $(daos.daos_agent_yml) destination: /etc/daos/daos_agent.yml - type: data content: $(daos.daos_control_yml) destination: /etc/daos/daos_control.yml + - type: shell + content: $(daos.daos_client_install_script) + destination: /tmp/daos_client_install.sh - type: shell content: $(daos.daos_client_config_script) - destination: /var/daos/daos_client_config.sh + destination: /tmp/daos_client_config.sh + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + name: ns1 + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + enable_placement: false # the default is: true + service_account: + email: null + scopes: + - "https://www.googleapis.com/auth/monitoring.write" + - "https://www.googleapis.com/auth/logging.write" + - "https://www.googleapis.com/auth/devstorage.read_only" + - "https://www.googleapis.com/auth/cloud-platform" - ## This debug_partition will work out of the box without requesting additional GCP quota. - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset, homefs] settings: partition_name: debug - max_node_count: 4 - enable_placement: false - machine_type: n2-standard-2 + exclusive: false # allows nodes to stay up after jobs are done + is_default: true - # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. 
- - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] settings: - partition_name: compute - max_node_count: 20 + name: ns2 + node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + service_account: + email: null + scopes: + - "https://www.googleapis.com/auth/monitoring.write" + - "https://www.googleapis.com/auth/logging.write" + - "https://www.googleapis.com/auth/devstorage.read_only" + - "https://www.googleapis.com/auth/cloud-platform" - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - debug_partition # debug partition will be default as it is listed first - - compute_partition - - daos-client-script + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset, homefs] settings: - login_node_count: 1 - compute_node_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" + partition_name: compute - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + tags: $(vars.tags) + service_account: + email: null + scopes: + - "https://www.googleapis.com/auth/monitoring.write" + - "https://www.googleapis.com/auth/logging.write" + - "https://www.googleapis.com/auth/devstorage.read_only" + - "https://www.googleapis.com/auth/cloud-platform" + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 + - debug_partition + - compute_partition + - slurm_login - homefs - - slurm_controller - daos-client-script settings: - login_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" + disable_controller_public_ips: false + compute_startup_script: $(daos-client-script.startup_script) + controller_startup_script: $(daos-client-script.startup_script) + login_startup_script: $(daos-client-script.startup_script) + compute_startup_scripts_timeout: 1000 + controller_startup_scripts_timeout: 1000 + login_startup_scripts_timeout: 1000 + tags: $(vars.tags) diff --git a/community/examples/intel/pfs-daos.yaml b/community/examples/intel/pfs-daos.yaml index 648aba9403..3abf5c9778 100644 --- a/community/examples/intel/pfs-daos.yaml +++ b/community/examples/intel/pfs-daos.yaml @@ -1,4 +1,4 @@ -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,11 +21,10 @@ vars: deployment_name: pfs-daos region: us-central1 zone: us-central1-c - server_image_family: daos-server-hpc-rocky-8 - client_image_family: daos-client-hpc-rocky-8 - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + daos_server_image_family: daos-server-hpc-rocky-8 + daos_client_image_family: daos-client-hpc-rocky-8 + daos_version: "2.4" + tags: [] # Note: this blueprint assumes the existence of a default global network and # subnetwork in the region chosen above @@ -38,12 +37,12 @@ deployment_groups: - group: daos-server-image modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/images + # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - id: daos-server-image - source: github.com/daos-stack/google-cloud-daos//images?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" kind: packer settings: - daos_version: 2.2.0 + daos_version: $(vars.daos_version) daos_repo_base_url: https://packages.daos.io daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo use_iap: true @@ -58,16 +57,16 @@ deployment_groups: use_internal_ip: true omit_external_ip: false daos_install_type: server - image_family: $(vars.server_image_family) + image_family: $(vars.daos_server_image_family) - group: daos-client-image modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/images + # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.5.0/images - id: daos-client-image - source: github.com/daos-stack/google-cloud-daos//images?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" kind: packer settings: - daos_version: 2.2.0 + daos_version: $(vars.daos_version) daos_repo_base_url: https://packages.daos.io daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo use_iap: true @@ -82,24 +81,29 @@ deployment_groups: use_internal_ip: true omit_external_ip: false daos_install_type: client - image_family: $(vars.client_image_family) + image_family: $(vars.daos_client_image_family) - group: daos-cluster modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/terraform/modules/daos_server + # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_server - id: daos-server - source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.4.1&depth=1 + # source: $(vars.daos_server_module_source_url) + source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" use: [network1] settings: number_of_instances: 2 labels: {ghpc_role: file-system} - os_family: $(vars.server_image_family) + os_family: $(vars.daos_server_image_family) + daos_scm_size: "172" + tags: $(vars.tags) - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/terraform/modules/daos_client + # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_client - id: daos-client - source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_client?ref=v0.4.1&depth=1 + # source: $(vars.daos_client_module_source_url) + source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_client?ref=v0.5.0&depth=1" use: [network1, daos-server] settings: number_of_instances: 2 labels: {ghpc_role: compute} - os_family: $(vars.client_image_family) + 
os_family: $(vars.daos_client_image_family) + tags: $(vars.tags) diff --git a/community/examples/omnia-cluster.yaml b/community/examples/omnia-cluster.yaml index 3437955c58..171871f60b 100644 --- a/community/examples/omnia-cluster.yaml +++ b/community/examples/omnia-cluster.yaml @@ -14,6 +14,8 @@ --- +# WARNING: this example has been deprecated as of v1.28.0 of the HPC Toolkit + blueprint_name: omnia-cluster vars: diff --git a/community/examples/quantum-circuit-simulator.yaml b/community/examples/quantum-circuit-simulator.yaml deleted file mode 100644 index 2876ae5b16..0000000000 --- a/community/examples/quantum-circuit-simulator.yaml +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -blueprint_name: quantum-circuit - -# Please review https://cloud.google.com/compute/docs/regions-zones -# for availability of A2 machine types -vars: - project_id: ## Set project id here - deployment_name: qsim-demo - region: us-central1 - zone: us-central1-f - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: quantum-simulator-setup - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: install-qsim.sh - content: | - #!/bin/bash - # This script implements https://quantumai.google/qsim/tutorials/gcp_gpu - # Disable any user interactive prompt during upgrade script. - export DEBIAN_FRONTEND=noninteractive - set -e -o pipefail - curl -O https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py - python3 install_gpu_driver.py - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh - bash Miniconda3-py39_4.12.0-Linux-x86_64.sh -b -p /opt/conda - source /opt/conda/bin/activate base - conda init --system - conda config --system --set auto_activate_base False - # following channel ordering is important! use strict_priority! 
- # cuquantum comes from cuquantum label in nvidia channel - # libcutensor comes from main (default) label in nvidia channel - # cuda and all toolkit comes from cuda-11.5.2 label in nvidia channel - # everything else comes from conda-forge - conda config --system --set channel_priority strict - conda config --system --remove channels defaults - conda config --system --add channels conda-forge - conda config --system --add channels nvidia - conda config --system --add channels nvidia/label/cuda-11.5.2 - conda config --system --add channels nvidia/label/cuquantum-22.07.1 - conda update -n base conda --yes - conda create -n qsim python=3.9 --yes - conda install -n qsim cuda cuquantum make cmake cxx-compiler=1.5.1 --yes - echo "cuda ==11.5.*" > /opt/conda/envs/qsim/conda-meta/pinned - conda clean -p -t --yes - conda activate qsim - pip install pybind11 cirq - git clone -b v0.18.0 https://github.com/quantumlib/qsim.git /opt/qsim - cd /opt/qsim - export CUQUANTUM_ROOT=/opt/conda/envs/qsim - make - pip install . - - type: data - destination: /var/tmp/qsim-example.py - content: | - import sys - import time - import cirq, cirq_google - import qsimcirq - - def sim(width: int, height: int, reps: int, use_gpu: bool, gpu_mode: int): - rqc_fn = cirq.experiments.random_rotations_between_grid_interaction_layers_circuit - qvm_fn = cirq_google.engine.create_default_noisy_quantum_virtual_machine - - qubits = cirq.GridQubit.rect(width, height, 3, 2) - circuit = rqc_fn(qubits, depth=10, seed=0) + cirq.measure(*qubits, key="final_state") - - processor_id = "weber" - qsim_options = qsimcirq.QSimOptions(use_gpu=use_gpu, gpu_mode=gpu_mode) - # we do not recommend using seed=0 in production usage; in this - # example it helps compare performance between runs - qvm = qvm_fn(processor_id, qsimcirq.QSimSimulator, seed=0, qsim_options=qsim_options) - - start = time.time() - results = qvm.get_sampler(processor_id).run(circuit, repetitions=reps) - print(results) - print(f"elapsed: {time.time() - start:.03f}s") - - - if __name__ == "__main__": - width, height, reps = 5, 5, 10 - - print("This series of simulations should last approximately 1 minute on an A2 series VM\n") - print("Running on CPU:") - sys.stdout.flush() - sim(width=width, height=height, reps=reps, use_gpu=False, gpu_mode=0) - print("\nRunning on GPU (CUDA):") - sys.stdout.flush() - sim(width=width, height=height, reps=reps, use_gpu=True, gpu_mode=0) - print("\nRunning on GPU (CUDA + cuQuantum):") - sys.stdout.flush() - sim(width=width, height=height, reps=reps, use_gpu=True, gpu_mode=1) - - type: shell - destination: run-qsim.sh - content: | - #!/bin/bash -i - # The -i above (for interactive) is required so that conda command will be accessible. 
- # this script demonstrates how to run the qsim example application and - # also "warms up" the GPU to give reliable performance metrics - conda activate qsim - python /var/tmp/qsim-example.py - - - id: qsimvm - source: modules/compute/vm-instance - use: - - network1 - - quantum-simulator-setup - settings: - machine_type: n1-standard-32 - guest_accelerator: - - type: nvidia-tesla-t4 - count: 1 - instance_image: - project: ubuntu-os-cloud - family: ubuntu-2004-lts - - - id: wait - source: community/modules/scripts/wait-for-startup - settings: - instance_name: $(qsimvm.name[0]) - timeout: 2400 diff --git a/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md b/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md index 7d4032f1ab..c8561bc909 100644 --- a/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md +++ b/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md @@ -19,7 +19,7 @@ up a node. kind: terraform id: wait settings: - instance_name: ((module.workstation.name[0])) + instance_name: $(workstation.name[0]) ``` ## License diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 9624308827..c367dfb185 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -12,14 +12,14 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cryptography==41.0.6 +cryptography==42.0.0 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.7 +Django==4.2.10 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.14.0 @@ -46,7 +46,7 @@ identify==2.5.24 idna==3.4 importlib-resources==6.1.1 isort==5.12.0 -Jinja2==3.1.2 +Jinja2==3.1.3 jsonschema==4.20.0 jsonschema-specifications==2023.11.1 lazy-object-proxy==1.9.0 diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md b/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md deleted file mode 100644 index e27e937920..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md +++ /dev/null @@ -1,98 +0,0 @@ -## Description - -> **Warning**: this module is now deprecated. We recommend using the Slurm on GCP V5 -> [schedmd-slurm-gcp-v5-partition](../schedmd-slurm-gcp-v5-partition/README.md) and -> [schedmd-slurm-gcp-v5-node-group](../schedmd-slurm-gcp-v5-node-group/README.md) instead. - -This module creates a compute partition that be can used as input to -[SchedMD-slurm-on-gcp-controller](../../scheduler/SchedMD-slurm-on-gcp-controller/README.md). - -> **Warning**: updating a partition will not cause the slurm controller to -> update its configurations. In other words, it will not update an already -> deployed Slurm cluster. 
- -### Example - -The following code snippet creates a partition module with: - -* a max node count of 200 -* VM machine type of `c2-standard-30` -* partition name of "compute" -* connected to the `network1` module via `use` -* Mounted to homefs via `use` - -```yaml -- id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: [network1, homefs] - settings: - max_node_count: 200 - partition_name: compute - machine_type: c2-standard-30 -``` - -## GPU Support - -More information on GPU support in Slurm on GCP and other HPC Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform - - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. - -## Modules - -No modules. - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | -| [compute\_disk\_size\_gb](#input\_compute\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes | `number` | `20` | no | -| [compute\_disk\_type](#input\_compute\_disk\_type) | Type of boot disk to create for the partition compute nodes | `string` | `"pd-standard"` | no | -| [cpu\_platform](#input\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [enable\_placement](#input\_enable\_placement) | Enable compact placement policies for jobs requiring low latency networking. | `bool` | `true` | no | -| [exclusive](#input\_exclusive) | Exclusive job access to nodes | `bool` | `true` | no | -| [gpu\_count](#input\_gpu\_count) | Number of GPUs attached to the partition compute instances | `number` | `0` | no | -| [gpu\_type](#input\_gpu\_type) | Type of GPUs attached to the partition compute instances | `string` | `null` | no | -| [image\_hyperthreads](#input\_image\_hyperthreads) | Enable hyperthreading | `bool` | `false` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used by the compute VMs in this partition.
Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.
Custom images must comply with Slurm on GCP requirements. | `map(string)` |
{
"family": "schedmd-slurm-21-08-8-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [instance\_template](#input\_instance\_template) | Instance template to use to create partition instances | `string` | `null` | no | -| [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | -| [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes | `string` | `"c2-standard-60"` | no | -| [max\_node\_count](#input\_max\_node\_count) | Maximum number of nodes allowed in this partition | `number` | `50` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [partition\_name](#input\_partition\_name) | The name of the slurm partition | `string` | n/a | yes | -| [preemptible\_bursting](#input\_preemptible\_bursting) | Should use preemptibles to burst | `string` | `false` | no | -| [regional\_capacity](#input\_regional\_capacity) | If True, then create instances in the region that has available capacity. Specify the region in the zone field. | `bool` | `false` | no | -| [regional\_policy](#input\_regional\_policy) | locationPolicy definition for regional bulkInsert() | `any` | `{}` | no | -| [static\_node\_count](#input\_static\_node\_count) | Number of nodes to be statically created | `number` | `0` | no | -| [subnetwork\_name](#input\_subnetwork\_name) | The name of the pre-defined VPC subnet you want the nodes to attach to based on Region. | `string` | n/a | yes | -| [zone](#input\_zone) | Compute Platform zone where the notebook server will be located | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [partition](#output\_partition) | The partition structure containing all the set variables | - diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf b/community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf deleted file mode 100644 index 4fc43627e3..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-on-gcp-partition", ghpc_role = "compute" }) -} - -locals { - instance_name = lookup(var.instance_image, "name", null) - instance_family = lookup(var.instance_image, "family", null) - instance_image = ( - local.instance_name != null ? 
- "projects/${var.instance_image["project"]}/global/images/${local.instance_name}" : - "projects/${var.instance_image["project"]}/global/images/family/${local.instance_family}" - ) -} - - -output "partition" { - description = "The partition structure containing all the set variables" - value = { - name : var.partition_name - machine_type : var.machine_type - static_node_count : var.static_node_count - max_node_count : var.max_node_count - zone : var.zone - image : local.instance_image - image_hyperthreads : var.image_hyperthreads - compute_disk_type : var.compute_disk_type - compute_disk_size_gb : var.compute_disk_size_gb - compute_labels : local.labels - cpu_platform : var.cpu_platform - gpu_count : var.gpu_count - gpu_type : var.gpu_type - network_storage : var.network_storage - preemptible_bursting : var.preemptible_bursting - vpc_subnet : var.subnetwork_name - exclusive : var.exclusive - enable_placement : var.enable_placement - regional_capacity : var.regional_capacity - regional_policy : var.regional_policy - bandwidth_tier : var.bandwidth_tier - instance_template : var.instance_template - } -} diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf b/community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf deleted file mode 100644 index d94ee8f86a..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf +++ /dev/null @@ -1,184 +0,0 @@ -# -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "partition_name" { - description = "The name of the slurm partition" - type = string -} - -variable "machine_type" { - description = "Compute Platform machine type to use for this partition compute nodes" - type = string - default = "c2-standard-60" -} - -variable "static_node_count" { - description = "Number of nodes to be statically created" - type = number - default = 0 -} - -variable "max_node_count" { - description = "Maximum number of nodes allowed in this partition" - type = number - default = 50 -} - -variable "zone" { - description = "Compute Platform zone where the notebook server will be located" - type = string -} - -variable "instance_image" { - description = <<-EOD - Defines the image that will be used by the compute VMs in this partition. - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - Custom images must comply with Slurm on GCP requirements. - EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "schedmd-slurm-21-08-8-hpc-centos-7" - } - - validation { - condition = length(var.instance_image) == 0 || ( - can(var.instance_image["family"]) || can(var.instance_image["name"])) == can(var.instance_image["project"]) - error_message = "The \"project\" is required if \"family\" or \"name\" are provided in var.instance_image." 
- } - validation { - condition = length(var.instance_image) == 0 || can(var.instance_image["family"]) != can(var.instance_image["name"]) - error_message = "Exactly one of \"family\" and \"name\" must be provided in var.instance_image." - } -} - -variable "image_hyperthreads" { - description = "Enable hyperthreading" - type = bool - default = false -} - -variable "compute_disk_type" { - description = "Type of boot disk to create for the partition compute nodes" - type = string - default = "pd-standard" -} - -variable "compute_disk_size_gb" { - description = "Size of boot disk to create for the partition compute nodes" - type = number - default = 20 -} - -variable "labels" { - description = "Labels to add to partition compute instances. Key-value pairs." - type = map(string) - default = {} -} - -variable "cpu_platform" { - description = "The name of the minimum CPU platform that you want the instance to use." - type = string - default = null -} - -variable "gpu_count" { - description = "Number of GPUs attached to the partition compute instances" - type = number - default = 0 -} - -variable "gpu_type" { - description = "Type of GPUs attached to the partition compute instances" - type = string - default = null -} - -variable "network_storage" { - description = "An array of network attached storage mounts to be configured on the partition compute nodes." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - -variable "preemptible_bursting" { - description = "Should use preemptibles to burst" - type = string - default = false -} - -variable "subnetwork_name" { - description = "The name of the pre-defined VPC subnet you want the nodes to attach to based on Region." - type = string -} - -variable "exclusive" { - description = "Exclusive job access to nodes" - type = bool - default = true -} - -variable "enable_placement" { - description = "Enable compact placement policies for jobs requiring low latency networking." - type = bool - default = true -} - -variable "regional_capacity" { - description = "If True, then create instances in the region that has available capacity. Specify the region in the zone field." - type = bool - default = false -} - -variable "regional_policy" { - description = "locationPolicy definition for regional bulkInsert()" - type = any - default = {} -} - -variable "bandwidth_tier" { - description = <> --all-instances --region <> \ + --project <> --minimal-action replace +``` + +[replacement]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type + ## Known Issues When using OS Login with "external users" (outside of the Google Cloud @@ -177,7 +212,7 @@ limitations under the License. |------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | | [mig](#module\_mig) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources @@ -204,7 +239,7 @@ limitations under the License. 
| [instance\_image](#input\_instance\_image) | HTCondor execute point VM image

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no | | [labels](#input\_labels) | Labels to add to HTConodr execute points | `map(string)` | n/a | yes | | [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor execute points | `string` | `"n2-standard-4"` | no | -| [max\_size](#input\_max\_size) | Maximum size of the HTCondor execute point pool. | `number` | `100` | no | +| [max\_size](#input\_max\_size) | Maximum size of the HTCondor execute point pool. | `number` | `5` | no | | [metadata](#input\_metadata) | Metadata to add to HTCondor execute points | `map(string)` | `{}` | no | | [min\_idle](#input\_min\_idle) | Minimum number of idle VMs in the HTCondor pool (if pool reaches var.max\_size, this minimum is not guaranteed); set to ensure jobs beginning run more quickly. | `number` | `0` | no | | [name\_prefix](#input\_name\_prefix) | Name prefix given to hostnames in this group of execute points; must be unique across all instances of this module | `string` | n/a | yes | @@ -217,6 +252,7 @@ limitations under the License. | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. | `number` | `null` | no | +| [update\_policy](#input\_update\_policy) | Replacement policy for Access Point Managed Instance Group ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle) | `string` | `"OPPORTUNISTIC"` | no | | [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `list(string)` | `[]` | no | | [zones](#input\_zones) | Zone(s) in which execute points may be created. If not supplied, will default to all zones in var.region. 
| `list(string)` | `[]` | no | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index b7bc66442f..4a13c3dced 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -125,7 +125,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" project_id = var.project_id region = var.region @@ -199,7 +199,7 @@ module "mig" { max_unavailable_percent = null min_ready_sec = 300 minimal_action = "REPLACE" - type = "OPPORTUNISTIC" + type = var.update_policy }] } diff --git a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl index d31bdf2faa..19789f122e 100644 --- a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl +++ b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl @@ -1,20 +1,34 @@ # create directory for local condor_config customizations $config_dir = 'C:\Condor\config' -if(!(test-path -PathType container -Path $config_dir)) { +if(!(test-path -PathType container -Path $config_dir)) +{ New-Item -ItemType Directory -Path $config_dir } # update local condor_config if blueprint has changed $config_file = "$config_dir\50-ghpc-managed" -if (Test-Path -Path $config_file -PathType Leaf) { +if (Test-Path -Path $config_file -PathType Leaf) +{ $local_hash = gcloud --format="value(md5_hash)" storage hash $config_file -} else { +} +else +{ $local_hash = "INVALID-HASH" } $remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} -if ($local_hash -cne $remote_hash) { +if ($local_hash -cne $remote_hash) +{ Write-Output "Updating condor configuration" gcloud storage cp ${config_object} $config_file + if ($LASTEXITCODE -ne 0) + { + throw "Could not download HTCondor configuration; exiting startup script" + } Restart-Service condor } + +# ignored if service is already running; must be here to handle case where +# machine is rebooted, but configuration has previously been downloaded +# and service is disabled from automatic start +Start-Service condor diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 2991b25dc2..75fde2b84c 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -133,7 +133,7 @@ variable "target_size" { variable "max_size" { description = "Maximum size of the HTCondor execute point pool." type = number - default = 100 + default = 5 } variable "min_idle" { @@ -236,3 +236,13 @@ variable "shielded_instance_config" { enable_integrity_monitoring = true } } + +variable "update_policy" { + description = "Replacement policy for Access Point Managed Instance Group (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)" + type = string + default = "OPPORTUNISTIC" + validation { + condition = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) + error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"." 
+ } +} diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index e395fa31c8..101d0a0830 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.28.1" } } diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 0972b241bd..758428e0d2 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,7 +74,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | | [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index 18323b5ef5..92c269cf33 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 7d9387f3aa..29b04cf17f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -72,8 +72,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -136,11 +136,12 @@ No modules. | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | +| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. Must be "PERIODIC" or empty string to not use this feature. | `string` | `""` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | | [name](#input\_name) | Name of the node group. | `string` | `"ghpc"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf index a382a4232a..825f3c0a4a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf @@ -58,6 +58,7 @@ locals { gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type + maintenance_interval = var.maintenance_interval metadata = var.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 2a38a2e64b..04fc1900f6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "project_id" { description = "Project in which the HPC deployment will be created." @@ -96,7 +96,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { @@ -413,6 +413,18 @@ variable "additional_networks" { })) } +variable "maintenance_interval" { + description = "Specifies the frequency of planned maintenance events. Must be \"PERIODIC\" or empty string to not use this feature." 
+ default = "" + type = string + nullable = false + + validation { + condition = contains(["", "PERIODIC"], var.maintenance_interval) + error_message = "var.maintenance_interval must be the empty string or \"PERIODIC\"" + } +} + variable "disable_public_ips" { description = "If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access_config is set." type = bool diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 06f4753f02..4f366c4b9c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.28.1" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index de0dbdb267..51e49f42d6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -35,8 +35,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -69,7 +69,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.9.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.10.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 5bf9b93c91..6483eb2e0c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.10.2" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index 55d82b07d1..137023ee26 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index ca57874c31..ba8af335f8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -110,8 +110,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -146,7 +146,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.9.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.10.2 | ## Resources @@ -164,7 +164,7 @@ limitations under the License. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | +| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
maintenance_interval = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | | [partition\_startup\_scripts\_timeout](#input\_partition\_startup\_scripts\_timeout) | The timeout (seconds) applied to the partition startup script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 643e4f3ac1..80f6b7a6eb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -38,7 +38,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.10.2" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 698dbd5c60..7c06a1edb5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "deployment_name" { description = "Name of the deployment." @@ -240,14 +240,15 @@ variable "node_groups" { count = number type = string }) - instance_template = string - labels = map(string) - machine_type = string - metadata = map(string) - min_cpu_platform = string - on_host_maintenance = string - preemptible = bool - reservation_name = string + instance_template = string + labels = map(string) + machine_type = string + maintenance_interval = string + metadata = map(string) + min_cpu_platform = string + on_host_maintenance = string + preemptible = bool + reservation_name = string service_account = object({ email = string scopes = list(string) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index ad83afa3de..a899a36b26 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.28.1" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 7cd9719872..b1353aee7e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -60,7 +60,7 @@ No resources. | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-3-tf- | `string` | `null` | no | | [name](#input\_name) | Name of the nodeset tpu. | `string` | `"ghpc"` | no | | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | @@ -68,9 +68,9 @@ No resources. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted | `bool` | `true` | no | | [service\_account](#input\_service\_account) | Service account to attach to the TPU-vm. If none is given, the default service account and scopes will be used. |
object({
email = string
scopes = set(string)
})
| `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | n/a | yes | | [tf\_version](#input\_tf\_version) | Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | `"2.9.1"` | no | -| [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | +| [zone](#input\_zone) | Zone in which to create compute VMs. TPU partitions can only specify a single zone. | `string` | n/a | yes | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf index 18cccb4f4d..900d1d35b0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf @@ -34,8 +34,9 @@ locals { docker_image = var.docker_image enable_public_ip = !var.disable_public_ips - subnetwork = var.subnetwork_self_link - service_account = var.service_account - zone = var.zone + # TODO: rename to subnetwork_self_link, requires changes to the scripts + subnetwork = var.subnetwork_self_link + service_account = var.service_account + zone = var.zone } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml index 641832182d..13ea127b3c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + has_to_be_used: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index ac13f2dc5d..323ed8f655 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -85,7 +85,7 @@ variable "preserve_tpu" { } variable "zone" { - description = "Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones." + description = "Zone in which to create compute VMs. TPU partitions can only specify a single zone." type = string } @@ -96,7 +96,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-3-tf-" type = string default = null } @@ -104,7 +104,6 @@ variable "docker_image" { variable "subnetwork_self_link" { type = string description = "The name of the subnetwork to attach the TPU-vm of this nodeset to." 
- default = null } variable "service_account" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index ff69b26230..9aeab93394 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.28.1" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 829e9796da..ee0ab788ae 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -122,8 +122,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## Requirements @@ -153,7 +153,9 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | @@ -168,25 +170,25 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-3-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [name](#input\_name) | Name of the nodeset. | `string` | `"ghpc"` | no | +| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set | `string` | n/a | yes | | [node\_conf](#input\_node\_conf) | Map of Slurm node line configuration. | `map(any)` | `{}` | no | -| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no | +| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `10` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | +| [reservation\_name](#input\_reservation\_name) | Sets reservation affinity for instances created from this nodeset. | `string` | `null` | no | | [service\_account](#input\_service\_account) | Service account to attach to the compute instances. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | | [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 11acd4b963..810795864e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -18,6 +18,8 @@ locals { } locals { + name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 6) + additional_disks = [ for ad in var.additional_disks : { disk_name = ad.disk_name @@ -30,11 +32,14 @@ locals { } ] + public_access_config = var.disable_public_ips ? [] : [{ nat_ip = null, network_tier = null }] + access_config = length(var.access_config) == 0 ? local.public_access_config : var.access_config + nodeset = { node_count_static = var.node_count_static node_count_dynamic_max = var.node_count_dynamic_max node_conf = var.node_conf - nodeset_name = var.name + nodeset_name = local.name disk_auto_delete = var.disk_auto_delete disk_labels = merge(local.labels, var.disk_labels) @@ -48,7 +53,6 @@ locals { enable_confidential_vm = var.enable_confidential_vm enable_placement = var.enable_placement - enable_public_ip = !var.disable_public_ips enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm gpu = one(local.guest_accelerator) @@ -67,11 +71,13 @@ locals { source_image_family = local.source_image_family # requires source_image_logic.tf source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link + subnetwork_self_link = var.subnetwork_self_link + additional_networks = var.additional_networks + access_config = local.access_config tags = var.tags spot = var.enable_spot_vm termination_action = try(var.spot_instance_config.termination_action, null) + reservation_name = var.reservation_name zones = toset(concat([var.zone], tolist(var.zones))) zone_target_shape = var.zone_target_shape diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml index 641832182d..95b6d1c730 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml @@ -16,3 +16,6 @@ spec: requirements: services: [] +ghpc: + inject_module_id: name + has_to_be_used: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index 8759a268cc..532749e7ba 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-1-debian-11", - "slurm-gcp-6-1-hpc-rocky-linux-8", - "slurm-gcp-6-1-ubuntu-2004-lts", - "slurm-gcp-6-1-ubuntu-2204-lts-arm64", - "slurm-gcp-6-1-hpc-centos-7-k80", - "slurm-gcp-6-1-hpc-centos-7" + "slurm-gcp-6-3-debian-11", + "slurm-gcp-6-3-hpc-rocky-linux-8", + "slurm-gcp-6-3-ubuntu-2004-lts", + "slurm-gcp-6-3-ubuntu-2204-lts-arm64", + "slurm-gcp-6-3-hpc-centos-7-k80", + "slurm-gcp-6-3-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf 
b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index d00219cf73..3a88644b08 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -13,14 +13,8 @@ # limitations under the License. variable "name" { - description = "Name of the nodeset." + description = "Name of the nodeset. Automatically populated by the module id if not set" type = string - default = "ghpc" - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,5})$", var.name)) - error_message = "Nodeset name (var.name) must begin with a letter, be fully alphanumeric and be 6 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,5})$'." - } } variable "node_conf" { @@ -38,7 +32,7 @@ variable "node_count_static" { variable "node_count_dynamic_max" { description = "Maximum number of dynamic nodes allowed in this partition." type = number - default = 1 + default = 10 } ## VM Definition @@ -82,7 +76,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-1-hpc-rocky-linux-8" + family = "slurm-gcp-6-3-hpc-rocky-linux-8" project = "schedmd-slurm-public" } @@ -378,11 +372,46 @@ EOD variable "subnetwork_self_link" { type = string description = "Subnet to deploy to." - default = null } -variable "subnetwork_project" { - description = "The project the subnetwork belongs to." +variable "additional_networks" { + description = "Additional network interface details for GCE, if any." + default = [] + type = list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + nic_type = string + stack_type = string + queue_count = number + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + alias_ip_range = list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })) + })) +} + +variable "access_config" { + description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." + type = list(object({ + nat_ip = string + network_tier = string + })) + default = [] +} + +variable "reservation_name" { + description = <<-EOD + Sets reservation affinity for instances created from this nodeset. + EOD type = string - default = "" + default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 1c02561ddc..1ff5728890 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.28.1" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index c482f55f35..00731800cf 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -56,8 +56,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. 
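Two small pieces of logic in the v6-nodeset `main.tf` hunk above are worth calling out: the nodeset name is normalized to at most six lowercase alphanumeric characters, and a single ephemeral public IP is attached only when no explicit `access_config` is given and public IPs are not disabled. The following is a minimal standalone sketch of that behavior; the variable defaults and the sample name are invented for illustration.

```hcl
# Standalone sketch of the expressions from the v6-nodeset main.tf hunk;
# sample values are assumptions, not taken from the diff.
variable "disable_public_ips" { default = true }
variable "access_config"      { default = [] }

locals {
  # "compute_nodeset" -> strip non [a-z0-9] -> keep first 6 chars -> "comput"
  name = substr(replace("compute_nodeset", "/[^a-z0-9]/", ""), 0, 6)

  # An explicit access_config wins; otherwise attach one ephemeral public IP
  # unless public IPs are disabled.
  public_access_config = var.disable_public_ips ? [] : [{ nat_ip = null, network_tier = null }]
  access_config        = length(var.access_config) == 0 ? local.public_access_config : var.access_config
}
```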
-[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## Requirements @@ -85,8 +85,8 @@ No resources. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = optional(string, "")
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
}))
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml index 641832182d..13ea127b3c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + has_to_be_used: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index 56fda6e4d6..2190907599 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -85,7 +85,6 @@ variable "nodeset" { disk_type = optional(string) enable_confidential_vm = optional(bool, false) enable_placement = optional(bool, false) - enable_public_ip = optional(bool, false) enable_oslogin = optional(bool, true) enable_shielded_vm = optional(bool, false) gpu = optional(object({ @@ -113,13 +112,30 @@ variable "nodeset" { source_image_family = optional(string) source_image_project = optional(string) source_image = optional(string) - subnetwork_project = optional(string) - subnetwork = optional(string) + additional_networks = optional(list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + }))) + access_config = optional(list(object({ + nat_ip = string + network_tier = string + }))) + subnetwork_self_link = string spot = optional(bool, false) tags = optional(list(string), []) termination_action = optional(string) zones = optional(list(string), []) zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + reservation_name = optional(string) })) default = [] @@ -150,7 +166,7 @@ variable "nodeset_tpu" { zone = string data_disks = optional(list(string), []) docker_image = optional(string, "") - subnetwork = optional(string, "") + subnetwork = string service_account = optional(object({ email = optional(string) scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index b671307427..4fb6264eb6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.28.1" } } diff --git a/community/modules/database/slurm-cloudsql-federation/metadata.yaml b/community/modules/database/slurm-cloudsql-federation/metadata.yaml index 0db2ea4503..fc0cae0859 100644 --- a/community/modules/database/slurm-cloudsql-federation/metadata.yaml +++ b/community/modules/database/slurm-cloudsql-federation/metadata.yaml @@ -18,3 +18,4 @@ spec: services: - bigqueryconnection.googleapis.com - sqladmin.googleapis.com + - 
servicenetworking.googleapis.com diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 4c9c03e98b..fd1524e946 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.1" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index 3206752cb8..f825d0e6d2 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ b/community/modules/file-system/DDN-EXAScaler/README.md @@ -27,7 +27,7 @@ Toolkit, see the extended [Network Storage documentation](../../../../docs/netwo ## Mounting To mount the DDN EXAScaler Lustre file system you must first install the DDN -Luster client and then call the proper `mount` command. +Lustre client and then call the proper `mount` command. Both of these steps are automatically handled with the use of the `use` command in a selection of HPC Toolkit modules. See the [compatibility matrix][matrix] in diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index cdecd164d2..8526f45f39 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.28.1" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index 0ed9c0ee1e..32acf35767 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.28.1" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 65282a9121..c5c8fd51fc 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.28.1" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 48dafa2ba3..9e8f6bc869 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - 
module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.1" } } diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 883632dcb2..f3eae8670e 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.28.1" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index d76add2595..798ed45617 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.1" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 2620a02775..b6b090b6f1 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.28.1" } } diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index c1b3fb2bf7..809ba73fd1 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,7 +63,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | | [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | ## Resources @@ -85,7 +85,7 @@ No resources. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is from
family= \"debian-11\" and project = \"debian-cloud\". An alternative image is
from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "debian-11",
"project": "debian-cloud"
}
| no | +| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is from
family= \"debian-12\" and project = \"debian-cloud\". An alternative image is
from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "debian-12",
"project": "debian-cloud"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 98d8c83568..e90820bfc8 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml index e67f8f65d0..93efbfdb7e 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml @@ -26,6 +26,14 @@ - gdm3 grid_fn: NVIDIA-Linux-x86_64-510.85.02-grid.run grid_ver: vGPU14.2 + bookworm: + packages: + - build-essential + - gdebi-core + - mesa-utils + - gdm3 + grid_fn: NVIDIA-Linux-x86_64-535.154.05-grid.run + grid_ver: vGPU16.3 jammy: packages: - build-essential diff --git a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf index 276470a575..ba1fa5ea52 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf @@ -58,7 +58,7 @@ variable "network_storage" { variable "instance_image" { description = <<-EOD Image used to build chrome remote desktop node. The default image is from - family= \"debian-11\" and project = \"debian-cloud\". An alternative image is + family= \"debian-12\" and project = \"debian-cloud\". An alternative image is from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\". Expected Fields: @@ -69,7 +69,7 @@ variable "instance_image" { type = map(string) default = { project = "debian-cloud" - family = "debian-11" + family = "debian-12" } } diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md deleted file mode 100644 index 3389b002b7..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md +++ /dev/null @@ -1,132 +0,0 @@ -## Description - -> **Warning**: this module is now deprecated. We recommend using the Slurm on GCP V5 -> [schedmd-slurm-gcp-v5-controller](../schedmd-slurm-gcp-v5-controller/README.md) instead. - -This module creates a slurm controller node via the SchedMD/slurm-gcp -[controller] module. - -More information about Slurm On GCP can be found at the -[project's GitHub page][slurm-on-gcp] and in the -[Slurm on Google Cloud User Guide][slurm-ug]. 
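Aside on the chrome-remote-desktop hunks above: the default image family moves from `debian-11` to `debian-12` and the startup-script module is pinned to a release tag. A hedged sketch of overriding the image family in a direct Terraform call is shown below; the module ref, the chosen Ubuntu image, and the omitted inputs are assumptions for illustration and additional required inputs apply.

```hcl
# Illustrative only: ref and inputs are assumptions, not part of this diff.
module "remote_desktop" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/remote-desktop/chrome-remote-desktop?ref=v1.28.1"

  project_id      = var.project_id
  deployment_name = var.deployment_name

  # Override the new debian-12 default, e.g. to use Ubuntu 22.04 LTS instead.
  instance_image = {
    family  = "ubuntu-2204-lts"
    project = "ubuntu-os-cloud"
  }

  install_nvidia_driver = true
}
```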
- -The [user guide][slurm-ug] provides detailed instructions on customizing and -enhancing the Slurm on GCP cluster as well as recommendations on configuring the -controller for optimal performance at different scales. - -[controller]: https://github.com/SchedMD/slurm-gcp/tree/v4.2.0/tf/modules/controller -[slurm-ug]: https://goo.gle/slurm-gcp-user-guide. - -### Example - -```yaml -- id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - compute_partition - settings: - login_node_count: 1 -``` - -This creates a controller node connected to the primary subnetwork with 1 login -node (defined elsewhere). The controller will also have the `homefs` file system -mounted via the `use` field and manage one partition, also declared in the `use` -field. - -## GPU Support - -More information on GPU support in Slurm on GCP and other HPC Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform - -## License - - -Copyright 2022 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_cluster\_compute\_node](#module\_slurm\_cluster\_compute\_node) | github.com/SchedMD/slurm-gcp//tf/modules/compute/ | v4.2.1 | -| [slurm\_cluster\_controller](#module\_slurm\_cluster\_controller) | github.com/SchedMD/slurm-gcp//tf/modules/controller/ | v4.2.1 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [boot\_disk\_size](#input\_boot\_disk\_size) | Size of boot disk to create for the cluster controller node | `number` | `50` | no | -| [boot\_disk\_type](#input\_boot\_disk\_type) | Type of boot disk to create for the cluster controller node.
Choose from: pd-ssd, pd-standard, pd-balanced, pd-extreme.
pd-ssd is recommended if the controller is hosting the SlurmDB and NFS share.
If the SlurmDB and NFS share are not running on the controller, pd-standard is
recommended. See "Controller configuration recommendations" in the Slurm on
Google Cloud User Guide for more information:
https://goo.gle/slurm-gcp-user-guide | `string` | `"pd-ssd"` | no | -| [cloudsql](#input\_cloudsql) | Define an existing CloudSQL instance to use instead of instance-local MySQL |
object({
server_ip = string,
user = string,
password = string,
db_name = string
})
| `null` | no | -| [cluster\_name](#input\_cluster\_name) | Name of the cluster | `string` | `null` | no | -| [compute\_node\_scopes](#input\_compute\_node\_scopes) | Scopes to apply to compute nodes. | `list(string)` |
[
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/devstorage.read_only"
]
| no | -| [compute\_node\_service\_account](#input\_compute\_node\_service\_account) | Service Account for compute nodes. | `string` | `null` | no | -| [compute\_startup\_script](#input\_compute\_startup\_script) | Custom startup script to run on the compute nodes | `string` | `null` | no | -| [controller\_instance\_template](#input\_controller\_instance\_template) | Instance template to use to create controller instance | `string` | `null` | no | -| [controller\_machine\_type](#input\_controller\_machine\_type) | Compute Platform machine type to use in controller node creation. `c2-standard-4`
is recommended for clusters up to 50 nodes; for larger clusters see
"Controller configuration recommendations" in the Slurm on Google Cloud User
Guide: https://goo.gle/slurm-gcp-user-guide | `string` | `"c2-standard-4"` | no | -| [controller\_scopes](#input\_controller\_scopes) | Scopes to apply to the controller | `list(string)` |
[
"https://www.googleapis.com/auth/cloud-platform",
"https://www.googleapis.com/auth/devstorage.read_only"
]
| no | -| [controller\_secondary\_disk](#input\_controller\_secondary\_disk) | Create secondary disk mounted to controller node | `bool` | `false` | no | -| [controller\_secondary\_disk\_size](#input\_controller\_secondary\_disk\_size) | Size of disk for the secondary disk | `number` | `100` | no | -| [controller\_secondary\_disk\_type](#input\_controller\_secondary\_disk\_type) | Disk type (pd-ssd or pd-standard) for secondary disk | `string` | `"pd-ssd"` | no | -| [controller\_service\_account](#input\_controller\_service\_account) | Service Account for the controller | `string` | `null` | no | -| [controller\_startup\_script](#input\_controller\_startup\_script) | Custom startup script to run on the controller | `string` | `null` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment | `string` | n/a | yes | -| [disable\_compute\_public\_ips](#input\_disable\_compute\_public\_ips) | If set to true, create Cloud NAT gateway and enable IAP FW rules | `bool` | `true` | no | -| [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to true, create Cloud NAT gateway and enable IAP FW rules | `bool` | `false` | no | -| [instance\_image](#input\_instance\_image) | Slurm image to use for the controller instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.
Custom images must comply with Slurm on GCP requirements. | `map(string)` |
{
"family": "schedmd-slurm-21-08-8-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [intel\_select\_solution](#input\_intel\_select\_solution) | Configure the cluster to meet the performance requirement of the Intel Select Solution | `string` | `null` | no | -| [jwt\_key](#input\_jwt\_key) | Specific libjwt key to use | `any` | `null` | no | -| [labels](#input\_labels) | Labels to add to controller instance. Key-value pairs. | `map(string)` | `{}` | no | -| [login\_node\_count](#input\_login\_node\_count) | Number of login nodes in the cluster | `number` | `0` | no | -| [munge\_key](#input\_munge\_key) | Specific munge key to use | `any` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [partition](#input\_partition) | An array of configurations for specifying multiple machine types residing in their own Slurm partitions. |
list(object({
name = string,
machine_type = string,
max_node_count = number,
zone = string,
image = string,
image_hyperthreads = bool,
compute_disk_type = string,
compute_disk_size_gb = number,
compute_labels = any,
cpu_platform = string,
gpu_type = string,
gpu_count = number,
network_storage = list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string
})),
preemptible_bursting = string,
vpc_subnet = string,
exclusive = bool,
enable_placement = bool,
regional_capacity = bool,
regional_policy = any,
instance_template = string,
bandwidth_tier = string,
static_node_count = number
}))
| n/a | yes | -| [project\_id](#input\_project\_id) | Compute Platform project that will host the Slurm cluster | `string` | n/a | yes | -| [region](#input\_region) | Compute Platform region where the Slurm cluster will be located | `string` | n/a | yes | -| [shared\_vpc\_host\_project](#input\_shared\_vpc\_host\_project) | Host project of shared VPC | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | Custom startup script to run on compute nodes and controller.
`controller_startup_script` for the controller and `compute_startup_script` for compute nodes take precedence if specified.
This variable allows Slurm to [use](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules#use-optional) the [startup\_script](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/scripts/startup-script) module. | `string` | `null` | no | -| [subnetwork\_name](#input\_subnetwork\_name) | The name of the pre-defined VPC subnet you want the nodes to attach to based on Region. | `string` | `null` | no | -| [suspend\_time](#input\_suspend\_time) | Idle time (in sec) to wait before nodes go away | `number` | `300` | no | -| [zone](#input\_zone) | Compute Platform zone where the servers will be located | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [controller\_name](#output\_controller\_name) | Name of the controller node | - diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf deleted file mode 100644 index 76e74fca34..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-on-gcp-controller", ghpc_role = "scheduler" }) -} - -locals { - controller_startup_script = var.controller_startup_script != null ? var.controller_startup_script : var.startup_script - compute_startup_script = var.compute_startup_script != null ? var.compute_startup_script : var.startup_script - cluster_name = var.cluster_name != null ? var.cluster_name : "slurm-${var.deployment_name}" - - instance_name = lookup(var.instance_image, "name", null) - instance_family = lookup(var.instance_image, "family", null) - instance_image = ( - local.instance_name != null ? 
- "projects/${var.instance_image["project"]}/global/images/${local.instance_name}" : - "projects/${var.instance_image["project"]}/global/images/family/${local.instance_family}" - ) -} - -module "slurm_cluster_controller" { - source = "github.com/SchedMD/slurm-gcp//tf/modules/controller/?ref=v4.2.1" - boot_disk_size = var.boot_disk_size - boot_disk_type = var.boot_disk_type - image = local.instance_image - instance_template = var.controller_instance_template - cluster_name = local.cluster_name - compute_node_scopes = var.compute_node_scopes - compute_node_service_account = var.compute_node_service_account - disable_compute_public_ips = var.disable_compute_public_ips - disable_controller_public_ips = var.disable_controller_public_ips - labels = local.labels - login_network_storage = var.network_storage - login_node_count = var.login_node_count - machine_type = var.controller_machine_type - munge_key = var.munge_key - jwt_key = var.jwt_key - network_storage = var.network_storage - partitions = var.partition - controller_startup_script = local.controller_startup_script - compute_startup_script = local.compute_startup_script - project = var.project_id - region = var.region - secondary_disk = var.controller_secondary_disk - secondary_disk_size = var.controller_secondary_disk_size - secondary_disk_type = var.controller_secondary_disk_type - shared_vpc_host_project = var.shared_vpc_host_project - scopes = var.controller_scopes - service_account = var.controller_service_account - subnetwork_name = var.subnetwork_name - suspend_time = var.suspend_time - zone = var.zone - intel_select_solution = var.intel_select_solution - cloudsql = var.cloudsql -} - -module "slurm_cluster_compute_node" { - source = "github.com/SchedMD/slurm-gcp//tf/modules/compute/?ref=v4.2.1" - project = var.project_id - cluster_name = local.cluster_name - region = var.region - zone = var.zone - controller_name = module.slurm_cluster_controller.controller_node_name - controller_secondary_disk = var.controller_secondary_disk - disable_compute_public_ips = var.disable_compute_public_ips - network_storage = var.network_storage - partitions = var.partition - compute_startup_script = local.compute_startup_script - scopes = var.compute_node_scopes - service_account = var.compute_node_service_account - shared_vpc_host_project = var.shared_vpc_host_project - subnetwork_name = var.subnetwork_name - intel_select_solution = var.intel_select_solution - munge_key = var.munge_key -} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml deleted file mode 100644 index 4c2f23a8d7..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -spec: - requirements: - services: - - compute.googleapis.com diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf deleted file mode 100644 index 81be162e5b..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "controller_name" { - description = "Name of the controller node" - value = module.slurm_cluster_controller.controller_node_name -} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf deleted file mode 100644 index c5ce7900f3..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf +++ /dev/null @@ -1,294 +0,0 @@ -# -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "boot_disk_size" { - description = "Size of boot disk to create for the cluster controller node" - type = number - default = 50 -} - -variable "boot_disk_type" { - description = <<-EOT - Type of boot disk to create for the cluster controller node. - Choose from: pd-ssd, pd-standard, pd-balanced, pd-extreme. - pd-ssd is recommended if the controller is hosting the SlurmDB and NFS share. - If SlurmDB and NFS share are not running on the controller, pd-standard is - recommended. See "Controller configuration recommendations" in the Slurm on - Google Cloud User Guide for more information: - https://goo.gle/slurm-gcp-user-guide - EOT - type = string - default = "pd-ssd" -} - -variable "instance_image" { - description = <<-EOD - Slurm image to use for the controller instance. - - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - Custom images must comply with Slurm on GCP requirements. - EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "schedmd-slurm-21-08-8-hpc-centos-7" - } - - validation { - condition = length(var.instance_image) == 0 || ( - can(var.instance_image["family"]) || can(var.instance_image["name"])) == can(var.instance_image["project"]) - error_message = "The \"project\" is required if \"family\" or \"name\" are provided in var.instance_image." 
- } - validation { - condition = length(var.instance_image) == 0 || can(var.instance_image["family"]) != can(var.instance_image["name"]) - error_message = "Exactly one of \"family\" and \"name\" must be provided in var.instance_image." - } -} - -variable "controller_instance_template" { - description = "Instance template to use to create controller instance" - type = string - default = null -} - -variable "cluster_name" { - description = "Name of the cluster" - type = string - default = null -} - -variable "deployment_name" { - description = "Name of the deployment" - type = string -} - -variable "compute_node_scopes" { - description = "Scopes to apply to compute nodes." - type = list(string) - default = [ - "https://www.googleapis.com/auth/monitoring.write", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/devstorage.read_only", - ] -} - -variable "compute_node_service_account" { - description = "Service Account for compute nodes." - type = string - default = null -} - -variable "disable_controller_public_ips" { - description = "If set to true, create Cloud NAT gateway and enable IAP FW rules" - type = bool - default = false -} - -variable "disable_compute_public_ips" { - description = "If set to true, create Cloud NAT gateway and enable IAP FW rules" - type = bool - default = true -} - -variable "labels" { - description = "Labels to add to controller instance. Key-value pairs." - type = map(string) - default = {} -} - -variable "login_node_count" { - description = "Number of login nodes in the cluster" - type = number - default = 0 -} - -variable "controller_machine_type" { - description = <<-EOT - Compute Platform machine type to use in controller node creation. `c2-standard-4` - is recommended for clusters up to 50 nodes, for larger clusters see - "Controller configuration recommendations" in the Slurm on Google Cloud User - Guide: https://goo.gle/slurm-gcp-user-guide - EOT - type = string - default = "c2-standard-4" -} - -variable "munge_key" { - description = "Specific munge key to use" - type = any - default = null -} - -variable "jwt_key" { - description = "Specific libjwt key to use" - type = any - default = null -} - -variable "network_storage" { - description = "An array of network attached storage mounts to be configured on all instances." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - -variable "partition" { - description = "An array of configurations for specifying multiple machine types residing in their own Slurm partitions." 
- type = list(object({ - name = string, - machine_type = string, - max_node_count = number, - zone = string, - image = string, - image_hyperthreads = bool, - compute_disk_type = string, - compute_disk_size_gb = number, - compute_labels = any, - cpu_platform = string, - gpu_type = string, - gpu_count = number, - network_storage = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string - })), - preemptible_bursting = string, - vpc_subnet = string, - exclusive = bool, - enable_placement = bool, - regional_capacity = bool, - regional_policy = any, - instance_template = string, - bandwidth_tier = string, - static_node_count = number - })) -} - -variable "controller_startup_script" { - description = "Custom startup script to run on the controller" - type = string - default = null -} - -variable "compute_startup_script" { - description = "Custom startup script to run on the compute nodes" - type = string - default = null -} - -variable "startup_script" { - description = < **Warning**: this module is now deprecated. We recommend using the Slurm on GCP V5 -> [schedmd-slurm-gcp-v5-login](../schedmd-slurm-gcp-v5-login/README.md) instead. - -This module creates a login node for a Slurm cluster based on the -[Slurm on GCP][slurm-on-gcp] terraform [login module][login-module]. The login -node is used in conjunction with the -[Slurm controller](../SchedMD-slurm-on-gcp-controller). - -> **_Warning:_**: Slurm handles startup scripts differently from virtual -> machines. This will not work in conjunction with the -> [startup_script](../../../scripts/startup-script/README.md) module. - -[login-module]: https://github.com/SchedMD/slurm-gcp/tree/master/tf/modules/login - -### Example - -```yaml -- id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller - settings: - login_machine_type: n2-standard-4 -``` - -This creates a Slurm login node which is: - -* connected to the primary subnet of network1 via `use` -* mounted to the homefs filesystem via `use` -* associated with the `slurm_controller` module as the slurm controller via - `use` -* of VM machine type `n2-standard-4` - -## GPU Support - -More information on GPU support in Slurm on GCP and other HPC Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform - -## License - - -Copyright 2022 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. 
- -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_cluster\_login\_node](#module\_slurm\_cluster\_login\_node) | github.com/SchedMD/slurm-gcp//tf/modules/login/ | v4.2.1 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [boot\_disk\_size](#input\_boot\_disk\_size) | Size of boot disk to create for the cluster login node | `number` | `20` | no | -| [boot\_disk\_type](#input\_boot\_disk\_type) | Type of boot disk to create for the cluster login node | `string` | `"pd-standard"` | no | -| [cluster\_name](#input\_cluster\_name) | Name of the cluster | `string` | `null` | no | -| [controller\_name](#input\_controller\_name) | FQDN or IP address of the controller node | `string` | n/a | yes | -| [controller\_secondary\_disk](#input\_controller\_secondary\_disk) | Create secondary disk mounted to controller node | `bool` | `false` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment | `string` | n/a | yes | -| [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to true, create Cloud NAT gateway and enable IAP FW rules | `bool` | `false` | no | -| [instance\_image](#input\_instance\_image) | Disk OS image with Slurm preinstalled to use for login node.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.
Custom images must comply with Slurm on GCP requirements. | `map(string)` |
{
"family": "schedmd-slurm-21-08-8-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [labels](#input\_labels) | Labels to add to login instances. Key-value pairs. | `map(string)` | `{}` | no | -| [login\_instance\_template](#input\_login\_instance\_template) | Instance template to use to create controller instance | `string` | `null` | no | -| [login\_machine\_type](#input\_login\_machine\_type) | Machine type to use for login node instances. | `string` | `"n2-standard-2"` | no | -| [login\_node\_count](#input\_login\_node\_count) | Number of login nodes in the cluster | `number` | `1` | no | -| [login\_scopes](#input\_login\_scopes) | Scopes to apply to login nodes. | `list(string)` |
[
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/devstorage.read_only"
]
| no | -| [login\_service\_account](#input\_login\_service\_account) | Service Account for compute nodes. | `string` | `null` | no | -| [login\_startup\_script](#input\_login\_startup\_script) | Custom startup script to run on the login node | `string` | `null` | no | -| [munge\_key](#input\_munge\_key) | Specific munge key to use | `any` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [region](#input\_region) | Compute Platform region where the Slurm cluster will be located | `string` | n/a | yes | -| [shared\_vpc\_host\_project](#input\_shared\_vpc\_host\_project) | Host project of shared VPC | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | Custom startup script to run on the login node.
Will be ignored if `login_startup_script` is specified.
This variable allows Slurm to [use](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules#use-optional) the [startup\_script](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/scripts/startup-script) module. | `string` | `null` | no | -| [subnet\_depend](#input\_subnet\_depend) | Used as a dependency between the network and instances | `string` | `""` | no | -| [subnetwork\_name](#input\_subnetwork\_name) | The name of the pre-defined VPC subnet you want the nodes to attach to based on Region. | `string` | `null` | no | -| [zone](#input\_zone) | Compute Platform zone where the notebook server will be located | `string` | n/a | yes | - -## Outputs - -No outputs. - diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf deleted file mode 100644 index 076225cbe9..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-on-gcp-login-node", ghpc_role = "scheduler" }) -} - -locals { - login_startup_script = var.login_startup_script != null ? var.login_startup_script : var.startup_script - - instance_name = lookup(var.instance_image, "name", null) - instance_family = lookup(var.instance_image, "family", null) - instance_image = ( - local.instance_name != null ? - "projects/${var.instance_image["project"]}/global/images/${local.instance_name}" : - "projects/${var.instance_image["project"]}/global/images/family/${local.instance_family}" - ) -} - -module "slurm_cluster_login_node" { - source = "github.com/SchedMD/slurm-gcp//tf/modules/login/?ref=v4.2.1" - boot_disk_size = var.boot_disk_size - boot_disk_type = var.boot_disk_type - image = local.instance_image - instance_template = var.login_instance_template - cluster_name = ( - var.cluster_name != null - ? 
var.cluster_name - : "slurm-${var.deployment_name}" - ) - controller_name = var.controller_name - controller_secondary_disk = var.controller_secondary_disk - disable_login_public_ips = var.disable_login_public_ips - labels = local.labels - login_network_storage = var.network_storage - machine_type = var.login_machine_type - munge_key = var.munge_key - network_storage = var.network_storage - node_count = var.login_node_count - region = var.region - scopes = var.login_scopes - service_account = var.login_service_account - shared_vpc_host_project = var.shared_vpc_host_project - subnet_depend = var.subnet_depend - subnetwork_name = var.subnetwork_name - zone = var.zone - login_startup_script = local.login_startup_script -} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml deleted file mode 100644 index 4c2f23a8d7..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - compute.googleapis.com diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf deleted file mode 100644 index 331ce1ab28..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "boot_disk_size" { - description = "Size of boot disk to create for the cluster login node" - type = number - default = 20 -} - -variable "boot_disk_type" { - description = "Type of boot disk to create for the cluster login node" - type = string - default = "pd-standard" -} - -variable "instance_image" { - description = <<-EOD - Disk OS image with Slurm preinstalled to use for login node. - - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - Custom images must comply with Slurm on GCP requirements. 
- EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "schedmd-slurm-21-08-8-hpc-centos-7" - } - - validation { - condition = length(var.instance_image) == 0 || ( - can(var.instance_image["family"]) || can(var.instance_image["name"])) == can(var.instance_image["project"]) - error_message = "The \"project\" is required if \"family\" or \"name\" are provided in var.instance_image." - } - validation { - condition = length(var.instance_image) == 0 || can(var.instance_image["family"]) != can(var.instance_image["name"]) - error_message = "Exactly one of \"family\" and \"name\" must be provided in var.instance_image." - } -} - -variable "login_instance_template" { - description = "Instance template to use to create controller instance" - type = string - default = null -} - -variable "cluster_name" { - description = "Name of the cluster" - type = string - default = null -} - -variable "controller_name" { - description = "FQDN or IP address of the controller node" - type = string -} - -variable "controller_secondary_disk" { - description = "Create secondary disk mounted to controller node" - type = bool - default = false -} - -variable "deployment_name" { - description = "Name of the deployment" - type = string -} - -variable "disable_login_public_ips" { - description = "If set to true, create Cloud NAT gateway and enable IAP FW rules" - type = bool - default = false -} - -variable "labels" { - description = "Labels to add to login instances. Key-value pairs." - type = map(string) - default = {} -} - -variable "login_machine_type" { - description = "Machine type to use for login node instances." - type = string - default = "n2-standard-2" -} - -variable "munge_key" { - description = "Specific munge key to use" - type = any - default = null -} - -variable "network_storage" { - description = " An array of network attached storage mounts to be configured on all instances." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - -variable "login_node_count" { - description = "Number of login nodes in the cluster" - type = number - default = 1 -} - -variable "region" { - description = "Compute Platform region where the Slurm cluster will be located" - type = string -} - -variable "login_scopes" { - description = "Scopes to apply to login nodes." - type = list(string) - default = [ - "https://www.googleapis.com/auth/monitoring.write", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/devstorage.read_only", - ] -} - -variable "login_service_account" { - description = "Service Account for compute nodes." - type = string - default = null -} - -variable "shared_vpc_host_project" { - description = "Host project of shared VPC" - type = string - default = null -} - -variable "subnet_depend" { - description = "Used as a dependency between the network and instances" - type = string - default = "" -} - -variable "subnetwork_name" { - description = "The name of the pre-defined VPC subnet you want the nodes to attach to based on Region." 
- type = string - default = null -} - -variable "zone" { - description = "Compute Platform zone where the notebook server will be located" - type = string -} - -variable "login_startup_script" { - description = "Custom startup script to run on the login node" - type = string - default = null -} - -variable "startup_script" { - description = < [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [services\_ip\_range\_name](#input\_services\_ip\_range\_name) | The name of the secondary subnet range to use for services. | `string` | `"services"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to host the cluster in. | `string` | n/a | yes | +| [system\_node\_pool\_enable\_secure\_boot](#input\_system\_node\_pool\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [system\_node\_pool\_enabled](#input\_system\_node\_pool\_enabled) | Create a system node pool. | `bool` | `true` | no | +| [system\_node\_pool\_image\_type](#input\_system\_node\_pool\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | +| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | | [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | diff --git a/community/modules/scheduler/gke-cluster/main.tf b/community/modules/scheduler/gke-cluster/main.tf index 5237f94b71..54d3f271c1 100644 --- a/community/modules/scheduler/gke-cluster/main.tf +++ b/community/modules/scheduler/gke-cluster/main.tf @@ -195,6 +195,7 @@ resource "google_container_node_pool" "system_node_pools" { } node_config { + labels = var.system_node_pool_kubernetes_labels resource_labels = local.labels service_account = var.service_account_email oauth_scopes = var.service_account_scopes @@ -209,15 +210,15 @@ resource "google_container_node_pool" "system_node_pools" { # # We use COS_CONTAINERD to be compatible with (optional) gVisor. # https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods - image_type = "COS_CONTAINERD" + image_type = var.system_node_pool_image_type shielded_instance_config { - enable_secure_boot = true + enable_secure_boot = var.system_node_pool_enable_secure_boot enable_integrity_monitoring = true } gvnic { - enabled = true + enabled = var.system_node_pool_image_type == "COS_CONTAINERD" } # Implied by Workload Identity diff --git a/community/modules/scheduler/gke-cluster/variables.tf b/community/modules/scheduler/gke-cluster/variables.tf index 4b12f14852..5ace7cae91 100644 --- a/community/modules/scheduler/gke-cluster/variables.tf +++ b/community/modules/scheduler/gke-cluster/variables.tf @@ -171,6 +171,25 @@ variable "system_node_pool_taints" { }] } +variable "system_node_pool_kubernetes_labels" { + description = <<-EOT + Kubernetes labels to be applied to each node in the node group. Key-value pairs. + (The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) + EOT + type = map(string) + default = null +} +variable "system_node_pool_image_type" { + description = "The default image type used by NAP once a new node pool is being created. Use either COS_CONTAINERD or UBUNTU_CONTAINERD." + type = string + default = "COS_CONTAINERD" +} +variable "system_node_pool_enable_secure_boot" { + description = "Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info." + type = bool + default = true +} + variable "enable_private_nodes" { description = "(Beta) Whether nodes have internal IP addresses only." type = bool diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index 010cd59822..9049a6e4b4 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.28.1" } } diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index c3711154ab..4d7fe60f1c 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -27,6 +27,65 @@ the functionality in these references. 
Their usage is demonstrated in the [htcondor-pool-secrets]: ../htcondor-pool-secrets/README.md [IDTOKEN]: https://htcondor.readthedocs.io/en/latest/admin-manual/security.html#introducing-idtokens +## Behavior of Managed Instance Group (MIG) + +A regional [MIG][mig] is used to provision the Access Point, although only +1 node will ever be active at a time. By default, the node will be provisioned +in any of the zones available in that region, however, it can be constrained to +run in fewer zones (or a single zone) using [var.zones](#input_zones). + +When the configuration of the Central Manager is changed, the MIG can be +configured to [replace the VM][replacement] using a "proactive" or +"opportunistic" policy. By default, the Access Point replacement policy is +opportunistic. In practice, this means that the Access Point will _NOT_ be +automatically replaced by Terraform when changes to the instance template / +HTCondor configuration are made. The Access Point is _NOT_ safe to replace +automatically as its local storage contains the state of the job queue. By +default, the Access Point will be replaced only when: + +- intentionally by issuing an update via Cloud Console or using gcloud (below) +- the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular + Google Cloud maintenance) + +For example, to manually update all instances in a MIG: + +```text +gcloud compute instance-groups managed update-instances \ + <> --all-instances --region <> \ + --project <> --minimal-action replace +``` + +This mode can be switched to proactive (automatic) replacement by setting +[var.update_policy](#input_update_policy) to "PROACTIVE". In this case we +recommend the use of Filestore to store the job queue state ("spool") and +setting [var.spool_parent_dir][#input_spool_parent_dir] to its mount point: + +```yaml + - id: spoolfs + source: modules/file-system/filestore + use: + - network1 + settings: + filestore_tier: ENTERPRISE + local_mount: /shared + +... + + - id: htcondor_access + source: community/modules/scheduler/htcondor-access-point + use: + - network1 + - spoolfs + - htcondor_secrets + - htcondor_setup + - htcondor_cm + - htcondor_execute_point_group + settings: + spool_parent_dir: /shared +``` + +[replacement]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type + Copyright 2023 Google LLC @@ -63,7 +122,7 @@ limitations under the License. |------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources @@ -106,6 +165,7 @@ limitations under the License. | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
object({
enable_secure_boot = bool
enable_vtpm = bool
enable_integrity_monitoring = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | +| [update\_policy](#input\_update\_policy) | Replacement policy for Access Point Managed Instance Group ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle) | `string` | `"OPPORTUNISTIC"` | no | | [zones](#input\_zones) | Zone(s) in which access point may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 888fa44cd6..142a16eb35 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -143,7 +143,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" project_id = var.project_id region = var.region @@ -210,14 +210,14 @@ module "htcondor_ap" { update_policy = [{ instance_redistribution_type = "NONE" - replacement_method = "SUBSTITUTE" - max_surge_fixed = length(local.zones) + replacement_method = "RECREATE" # preserves hostnames (necessary for PROACTIVE replacement) + max_surge_fixed = 0 # must be 0 to preserve hostnames max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 minimal_action = "REPLACE" - type = "OPPORTUNISTIC" + type = var.update_policy }] stateful_ips = [{ diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index 3f80cb6afd..292596f672 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -216,3 +216,13 @@ variable "shielded_instance_config" { enable_integrity_monitoring = true } } + +variable "update_policy" { + description = "Replacement policy for Access Point Managed Instance Group (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)" + type = string + default = "OPPORTUNISTIC" + validation { + condition = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) + error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"." 
+ } +} diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index de3bf85848..94fc3499dc 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.28.1" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 57186e9ea8..0912b0b1c5 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -31,9 +31,18 @@ A regional [MIG][mig] is used to provision the central manager, although only in any of the zones available in that region, however, it can be constrained to run in fewer zones (or a single zone) using [var.zones](#input_zones). -The VM replacement policy is set to [opportunistic]. In practice, this means -that an active VM will not be replaced by Terraform actions, but may be -replaced when either: +When the configuration of the Central Manager is changed, the MIG can be +configured to [replace the VM][replacement] using a "proactive" or +"opportunistic" policy. By default, the Central Manager replacement policy is +set to proactive. In practice, this means that the Central Manager will be +replaced by Terraform when changes to the instance template / HTCondor +configuration are made. The Central Manager is safe to replace automatically as +it gathers its state information from periodic messages exchanged with the rest +of the HTCondor pool. + +This mode can be configured by setting [var.update_policy](#input_update_policy) +to either "PROACTIVE" (default) or "OPPORTUNISTIC". If set to opportunistic +replacement, the Central Manager will be replaced only when: - intentionally by issuing an update via Cloud Console or using gcloud (below) - the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular @@ -47,7 +56,7 @@ gcloud compute instance-groups managed update-instances \ --project <> --minimal-action replace ``` -[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type +[replacement]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type ## Limiting inter-zone egress @@ -99,7 +108,7 @@ limitations under the License. |------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources @@ -135,6 +144,7 @@ limitations under the License. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
object({
enable_secure_boot = bool
enable_vtpm = bool
enable_integrity_monitoring = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | +| [update\_policy](#input\_update\_policy) | Replacement policy for Central Manager ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle). | `string` | `"PROACTIVE"` | no | | [zones](#input\_zones) | Zone(s) in which central manager may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 66d78c4059..a47cb0ed49 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -110,7 +110,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" project_id = var.project_id region = var.region @@ -177,14 +177,14 @@ module "htcondor_cm" { update_policy = [{ instance_redistribution_type = "NONE" - replacement_method = "SUBSTITUTE" - max_surge_fixed = length(local.zones) + replacement_method = "RECREATE" # preserves hostnames (necessary for PROACTIVE replacement) + max_surge_fixed = 0 # must be 0 to preserve hostnames max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 minimal_action = "REPLACE" - type = "OPPORTUNISTIC" + type = var.update_policy }] stateful_ips = [{ diff --git a/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl index d6a6d451b5..1f8089cc1b 100644 --- a/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl +++ b/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl @@ -20,6 +20,11 @@ use role:get_htcondor_central_manager CONDOR_HOST = $(IPV4_ADDRESS) # Central Manager configuration settings +# https://htcondor.readthedocs.io/en/23.0/admin-manual/configuration-macros.html#condor-collector-configuration-file-entries +# https://htcondor.readthedocs.io/en/23.0/admin-manual/configuration-macros.html#condor-negotiator-configuration-file-entries +# set classad lifetime (expiration) to ~5x the update interval for all daemons +# defaults to 900s +CLASSAD_LIFETIME = 180 COLLECTOR_UPDATE_INTERVAL = 30 NEGOTIATOR_UPDATE_INTERVAL = 30 NEGOTIATOR_DEPTH_FIRST = True diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index 885df7567b..b99d29c779 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -169,3 +169,13 @@ variable "shielded_instance_config" { enable_integrity_monitoring = true } } + +variable "update_policy" { + description = "Replacement policy for Central Manager (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)." 
+ type = string + default = "PROACTIVE" + validation { + condition = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) + error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"." + } +} diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index b3e74b2579..9245cd3c5f 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.28.1" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl index ec6773053c..04c96291ee 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl +++ b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl @@ -1,6 +1,10 @@ +Set-StrictMode -Version latest +$ErrorActionPreference = 'Stop' + $config_dir = 'C:\Condor\config' -if(!(test-path -PathType container -Path $config_dir)) { - New-Item -ItemType Directory -Path $config_dir +if(!(test-path -PathType container -Path $config_dir)) +{ + New-Item -ItemType Directory -Path $config_dir } $config_file = "$config_dir\51-ghpc-trust-domain" @@ -15,3 +19,8 @@ Set-Content -Path "$config_file" -Value "$config_string" # obtain IDTOKEN for authentication by StartD to Central Manager gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} ` --out-file C:\condor\tokens.d\condor@${trust_domain} + +if ($LASTEXITCODE -ne 0) +{ + throw "Could not download HTCondor IDTOKEN; exiting startup script" +} diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 93df6ec656..ba70633daa 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.28.1" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/htcondor-setup/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf index 1e13bdbd36..1b19a3f751 100644 --- a/community/modules/scheduler/htcondor-setup/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.28.1" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 92c684d4d6..1d40e411bd 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,7 +74,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | | [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index 82e335936c..5801bc15c0 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 64911acfe4..9e1b047655 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -72,7 +72,7 @@ No providers. | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | | [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.22.1 | | [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 622a924dff..250e1ac9fc 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 213c12975e..34e96043f6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -17,14 +17,14 @@ controller for optimal performance at different scales. 
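A pattern worth calling out from the pbspro-client and pbspro-server hunks just above: the embedded `startup-script` source moves from a bare commit hash (`50644b2`) to a release tag plus a shallow-clone argument (`?ref=v1.27.0&depth=1`). The sketch below shows only that source form; the module label and the non-source inputs are placeholders.

```hcl
# Sketch: pin a nested module to a release tag and request a shallow clone.
# depth=1 asks Terraform's git fetcher to skip history, which speeds up
# `terraform init` without changing the code that is checked out.
module "client_startup_script" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1"

  project_id      = "my-project-id" # placeholder
  deployment_name = "pbspro"        # placeholder
  region          = "us-central1"   # placeholder
  labels          = {}              # placeholder

  runners = [] # the real modules pass their PBS install and config runners here
}
```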
> > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -94,12 +94,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster#optional ## Custom Images @@ -178,8 +178,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -215,8 +215,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.9.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.9.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.10.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.10.2 | ## Resources @@ -248,8 +248,8 @@ limitations under the License. 
| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enable loading of cluster job usage into big query. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | +| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | | [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | @@ -259,7 +259,7 @@ limitations under the License. | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | @@ -271,7 +271,7 @@ limitations under the License. | [network\_self\_link](#input\_network\_self\_link) | Network to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | +| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
maintenance_interval = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl similarity index 100% rename from community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurm.conf.tpl rename to community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurmdbd.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl similarity index 100% rename from community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurmdbd.conf.tpl rename to community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl new file mode 100644 index 0000000000..5ae4184db3 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl @@ -0,0 +1,70 @@ +# slurm.conf +# https://slurm.schedmd.com/slurm.conf.html +# https://slurm.schedmd.com/configurator.html + +ProctrackType=proctrack/cgroup +SlurmctldPidFile=/var/run/slurm/slurmctld.pid +SlurmdPidFile=/var/run/slurm/slurmd.pid +TaskPlugin=task/affinity,task/cgroup +MaxNodeCount=64000 + +# +# +# SCHEDULING +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +# +# +# LOGGING AND ACCOUNTING +AccountingStoreFlags=job_comment +JobAcctGatherFrequency=30 +JobAcctGatherType=jobacct_gather/cgroup +SlurmctldDebug=info +SlurmdDebug=info +DebugFlags=Power + +# +# +# TIMERS +MessageTimeout=60 +BatchStartTimeout=600 +PrologEpilogTimeout=600 +PrologFlags=Contain + +################################################################################ +# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # +################################################################################ + +SlurmctldHost={control_host}({control_addr}) + +AuthType=auth/munge +AuthInfo=cred_expire=120 +AuthAltTypes=auth/jwt +CredType=cred/munge +MpiDefault={mpi_default} +ReturnToService=2 +SlurmctldPort={control_host_port} +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurmd +SlurmUser=slurm +StateSaveLocation={state_save} + +# +# +# LOGGING AND ACCOUNTING +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={control_host} +ClusterName={name} +SlurmctldLogFile={slurmlog}/slurmctld.log +SlurmdLogFile={slurmlog}/slurmd-%n.log + +# +# +# GENERATED CLOUD CONFIGURATIONS +include cloud.conf + +################################################################################ +# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # +################################################################################ diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 834b00240b..fbbc0c0b5c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -55,7 +55,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.9.1" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.10.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -92,7 +92,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.10.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index a36449123b..27fac71324 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "access_config" { description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." @@ -128,6 +128,11 @@ variable "login_startup_scripts_timeout" { EOD type = number default = 300 + + validation { + condition = var.login_startup_scripts_timeout == 300 + error_message = "Changes to login_startup_scripts_timeout (default: 300s) are not respected, this is a known issue that will be fixed in a later release" + } } variable "cgroup_conf_tpl" { @@ -209,7 +214,7 @@ variable "enable_cleanup_compute" { placement groups) managed by this module, when cluster is destroyed. NOTE: Requires Python and pip packages listed at the following link: - https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt + https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt *WARNING*: Toggling this may impact the running workload. Deployed compute nodes may be destroyed and their jobs will be requeued. @@ -224,7 +229,7 @@ variable "enable_cleanup_subscriptions" { cluster is destroyed. 
NOTE: Requires Python and pip packages listed at the following link: - https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt + https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt *WARNING*: Toggling this may temporarily impact var.enable_reconfigure behavior. EOD @@ -408,6 +413,7 @@ variable "partition" { enable_spot_vm = bool group_name = string instance_template = string + maintenance_interval = string node_conf = map(string) reservation_name = string spot_instance_config = object({ @@ -547,7 +553,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 6aaf32455e..f915ec9723 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.28.1" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index e50e1baddf..8a897be889 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -38,7 +38,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -58,15 +58,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. 
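To make the hybrid requirement above concrete: the wrapper now tracks slurm-gcp 5.10.2, and the matching [requirements.txt] still has to be installed with pip on the machine that runs Terraform. The fragment below is a hypothetical direct call to the underlying module; both values shown are placeholders and the remaining required inputs are elided.

```hcl
# Sketch: keep the hybrid controller module ref aligned with the 5.10.2
# scripts/requirements.txt installed on the deploying machine.
module "slurm_controller_instance" {
  source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.10.2"

  project_id         = "my-project-id" # placeholder
  slurm_cluster_name = "hybrid0"       # placeholder

  # ... remaining required inputs elided ...
}
```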
-[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md -[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -146,10 +146,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer ## License @@ -181,7 +181,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.9.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.10.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index a3d30a1f24..411cec0dd0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.10.2" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 4833a999e5..f4d39e56a0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
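Both this login module and the controller above now default to the `slurm-gcp-5-10-hpc-centos-7` image family. The sketch below pins that family explicitly in a hypothetical wrapper call; the module label and source ref are illustrative and the unrelated required inputs are elided.

```hcl
# Sketch: state the 5.10 image family explicitly instead of relying on the
# new default. Per the input description, name and family are mutually
# exclusive, so set only one of them.
module "slurm_login" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v5-login?ref=v1.28.1&depth=1"

  instance_image = {
    project = "schedmd-slurm-public"
    family  = "slurm-gcp-5-10-hpc-centos-7"
  }
  instance_image_custom = false # only the known 5.10 families are accepted

  # ... deployment_name, controller wiring, networking, and other inputs elided ...
}
```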
-[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -28,9 +28,6 @@ This creates a Slurm login node which is: `use` * of VM machine type `n2-standard-4` -For a complete example using this module, see -[hpc-slurm.yaml](../../../../examples/hpc-slurm.yaml). - ## Custom Images For more information on creating valid custom images for the login node VM @@ -49,8 +46,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2#slurm-on-google-cloud-platform ## License @@ -85,8 +82,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.9.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.9.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.10.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.10.2 | ## Resources @@ -116,7 +113,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index 214019af31..9888a764d6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -50,7 +50,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.10.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -88,7 +88,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.10.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 0b43011968..709df950be 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "project_id" { type = string @@ -296,7 +296,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 20b0a31fdf..06cfaa93ae 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.27.0" + module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.28.1" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 2f161702bd..f7cc534b03 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,11 +11,11 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/6.2.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/6.2.0/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -87,8 +87,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -125,17 +125,17 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 | -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.2.0 | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.2.0 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.2.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.0 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.2.0 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.2.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.0 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.2.0 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.0 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.2.0 | -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.2.0 | +| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.4 | +| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.4 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.4 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.4 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.4 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.4 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.4 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.4 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.4 | 
+| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.4 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.4 | ## Resources @@ -177,28 +177,28 @@ limitations under the License. | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | | [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | -| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | +| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | +| [enable\_devel](#input\_enable\_devel) | Enables development mode. | `bool` | `true` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-3-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: is it used? should remove it?
mount_runner = map(string)
}))
| `[]` | no | -| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | +| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | | [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: is it used? should remove it?
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_project = optional(string)
# TODO: rename to subnetwork_self_link
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = optional(string, "")
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
}))
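With this change the `nodeset` object drops `subnetwork_project` and the optional `subnetwork` field in favor of a required `subnetwork_self_link`, and gains `additional_networks`, `access_config`, and `reservation_name`. In a blueprint these values are normally produced by the `schedmd-slurm-gcp-v6-nodeset` module and wired in with `use` rather than written by hand. A minimal sketch, assuming a VPC module with id `network` elsewhere in the blueprint (module ids and the machine type are illustrative, not taken from this diff):

```yaml
  - id: compute_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]  # supplies the now-required subnetwork_self_link
    settings:
      node_count_dynamic_max: 4
      machine_type: c2-standard-60
      # reservation_name: my-reservation  # optional field added in this release
```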
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
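`nodeset_tpu.subnetwork` likewise becomes required rather than defaulting to an empty string. A TPU nodeset is normally defined through the `schedmd-slurm-gcp-v6-nodeset-tpu` module; the sketch below is illustrative only, and the `node_type` and `tf_version` values are placeholders that should be checked against that module's documentation:

```yaml
  - id: tpu_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
    use: [network]  # supplies the now-required subnetwork
    settings:
      node_type: v3-8         # placeholder TPU type
      tf_version: 2.14.0      # placeholder TensorFlow version
      node_count_dynamic_max: 1
```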
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | | [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
default = optional(bool, false)
enable_job_exclusive = optional(bool, false)
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
partition_conf = optional(map(string), {})
partition_name = string
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
resume_timeout = optional(number)
suspend_time = optional(number, 300)
suspend_timeout = optional(number)
}))
| n/a | yes | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | @@ -211,8 +211,7 @@ limitations under the License. | [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | | [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 030b433f18..09061581e1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -35,7 +35,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.4" count = local.have_template ? 0 : 1 project_id = var.project_id @@ -76,8 +76,7 @@ module "slurm_controller_template" { source_image = local.source_image # requires source_image_logic.tf # spot = TODO: add support for spot (?) - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link + subnetwork = var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) # termination_action = TODO: add support for termination_action (?) @@ -93,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.4" access_config = !var.disable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false @@ -105,14 +104,15 @@ module "slurm_controller_instance" { slurm_cluster_name = local.slurm_cluster_name slurm_instance_role = "controller" static_ips = var.static_ips - subnetwork_project = var.subnetwork_project subnetwork = var.subnetwork_self_link zone = var.zone + metadata = var.metadata - metadata = var.metadata + labels = merge(local.labels, { + slurm_files_checksum = module.slurm_files.checksum + }) depends_on = [ - module.slurm_files, # Ensure nodes are destroyed before controller is module.cleanup_compute_nodes[0], ] @@ -145,24 +145,34 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { secret_id = google_secret_manager_secret.cloudsql[0].id role = "roles/secretmanager.secretAccessor" - member = "serviceAccount:${local.service_account[0].email}" + member = "serviceAccount:${local.service_account.email}" } # Destroy all compute nodes on `terraform destroy` module "cleanup_compute_nodes" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.4" count = var.enable_cleanup_compute ? 
1 : 0 slurm_cluster_name = local.slurm_cluster_name project_id = var.project_id when_destroy = true + + + depends_on = [ + # Depend on controller network, as a best effort to avoid + # subnetwork resourceInUseByAnotherResource error + # NOTE: Can not use nodeset subnetworks as "A static list expression is required" + var.subnetwork_self_link, + # Ensure VMs are destroyed before resource policies + module.cleanup_resource_policies[0], + ] } # Destroy all resource policies on `terraform destroy` module "cleanup_resource_policies" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.4" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index ee8d354670..fd5c83bf34 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.4" for_each = { for x in var.login_nodes : x.name_prefix => x @@ -52,7 +52,6 @@ module "slurm_login_template" { source_image_project = each.value.source_image_project source_image = each.value.source_image spot = each.value.spot - subnetwork_project = each.value.subnetwork_project subnetwork = each.value.subnetwork tags = concat([local.slurm_cluster_name], each.value.tags) termination_action = each.value.termination_action @@ -60,7 +59,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.4" for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 68ddbabcc4..56e0b688dc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -21,7 +21,7 @@ locals { # NODESET module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.4" for_each = local.nodeset_map project_id = var.project_id @@ -53,31 +53,33 @@ module "slurm_nodeset_template" { source_image_family = each.value.source_image_family source_image_project = each.value.source_image_project source_image = each.value.source_image - subnetwork = each.value.subnetwork + subnetwork = each.value.subnetwork_self_link + additional_networks = each.value.additional_networks + access_config = each.value.access_config tags = concat([local.slurm_cluster_name], 
each.value.tags) } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.4" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link enable_placement = each.value.enable_placement - enable_public_ip = each.value.enable_public_ip network_tier = each.value.network_tier node_count_dynamic_max = each.value.node_count_dynamic_max node_count_static = each.value.node_count_static nodeset_name = each.value.nodeset_name node_conf = each.value.node_conf - subnetwork_self_link = each.value.subnetwork + subnetwork_self_link = each.value.subnetwork_self_link zones = each.value.zones zone_target_shape = each.value.zone_target_shape + reservation_name = each.value.reservation_name } # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.4" for_each = local.nodeset_tpu_map project_id = var.project_id @@ -99,7 +101,7 @@ module "slurm_nodeset_tpu" { # PARTITION module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.4" for_each = local.partition_map partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 3593858220..15403688db 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.4" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index 8759a268cc..532749e7ba 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-1-debian-11", - "slurm-gcp-6-1-hpc-rocky-linux-8", - "slurm-gcp-6-1-ubuntu-2004-lts", - "slurm-gcp-6-1-ubuntu-2204-lts-arm64", - "slurm-gcp-6-1-hpc-centos-7-k80", - "slurm-gcp-6-1-hpc-centos-7" + "slurm-gcp-6-3-debian-11", + "slurm-gcp-6-3-hpc-rocky-linux-8", + "slurm-gcp-6-3-ubuntu-2004-lts", + "slurm-gcp-6-3-ubuntu-2204-lts-arm64", + "slurm-gcp-6-3-hpc-centos-7-k80", + "slurm-gcp-6-3-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 9fe20fac1e..80e0c996b9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -142,8 +142,7 @@ variable "login_nodes" { source_image_project = optional(string) source_image = optional(string) static_ips = optional(list(string), []) - subnetwork_project = optional(string) - subnetwork = optional(string) + subnetwork = string spot = optional(bool, false) tags = optional(list(string), []) zone = optional(string) @@ -185,7 +184,6 @@ variable "nodeset" { disk_type = optional(string) enable_confidential_vm = optional(bool, false) enable_placement = optional(bool, false) - enable_public_ip = optional(bool, false) enable_oslogin = optional(bool, true) enable_shielded_vm = optional(bool, false) gpu = optional(object({ @@ -213,14 +211,30 @@ variable "nodeset" { source_image_family = optional(string) source_image_project = optional(string) source_image = optional(string) - subnetwork_project = optional(string) - # TODO: rename to subnetwork_self_link - subnetwork = optional(string) + subnetwork_self_link = string + additional_networks = optional(list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + }))) + access_config = optional(list(object({ + nat_ip = string + network_tier = string + }))) spot = optional(bool, false) tags = optional(list(string), []) termination_action = optional(string) zones = optional(list(string), []) zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + reservation_name = optional(string) })) default = [] @@ -252,7 +266,7 @@ variable "nodeset_tpu" { zone = string data_disks = optional(list(string), []) docker_image = optional(string, "") - subnetwork = optional(string, "") + subnetwork = string service_account = optional(object({ email = optional(string) scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) @@ -306,13 +320,13 @@ EOD variable "enable_devel" { type = bool - description = "Enables development mode. Not for production use." - default = false + description = "Enables development mode." + default = true } variable "enable_debug_logging" { type = bool - description = "Enables debug logging mode. Not for production use." + description = "Enables debug logging mode." default = false } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 2d643f79de..df013e93c8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -244,7 +244,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-1-hpc-rocky-linux-8" + family = "slurm-gcp-6-3-hpc-rocky-linux-8" project = "schedmd-slurm-public" } @@ -284,12 +284,5 @@ variable "tags" { variable "subnetwork_self_link" { type = string - description = "Subnet to deploy to. Either network_self_link or subnetwork_self_link must be specified." - default = null -} - -variable "subnetwork_project" { - type = string - description = "The project that subnetwork belongs to." 
- default = null + description = "Subnet to deploy to." } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 94be8259e1..59f3cb746d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.28.1" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 3849dcb8f6..8fad372646 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -52,8 +52,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0#slurm-on-google-cloud-platform ## Requirements @@ -97,7 +97,7 @@ No modules. | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-3-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | @@ -113,8 +113,7 @@ No modules. | [service\_account](#input\_service\_account) | Service account to attach to the controller instance. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index 1964828cad..e894aed64f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -77,9 +77,7 @@ locals { static_ips = var.static_ips bandwidth_tier = var.bandwidth_tier - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link - - tags = var.tags + subnetwork = var.subnetwork_self_link + tags = var.tags } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml index 641832182d..13ea127b3c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + has_to_be_used: true diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index 8759a268cc..532749e7ba 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-1-debian-11", - "slurm-gcp-6-1-hpc-rocky-linux-8", - "slurm-gcp-6-1-ubuntu-2004-lts", - "slurm-gcp-6-1-ubuntu-2204-lts-arm64", - "slurm-gcp-6-1-hpc-centos-7-k80", - "slurm-gcp-6-1-hpc-centos-7" + "slurm-gcp-6-3-debian-11", + "slurm-gcp-6-3-hpc-rocky-linux-8", + "slurm-gcp-6-3-ubuntu-2004-lts", + "slurm-gcp-6-3-ubuntu-2204-lts-arm64", + "slurm-gcp-6-3-hpc-centos-7-k80", + "slurm-gcp-6-3-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 76957f5680..2d8b59e2b1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -276,7 +276,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-1-hpc-rocky-linux-8" + family = "slurm-gcp-6-3-hpc-rocky-linux-8" project = "schedmd-slurm-public" } @@ -315,12 +315,5 @@ variable "tags" { variable "subnetwork_self_link" { type = string - description = "Subnet to deploy to. Either network_self_link or subnetwork_self_link must be specified." - default = null -} - -variable "subnetwork_project" { - type = string - description = "The project that subnetwork belongs to." - default = null + description = "Subnet to deploy to." 
} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index a09f51df99..f1e679c49e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.28.1" } } diff --git a/community/modules/scripts/htcondor-install/files/autoscaler.py b/community/modules/scripts/htcondor-install/files/autoscaler.py index 40b7612e47..77bafa0310 100644 --- a/community/modules/scripts/htcondor-install/files/autoscaler.py +++ b/community/modules/scripts/htcondor-install/files/autoscaler.py @@ -253,17 +253,17 @@ def scale(self): current_target = responseGroupInfo["targetSize"] print(f"Current MIG target size: {current_target}") - being_born_states = ["CREATING", "RECREATING", "VERIFYING"] - being_born_filters = [ f"currentAction = \"{state}\"" for state in being_born_states ] - being_born_combined_filter = ' OR '.join(being_born_filters) - reqCreatingInstances = self.instanceGroupManagers.listManagedInstances( + # Find instances that are being modified by the MIG (currentAction is + # any value other than "NONE"). A common reason an instance is modified + # is it because it has failed a health check. + reqModifyingInstances = self.instanceGroupManagers.listManagedInstances( project=self.project, **self.zoneargs, instanceGroupManager=self.instance_group_manager, - filter=being_born_combined_filter, + filter="currentAction != \"NONE\"", orderBy="creationTimestamp desc" ) - respCreatingInstances = reqCreatingInstances.execute() + respModifyingInstances = reqModifyingInstances.execute() # Find VMs that are idle (no dynamic slots created from partitionable # slots) in the MIG handled by this autoscaler @@ -287,17 +287,19 @@ def scale(self): # their readiness to join pool (creating, unhealthy, healthy+idle) idle_nodes = OrderedDict() try: - creatingInstances = respCreatingInstances["managedInstances"] + modifyingInstances = respModifyingInstances["managedInstances"] except KeyError: - creatingInstances = [] + modifyingInstances = [] + + print(f"There are {len(modifyingInstances)} VMs being modified by the managed instance group") # there is potential for nodes in MIG health check "VERIFYING" state # to have already joined the pool and be running jobs - for instance in creatingInstances: + for instance in modifyingInstances: self_link = instance["instance"] node_name = self_link.rsplit("/", 1)[-1] if node_name not in claimed_nodes: - idle_nodes[self_link] = "creating" + idle_nodes[self_link] = "modifying" for ad in idle_node_ads: node = ad["Machine"].split(".")[0] @@ -311,7 +313,7 @@ def scale(self): idle_nodes[self_link] = "idle" n_idle = len(idle_nodes) - print(f"There are {n_idle} VMs being created or idle in the pool") + print(f"There are {n_idle} VMs being modified or idle in the pool") if self.debug > 1: print("Listing idle nodes:") pprint(idle_nodes) diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl index 8ec2701c3f..79941524dd 100644 --- a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl +++ 
b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl @@ -15,7 +15,7 @@ Remove-Item "$runtime_installer" # download HTCondor installer $htcondor_installer = 'C:\htcondor.msi' -%{ if condor_version == "10.*" } +%{ if condor_version == "23.*" } Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/23.0/current/condor-Windows-x64.msi -OutFile "$htcondor_installer" %{ else ~} Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/23.0/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile "$htcondor_installer" @@ -29,12 +29,17 @@ $args=$args + ' INSTALLDIR="C:\Condor"' Start-Process "msiexec.exe" -Wait -ArgumentList "$args" Remove-Item "$htcondor_installer" +# do not start HTCondor on boot by default. Allow startup script to download +# configuration first and then start HTCondor +Set-Service -StartupType Manual condor + # remove settings from condor_config that we want to override in configuration step Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^CONDOR_HOST' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^INSTALL_USER' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^DAEMON_LIST' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^use SECURITY' -NotMatch) +# install Python so that custom ClassAd hooks can execute $python_installer = 'C:\python-installer.exe' Invoke-WebRequest -Uri "https://www.python.org/ftp/python/3.11.4/python-3.11.4-amd64.exe" -OutFile "$python_installer" Start-Process -FilePath "$python_installer" -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1 Include_test=0' diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index 2af5b2218d..f0a50ccb2b 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 +77,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index 99af3cfa66..0958ebbb3c 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index f8d52c336c..61ead2de0d 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 16a0fcf29d..65afeafb81 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -72,7 +72,7 @@ locals { "destination" = "install_ramble.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 4) + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) bucket_name = "ramble-scripts-${local.bucket_md5}" runners = [local.install_ramble_deps_runner, local.install_ramble_runner, local.python_reqs_runner] @@ -94,7 +94,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 99a375d29a..5e789b2808 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index d432041841..e3706b674a 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 55314173fe..ec2bd4a38c 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -336,7 +336,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index af10cfc53e..2245718d75 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -79,7 +79,7 @@ locals { "destination" = "install_spack.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 4) + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) bucket_name = "spack-scripts-${local.bucket_md5}" runners = [local.install_spack_deps_runner, local.install_spack_runner] @@ -100,7 +100,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 7de489f570..1b227ea480 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.28.1" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 9f5cc3a6a9..2e0fd50b80 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.28.1" } required_version = ">= 0.14.0" diff --git a/docs/gpu-support.md b/docs/gpu-support.md index b5542e2359..c1aa4989a3 100644 --- a/docs/gpu-support.md +++ b/docs/gpu-support.md @@ -6,9 +6,9 @@ * HTCondor modules including [htcondor-install], [htcondor-setup] and [htcondor-execute-point]. * [omnia-install] -* Slurm on GCP modules where applicable, both version 4 and version 5 +* Slurm on GCP modules where applicable, both version 5 and version 6 * `schedmd-slurm-gcp-v5-*` - * `SchedMD-slurm-on-gcp-*` + * `schedmd-slurm-gcp-v6-*` * PBS Pro modules (`pbspro-*`) * Cloud Batch modules through custom instance templates @@ -69,10 +69,6 @@ resources. #### Interface Considerations -The Slurm on GCP v4 modules (`SchedMD-slurm-on-gcp-*`) have a different -interface for defining attached accelerators, `gpu_type` and `gpu_count`. These -must be set even if the machine type implies GPUs. - The Slurm on GCP v5 HPC Toolkit modules (`schedmd-slurm-gcp-v5-*`) have two variables that can be used to define attached GPUs. 
The variable `guest_accelerators` is the recommended option as it is consistent with other @@ -84,9 +80,8 @@ provides consistency with the underlying terraform modules from the As mentioned above, VMs with many guest accelerators can take longer to deploy. Slurm sets timeouts for creating VMs, and it's possible for high GPU -configurations to push past the default timeout. The timeout in the -Slurm on GCP v4 HPC Toolkit modules (`SchedMD-slurm-on-gcp-*`) cannot -be increased, therefore we recommend using the Slurm on GCP v5 modules. +configurations to push past the default timeout. We recommend using the Slurm on +GCP v5 modules. The v5 Toolkit modules (`schedmd-slurm-gcp-v5-*`) allow Slurm configuration timeouts to customized via the [cloud_parameters] variable on the [controller]. @@ -137,7 +132,7 @@ information, see the SchedMD documentation: * [srun Documentation](https://slurm.schedmd.com/srun.html) * [sbatch Documentation](https://slurm.schedmd.com/sbatch.html) -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp [cloud_parameters]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_cloud_parameters ## Further Reading diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 0fa3b5595f..5bd753d2e7 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 1b3f60a354..ada2606dea 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 1ab5f94d4b..037019e887 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. 
[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_controller\_hybrid]: https://github.com/SchedMD/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md ### NFS Mounts @@ -224,7 +224,7 @@ image created with slurm 21.08.8: node_count_dynamic_max: 20 instance_image: project: $(vars.project_id) - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 - id: compute-partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer -[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 46ce0064b1..e8ff335f8d 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -15,7 +15,7 @@ operating system and your HPC applications. A typical custom image workflow is: [images]: https://cloud.google.com/compute/docs/images [standard-os]: https://cloud.google.com/compute/docs/images/os-details -[slurm-images]: https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#public-image +[slurm-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#public-image ## Examples @@ -154,7 +154,7 @@ a subdirectory. > to Ansible playbooks by a relative path (`../ansible`) that will not be > downloaded. 
-[schedmd-packer]: https://github.com/SchedMD/slurm-gcp/tree/master/packer#readme +[schedmd-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/packer#readme For example, to address the issue noted above: @@ -168,7 +168,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/SchedMD/slurm-gcp//packer?ref=5.9.1&depth=1 + source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.10.2&depth=1 kind: packer settings: use_iap: true diff --git a/docs/module-guidelines.md b/docs/module-guidelines.md index 0c95cba054..e8339663d1 100644 --- a/docs/module-guidelines.md +++ b/docs/module-guidelines.md @@ -201,5 +201,8 @@ spec: ghpc: # [optional] # [optional] `inject_module_id`, if set, will inject blueprint # module id as a value for the module variable `var_name`. - inject_module_id: var_name + inject_module_id: var_name + # [optional] `has_to_be_used` is a boolean flag, if set to true, + # the creation will fail if the module is not used. + has_to_be_used: true ``` diff --git a/docs/network_storage.md b/docs/network_storage.md index eaea94de93..ebec65ef8a 100644 --- a/docs/network_storage.md +++ b/docs/network_storage.md @@ -98,19 +98,19 @@ The following is an example setting up a filestore using startup script: The following matrix shows the best method by which each type of network storage device should be mounted to each mount capable module. -  | Slurm V4 | Slurm V5 | Batch | vm-instance | Packer (client install) | HTCondor\* | PBS Pro\* --- | -- | -- | -- | -- | -- | -- | -- -filestore | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -nfs-server | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -cloud-storage-bucket (GCS) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -DDN EXAScaler lustre | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS** | via STARTUP | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing -  |   |   |   |   |   |   |   -filestore (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -nfs-server (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -DDN EXAScaler lustre (pre-existing) | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS (pre-existing) | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development -GCS FUSE (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | Needs Testing +  | Slurm V5 | Batch | vm-instance | Packer (client install) | HTCondor\* | PBS Pro\* +-- | -- | -- | -- | -- | -- | -- +filestore | via USE | via USE | via USE | via STARTUP | via USE | via USE +nfs-server | via USE | via USE | via USE | via STARTUP | via USE | via USE +cloud-storage-bucket (GCS)| via USE | via USE | via USE | via STARTUP | via USE | via USE +DDN EXAScaler lustre | via USE | via USE | via USE | Needs Testing | via USE | via USE +Intel DAOS** | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing +  |   |   |   |   |   |   +filestore (pre-existing) | via USE | via USE | via USE | via STARTUP | via USE | via USE +nfs-server (pre-existing) | via USE | via USE | via USE | via STARTUP | via USE | via USE +DDN EXAScaler lustre (pre-existing) | via USE | via USE | via USE | Needs Testing | via USE | via USE 
+Intel DAOS (pre-existing) | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development +GCS FUSE (pre-existing) | via USE | via USE | via USE | via STARTUP | via USE | Needs Testing - **via USE:** Client installation and mounting occur automatically when connected with the use field. See diff --git a/docs/slurm-troubleshooting.md b/docs/slurm-troubleshooting.md index 26446cf536..ae185a6e71 100644 --- a/docs/slurm-troubleshooting.md +++ b/docs/slurm-troubleshooting.md @@ -88,7 +88,7 @@ The solution here is to [request more of the specified quota](#gcp-quotas), `C2 CPUs` in the example above. Alternatively, you could switch the partition's [machine type][partition-machine-type], to one which has sufficient quota. -[partition-machine-type]: community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md#input_machine_type +[partition-machine-type]: community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md#input_machine_type #### Placement Groups (Slurm) @@ -109,7 +109,7 @@ resume.py ERROR: group operation failed: Requested minimum count of 6 VMs could One way to resolve this is to set [enable_placement][partition-enable-placement] to `false` on the partition in question. -[partition-enable-placement]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/compute/SchedMD-slurm-on-gcp-partition#input_enable_placement +[partition-enable-placement]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/compute/schedmd-slurm-gcp-v6-nodeset#input_enable_placement #### VMs Get Stuck in Status Staging When Using Placement Groups With vm-instance @@ -188,8 +188,8 @@ After creating the service account, it can be set via the [def-compute-sa]: https://cloud.google.com/compute/docs/access/service-accounts#default_service_account [slurm-on-gcp-ug]: https://goo.gle/slurm-gcp-user-guide -[slurm-on-gcp-con]: community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md -[slurm-on-gcp-login]: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md +[slurm-on-gcp-con]: community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +[slurm-on-gcp-login]: community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md ### Timeout Error / Startup Script Failure (Slurm V5) diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md index f56e99074c..66904bd0b7 100644 --- a/docs/tutorials/README.md +++ b/docs/tutorials/README.md @@ -5,19 +5,6 @@ Find the quickstart tutorial on [Google Cloud docs](https://cloud.google.com/hpc-toolkit/docs/quickstarts/slurm-cluster). -## Intel Select Tutorial - -Walks through deploying an HPC cluster that is based on the -[HPC virtual machine (VM) image][hpc-vm-image] and complies to the -[Intel Select Solution for Simulation and Modeling criteria][intel-select]. - -Click the button below to launch the Intel Select tutorial. 
- -[![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https%3A%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fhpc-toolkit&cloudshell_open_in_editor=docs%2Ftutorials%2Fintel-select%2Fhpc-cluster-intel-select.yaml&cloudshell_tutorial=docs%2Ftutorials%2Fintel-select%2Fintel-select.md) - -[hpc-vm-image]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[intel-select]: https://www.intel.com/content/www/us/en/products/solutions/select-solutions/hpc/simulation-modeling.html - ## HTCondor Tutorial Walk through deploying an HTCondor pool that supports jobs running inside Docker @@ -27,6 +14,8 @@ Click the button below to launch the HTCondor tutorial. [![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https%3A%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fhpc-toolkit&cloudshell_open_in_editor=community%2Fexamples%2Fhtc-htcondor.yaml&cloudshell_tutorial=docs%2Ftutorials%2Fhtcondor.md) +[hpc-vm-image]: https://cloud.google.com/compute/docs/instances/create-hpc-vm + ## SC-23 Tutorial [Blueprint](./sc23-tutorial/hcls-blueprint.yaml) used in the Supercomputing 2023 tutorial “Unlocking the potential of HPC in the Google Cloud with Open-Source Tools” @@ -61,11 +50,11 @@ modules relate to each other. ```mermaid graph TB - A(Virtual Private Cloud) + A(Virtual Private Cloud) C(Spack Install Script) D(Startup Scripts) E(Compute Partition) - F(Slurm Controller) + F(Slurm Controller) G(Slurm Login Node) B(Monitoring Dashboard) C --> D diff --git a/docs/tutorials/gromacs/spack-gromacs.md b/docs/tutorials/gromacs/spack-gromacs.md index c8719aaba7..67bd157926 100644 --- a/docs/tutorials/gromacs/spack-gromacs.md +++ b/docs/tutorials/gromacs/spack-gromacs.md @@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud. In this tutorial you will use the HPC Toolkit to: -* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on +* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on Google Cloud * Use [Spack](https://spack.io/) to install the Gromacs application and all of its dependencies @@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to: cluster * Tear down the cluster -Estimated time to complete: -The tutorial takes 2 hr. to complete, -of which 1.5 hr is for installing software -(without cache). +Estimated time to complete: +The tutorial takes 2 hr. to complete, +of which 1.5 hr is for installing software +(without cache). > **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min. @@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. It defines: -* the existing default network from your project +* a vpc network * a monitoring dashboard with metrics on your cluster * a definition of a custom Spack installation * a startup script that @@ -106,27 +106,13 @@ contains the terraform needed to deploy your cluster. ## Deploy the Cluster -Use the following commands to run terraform and deploy your cluster. +Use below command to deploy your cluster. ```bash -terraform -chdir=spack-gromacs/primary init -terraform -chdir=spack-gromacs/primary apply +./ghpc deploy spack-gromacs ``` -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. 
- -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 30 seconds. There should be regular status updates -in the terminal. - -If the `apply` is successful, a message similar to the following will be -displayed: - - - +After the deployment is finished, you should see below message. ```shell Apply complete! Resources: xx added, 0 changed, 0 destroyed. @@ -144,16 +130,16 @@ controller. This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-gromacs-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackgroma-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-gromacs-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackgroma-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-gromacs-controller` and `slurm-spack-gromacs-login0`. If you don't +`spackgroma-controller` and `spackgroma-login-login-001`. If you don't see your VMs make sure you have the correct project selected (top left). ```text @@ -167,7 +153,7 @@ Once the startup script has completed, connect to the login node. Use the following command to ssh into the login node from cloud shell: ```bash -gcloud compute ssh slurm-spack-gromacs-login0 --zone us-central1-c --project +gcloud compute ssh spackgroma-login-login-001 --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -191,7 +177,7 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `slurm-spack-gromacs-login0` +1. Click on the `SSH` button associated with the `spackgroma-login-login-001` instance. This will open a separate pop up window with a terminal into our newly @@ -213,7 +199,7 @@ Gromacs job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/gromacs/submit_gromacs.sh + sbatch /opt/apps/gromacs/submit_gromacs.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -227,7 +213,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. These will be named something like -`slurm-spack-gromacs-compute-0-0`. +`spackgroma-comput-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -247,7 +233,7 @@ about 5 minutes to run. Several files will have been generated in the `test_run/` folder you created. The `md.log` and `slurm-1.out` files have information on the run such as -performance. You can view these files by running the following commandsq on the +performance. 
You can view these files by running the following commands on the login node: ```bash @@ -285,7 +271,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -terraform -chdir=spack-gromacs/primary destroy -auto-approve +./ghpc destroy spack-gromacs ``` When complete you should see something like: diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index fe5bf475b1..014a0d5d9b 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: hpc_dash source: modules/monitoring/dashboard @@ -35,8 +35,8 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack - spack_ref: v0.19.0 + install_dir: /opt/apps/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -88,7 +88,7 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml @@ -103,26 +103,31 @@ deployment_groups: spack install fi - - id: controller-setup + - id: login-setup source: modules/scripts/startup-script settings: runners: + # remove lustre client temporary to avoid startup failure due to known + # issue. + - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo - $(spack-execute.spack_runner) - type: shell destination: setup_gromacs.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs - chmod -R a+rwX /apps/spack/var/spack/environments/gromacs - mkdir -p /apps/gromacs - chmod a+rwx /apps/gromacs - cd /apps/gromacs + mkdir -p /opt/apps/gromacs + cd /opt/apps/gromacs wget --no-verbose https://ftp.gromacs.org/pub/benchmarks/water_GMX50_bare.tar.gz tar xzf water_GMX50_bare.tar.gz - type: data - destination: /apps/gromacs/submit_gromacs.sh + destination: /opt/apps/gromacs/submit_gromacs.sh content: | #!/bin/bash #SBATCH -N 2 @@ -131,36 +136,44 @@ deployment_groups: # Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072 # Type can be 'pme' or 'rf' - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs # Check that gmx_mpi exists which gmx_mpi cd $SLURM_SUBMIT_DIR - cp /apps/gromacs/water-cut1.0_GMX50_bare/1536/* . + cp /opt/apps/gromacs/water-cut1.0_GMX50_bare/1536/* . 
scontrol show hostnames ${SLURM_JOB_NODELIST} > hostfile gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr mpirun -n 60 -hostfile hostfile -ppn 30 gmx_mpi mdrun -notunepme -dlb yes -v -resethway -noconfout -nsteps 4000 -s input.tpr + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + bandwidth_tier: gvnic_enabled + - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] settings: partition_name: compute - max_node_count: 20 + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition + - slurm_login settings: - controller_startup_script: $(controller-setup.startup_script) - login_node_count: 1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - slurm_controller + disable_controller_public_ips: false + login_startup_scripts_timeout: 21600 + login_startup_script: $(login-setup.startup_script) diff --git a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml b/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml deleted file mode 100644 index dfe2a9f276..0000000000 --- a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-intel-select - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-intel-select - region: us-central1 - zone: us-central1-c - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: startup-controller - source: modules/scripts/startup-script - settings: - runners: - - type: shell - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --prefix /apps --intel_compliance - destination: /var/tmp/install_intel_controller.sh - - - id: startup-compute - source: modules/scripts/startup-script - settings: - runners: - - type: shell - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --intel_comp_meta - destination: /var/tmp/install_intel_compute.sh - - # This debug_partition will work out of the box without requesting additional GCP quota. 
- - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - partition_name: debug - max_node_count: 4 - enable_placement: false - exclusive: false - machine_type: n2-standard-2 - - # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - partition_name: compute - max_node_count: 20 - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - debug_partition # debug partition will be default as it is listed first - - compute_partition - settings: - login_node_count: 1 - controller_startup_script: $(startup-controller.startup_script) - compute_startup_script: $(startup-compute.startup_script) - - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller - settings: - login_startup_script: $(startup-compute.startup_script) diff --git a/docs/tutorials/intel-select/intel-select.md b/docs/tutorials/intel-select/intel-select.md deleted file mode 100644 index 13956ea3de..0000000000 --- a/docs/tutorials/intel-select/intel-select.md +++ /dev/null @@ -1,242 +0,0 @@ -# HPC Toolkit Intel Select Solution Cluster Deployment - -HPC Toolkit is an open-source software offered by Google Cloud which makes it -easy for customers to deploy HPC environments on Google Cloud. - -This tutorial will walk you through deploying an HPC cluster that is based on -the [HPC virtual machine (VM) image](https://cloud.google.com/compute/docs/instances/create-hpc-vm) -and comply to the [Intel Select Solution for Simulation and Modeling criteria](https://www.intel.com/content/www/us/en/products/solutions/select-solutions/hpc/simulation-modeling.html). - -[Click here for more information](https://cloud.google.com/compute/docs/instances/create-intel-select-solution-hpc-clusters). - -## Select a Project - -Select a project in which to deploy an HPC cluster on Google . - - - -Once you have selected a project, click START. - -## Enable APIs & Permissions - -*Skip this step if you already ran this as part of a previous tutorial.* - -In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `terraform apply` -but you can save time by enabling them now by running: - - - -We also need to grant the default compute service account project edit access so -the slurm controller can perform actions such as auto-scaling. - - - -```bash -PROJECT_NUMBER=$(gcloud projects describe --format='value(projectNumber)') - -echo "granting roles/editor to $PROJECT_NUMBER-compute@developer.gserviceaccount.com" - -gcloud iam service-accounts enable --project $PROJECT_NUMBER-compute@developer.gserviceaccount.com - -gcloud projects add-iam-policy-binding --member=serviceAccount:$PROJECT_NUMBER-compute@developer.gserviceaccount.com --role=roles/editor -``` - -## Build the Toolkit Binary - -*Skip this step if you already ran this as part of a previous tutorial.* - -To build HPC Toolkit binary from source run: - -```bash -make -``` - -You should now have a binary named ghpc in the current directory. To verify the -build run: - -```bash -./ghpc --version -``` - -This should show you the version of the HPC Toolkit you are using. 
- -## Generate a Deployment - -This tutorial will use the blueprint docs/tutorials/intel-select/hpc-cluster-intel-select.yaml, which should be open in the Cloud Shell Editor (on the left). - -This file describes the cluster you will deploy. It contains: - -* a new network -* a filestore instance -* a custom startup script for the slurm controller -* a custom startup script for the slurm login and compute nodes -* a Slurm cluster with Intel software components pre-installed throughout - * a Slurm login node - * a Slurm controller - * several auto-scaling Slurm partitions - -Do you notice the difference between this blueprint and the hpc-slurm example? - -After you have inspected the file, use the ghpc binary to create a deployment folder by running: - -```bash -./ghpc create --vars project_id= docs/tutorials/intel-select/hpc-cluster-intel-select.yaml -``` - -> **_NOTE:_** The `--vars` argument is used to override `project_id` in the -> deployment variables. - -This will create a deployment directory named `hpc-intel-select/`, which -contains the terraform needed to deploy your cluster. - -## Deploy the Cluster - -Use the following commands to run terraform and deploy your cluster. - -```bash -terraform -chdir=hpc-intel-select/primary init -terraform -chdir=hpc-intel-select/primary apply -``` - -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. - -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 5 minutes. There should be regular status updates -in the terminal. - -If the `apply` is successful, a message similar to the following will be -displayed: - - - - -```shell -Apply complete! Resources: xx added, 0 changed, 0 destroyed. -``` - -## Waiting for the cluster to be configured - -Although the cluster has been successfully deployed, the startup scripts that -install the additional required software take time to complete. Typically, this -can be around 8 minutes on the controller and 2-3 minutes on the login and -compute nodes. - -If you see the following message when you SSH into the login node following the -instructions in the next step, you should logout and give more time for the -startup script to complete. - -> _`Slurm is currently being configured in the background`_ - -Running the following command will allow monitoring the startup scripts on the controller: - -```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-hpc-intel-select-controller | grep startup-script -``` - -And the login node: - -```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-hpc-intel-select-login0 | grep startup-script -``` - -The following line would indicate that the startup script completed on the controller: ->_`slurm-hpc-intel-select-controller google_metadata_script_runner: startup-script exit status 0`_ - -## Connecting to the login node - -Once the startup script has completed and Slurm reports readiness, connect to the login node. - -1. Open the following URL in a new tab. This will take you to `Compute Engine` > - `VM instances` in the Google Cloud Console: - - - - ```text - https://console.cloud.google.com/compute?project= - ``` - - - - -1. Click on the `SSH` button associated with the `slurm-hpc-small-login0` - instance. - - This will open a separate pop up window with a terminal into our newly created - Slurm login VM. 
- -## Run a Job on the Cluster - - **The commands below should be run on the login node.** - -1. Create a default ssh key to be able to ssh between nodes: - - ```shell - ssh-keygen -N '' -f ~/.ssh/id_rsa - cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys - chmod 0600 ~/.ssh/authorized_keys - ``` - -1. Execute the following commands to activate Intel software components and - allocate machines to run the Intel Cluster Checker: - -```shell -export PATH=/apps/intelpython3/bin/:/sbin:/bin:/usr/sbin:/usr/bin:$PATH -source /apps/clck/2019.10/bin/clckvars.sh -source /apps/psxe_runtime/linux/bin/psxevars.sh -salloc -N4 -p compute -``` - -This may take a minute while Slurm auto-scales to create the nodes. If you are -curious you can refresh the `Compute Engine` > `VM instances` page and see that -additional VMs have been created. - -If the allocation fails, try submitting the job to the debug partition, -by removing the `-p compute` parameter to `salloc`. The message `salloc: -PrologSlurmctld failed, job killed` most likely indicates that your project does -not have sufficient quota for C2 instances in your region. - -1. Once the allocation is complete, you will be presented with a shell. Run: - -```shell -clck -F intel_hpc_platform_compat-hpc-2018.0 -``` - -Notice this job took ~2-3 minutes to start, since all compute nodes have to install the packages at boot time. In a real production system, this would be part of the slurm image (which is also possible with the HPC Toolkit). - -Since we used the compute partition, the job ran on [Compute Optimized -instances](https://cloud.google.com/compute/docs/compute-optimized-machines), -using Intel 3.9 GHz Cascade Lake processors and with placement groups enabled. -Nodes will not be re-used across jobs and will be immediately destroyed after -the job is completed. - -The outputs of `clck` will be stored in `clck_execution_warnings.log` and `clck_results.log`. - -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. - -## Destroy the Cluster - -To avoid incurring ongoing charges we will want to destroy our cluster. Run the -following command in the cloud shell terminal (not in the pop-up): - -```bash -terraform -chdir=hpc-intel-select/primary destroy -auto-approve -``` - -When complete you should see something like: - -```shell -Destroy complete! Resources: xx destroyed. -``` - -> **_NOTE:_** If destroy is run before Slurm shut down the auto-scale nodes then -> they will be left behind and destroy may fail. In this case you can delete the -> VMs manually and rerun the destroy command above. - -## Tutorial Complete - - diff --git a/docs/tutorials/openfoam/spack-openfoam.md b/docs/tutorials/openfoam/spack-openfoam.md index b408ee15d4..fa3a29fd73 100644 --- a/docs/tutorials/openfoam/spack-openfoam.md +++ b/docs/tutorials/openfoam/spack-openfoam.md @@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud. In this tutorial you will use the HPC Toolkit to: -* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on +* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on Google Cloud * Use [Spack](https://spack.io/) to install the OpenFOAM application and all of its dependencies @@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to: cluster * Tear down the cluster -Estimated time to complete: -The tutorial takes 3 hr. to complete, -of which 2.5 hr is for installing software -(without cache). 
+Estimated time to complete: +The tutorial takes 3 hr. to complete, +of which 2.5 hr is for installing software +(without cache). > **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min. @@ -31,7 +31,7 @@ Once you have selected a project, click START. ## Enable APIs & Permissions In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `terraform apply` +deploy your HPC cluster. These will be caught when you perform `./ghpc create` but you can save time by enabling them now by running: @@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. It defines: -* the existing default network from your project +* a vpc network * a monitoring dashboard with metrics on your cluster * a definition of a custom Spack installation * a startup script that @@ -88,9 +88,6 @@ This file describes the cluster you will deploy. It defines: * a Slurm controller * An auto-scaling Slurm partition -[This diagram](https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/application_demo/docs/tutorials/application_demo.md#blueprint-diagram) -shows how the different modules relate to each other. - After you have inspected the file, use the ghpc binary to create a deployment folder by running: @@ -106,24 +103,19 @@ contains the terraform needed to deploy your cluster. ## Deploy the Cluster -Use the following commands to run terraform and deploy your cluster. +Use below command to deploy your cluster. ```bash -terraform -chdir=spack-openfoam/primary init -terraform -chdir=spack-openfoam/primary apply +./ghpc deploy spack-openfoam ``` -The `terraform apply` command will generate a _plan_ that describes the Google +You can also use below command to generate a _plan_ that describes the Google Cloud resources that will be deployed. -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 30 seconds. There should be regular status updates -in the terminal. - -If the `apply` is successful, a message similar to the following will be -displayed: +```bash +terraform -chdir=spack-openfoam/primary init +terraform -chdir=spack-openfoam/primary apply +``` @@ -144,16 +136,16 @@ controller. This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-openfoam-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackopenf-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-openfoam-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackopenf-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-openfoam-controller` and `slurm-spack-openfoam-login0`. If you don't +`spackopenf-controller` and `spackopenf-login-login-001`. If you don't see your VMs make sure you have the correct project selected (top left). ```text @@ -167,7 +159,7 @@ Once the startup script has completed, connect to the login node. 
Use the following command to ssh into the login node from cloud shell: ```bash -gcloud compute ssh slurm-spack-openfoam-login0 --zone us-central1-c --project +gcloud compute ssh spackopenf-login-login-001 --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -191,7 +183,7 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `slurm-spack-openfoam-login0` +1. Click on the `SSH` button associated with the `spackopenf-login-login-001` instance. This will open a separate pop up window with a terminal into our newly @@ -213,7 +205,7 @@ OpenFOAM job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/openfoam/submit_openfoam.sh + sbatch /opt/apps/openfoam/submit_openfoam.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -227,7 +219,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. These will be named something like -`slurm-spack-openfoam-compute-0-0`. +`spackopenf-comput-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -280,7 +272,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -terraform -chdir=spack-openfoam/primary destroy -auto-approve +./ghpc destroy spack-openfoam ``` When complete you should see something like: diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 4512c0446d..5b6635ff36 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: hpc_dash source: modules/monitoring/dashboard @@ -35,8 +35,8 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack - spack_ref: v0.19.0 + install_dir: /opt/apps/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -95,12 +95,12 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml - spack install gcc@9.3.0 %gcc@4.8.5 target=x86_64 - spack load gcc@9.3.0 %gcc@4.8.5 target=x86_64 + spack install gcc@9.3.0 %gcc@8.5.0 target=x86_64 + spack load gcc@9.3.0 %gcc@8.5.0 target=x86_64 spack compiler find --scope site if ! 
spack env list | grep -q openfoam; then @@ -110,28 +110,30 @@ deployment_groups: spack install fi - - id: controller-setup + - id: login-setup source: modules/scripts/startup-script settings: runners: + - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo - $(spack-execute.spack_runner) - type: shell destination: setup_openfoam.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate openfoam - chmod -R a+rwX /apps/spack/var/spack/environments/openfoam - mkdir -p /apps/openfoam - chmod a+rwx /apps/openfoam - type: data - destination: /apps/openfoam/submit_openfoam.sh + destination: /opt/apps/openfoam/submit_openfoam.sh content: | #!/bin/bash #SBATCH -N 2 #SBATCH --ntasks-per-node 30 - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate openfoam cd $SLURM_SUBMIT_DIR @@ -153,25 +155,36 @@ deployment_groups: mpirun -n 60 -npernode 30 -hostfile hostfile snappyHexMesh -overwrite -parallel mpirun -n 60 -npernode 30 -hostfile hostfile potentialFoam -parallel mpirun -n 60 -npernode 30 -hostfile hostfile simpleFoam -parallel + + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 2 + bandwidth_tier: gvnic_enabled + - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] settings: partition_name: compute - max_node_count: 20 + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition + - slurm_login settings: - controller_startup_script: $(controller-setup.startup_script) - login_node_count: 1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - slurm_controller + login_startup_script: $(login-setup.startup_script) + login_startup_scripts_timeout: 21600 + disable_controller_public_ips: false diff --git a/docs/tutorials/wrfv3/spack-wrfv3.md b/docs/tutorials/wrfv3/spack-wrfv3.md index 275af22cff..db39790670 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.md +++ b/docs/tutorials/wrfv3/spack-wrfv3.md @@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud. In this tutorial you will use the HPC Toolkit to: -* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on +* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on Google Cloud * Use [Spack](https://spack.io/) to install the Weather Research and Forecasting (WRF) Model application and all of its dependencies @@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to: cluster * Tear down the cluster -Estimated time to complete: -The tutorial takes 2 hr. to complete, -of which 1.5 hr is for installing software -(without cache). +Estimated time to complete: +The tutorial takes 2 hr. to complete, +of which 1.5 hr is for installing software +(without cache). 
> **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min. @@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. It defines: -* the existing default network from your project +* a vpc network * a monitoring dashboard with metrics on your cluster * a definition of a custom Spack installation * a startup script that @@ -84,7 +84,6 @@ This file describes the cluster you will deploy. It defines: * sets up a Spack environment including downloading an example input deck * places a submission script on a shared drive * a Slurm cluster - * a Slurm login node * a Slurm controller * An auto-scaling Slurm partition @@ -106,24 +105,18 @@ contains the terraform needed to deploy your cluster. ## Deploy the Cluster -Use the following commands to run terraform and deploy your cluster. +Use below command to deploy your cluster. ```bash -terraform -chdir=spack-wrfv3/primary init -terraform -chdir=spack-wrfv3/primary apply +./ghpc deploy spack-wrfv3 ``` -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. - -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 30 seconds. There should be regular status updates -in the terminal. +You can also use below command to generate a plan that describes the Google Cloud resources that will be deployed. -If the `apply` is successful, a message similar to the following will be -displayed: +```bash +terraform -chdir=spack-wrfv3/primary init +terraform -chdir=spack-wrfv3/primary apply +``` @@ -144,30 +137,30 @@ controller. This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-wrfv3-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackwrfv3-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-wrfv3-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackwrfv3-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-wrfv3-controller` and `slurm-spack-wrfv3-login0`. If you don't +`spackwrfv3-controller`. If you don't see your VMs make sure you have the correct project selected (top left). ```text https://console.cloud.google.com/compute?project= ``` -## Connecting to the login node +## Connecting to the controller node -Once the startup script has completed, connect to the login node. +Once the startup script has completed, connect to the controller node. -Use the following command to ssh into the login node from cloud shell: +Use the following command to ssh into the controller node from cloud shell: ```bash -gcloud compute ssh slurm-spack-wrfv3-login0 --zone us-central1-c --project +gcloud compute ssh spackwrfv3-controller --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -191,15 +184,15 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. 
Click on the `SSH` button associated with the `slurm-spack-wrfv3-login0` +1. Click on the `SSH` button associated with the `spackwrfv3-controller` instance. This will open a separate pop up window with a terminal into our newly - created Slurm login VM. + created Slurm controller VM. ## Run a Job on the Cluster - **The commands below should be run on the Slurm login node.** + **The commands below should be run on the Slurm controller node.** We will use the submission script (see line 122 of the blueprint) to submit a Weather Research and Forecasting (WRF) Model job. @@ -213,7 +206,7 @@ Weather Research and Forecasting (WRF) Model job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/wrfv3/submit_wrfv3.sh + sbatch /opt/apps/wrfv3/submit_wrfv3.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -227,7 +220,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. These will be named something like -`slurm-spack-wrfv3-compute-0-0`. +`spackwrfv3-compute-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -247,7 +240,7 @@ about 5 minutes to run. Several files will have been generated in the `test_run/` folder you created. The `rsl.out.0000` file has information on the run. You can view this file by -running the following command on the login node: +running the following command on the controller node: ```bash cat rsl.out.0000 @@ -268,9 +261,9 @@ https://console.cloud.google.com/monitoring/dashboards?project= **_NOTE:_** If you are accessing the login node terminal via a separate pop-up +> **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up > then make sure to call `exit` in the pop-up window. 
```bash @@ -280,7 +273,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -terraform -chdir=spack-wrfv3/primary destroy -auto-approve +./ghpc destroy spack-wrfv3 ``` When complete you should see something like: diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 921669ce33..d9af2e8695 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: hpc_dash source: modules/monitoring/dashboard @@ -35,8 +35,8 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack - spack_ref: v0.19.0 + install_dir: /opt/apps/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -88,7 +88,7 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml @@ -107,58 +107,62 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: + - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo - $(spack-execute.spack_runner) - type: shell destination: wrfv3_setup.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate wrfv3 - chmod -R a+rwX /apps/spack/var/spack/environments/wrfv3 - mkdir -p /apps/wrfv3 - chmod a+rwx /apps/wrfv3 - cd /apps/wrfv3 + chmod -R a+rwX /opt/apps/spack/var/spack/environments/wrfv3 + mkdir -p /opt/apps/wrfv3 + chmod a+rwx /opt/apps/wrfv3 + cd /opt/apps/wrfv3 wget --no-verbose https://www2.mmm.ucar.edu/wrf/bench/conus12km_v3911/bench_12km.tar.bz2 tar xjf bench_12km.tar.bz2 - type: data - destination: /apps/wrfv3/submit_wrfv3.sh + destination: /opt/apps/wrfv3/submit_wrfv3.sh content: | #!/bin/bash #SBATCH -N 2 #SBATCH --ntasks-per-node 30 - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate wrfv3 # Check that wrf.exe exists which wrf.exe cd $SLURM_SUBMIT_DIR - cp /apps/wrfv3/bench_12km/* . + cp /opt/apps/wrfv3/bench_12km/* . WRF=`spack location -i wrf` ln -s $WRF/run/* . 
scontrol show hostnames ${SLURM_JOB_NODELIST} > hostfile mpirun -n 60 -hostfile hostfile -ppn ${SLURM_NTASKS_PER_NODE} wrf.exe + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] settings: partition_name: compute - max_node_count: 20 - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition settings: + disable_controller_public_ips: false + controller_startup_scripts_timeout: 21600 controller_startup_script: $(controller-setup.startup_script) - login_node_count: 1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - slurm_controller diff --git a/docs/vm-images.md b/docs/vm-images.md index 5c95f9759e..89f2d87d05 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -313,10 +313,10 @@ These instructions apply to the following modules: * [schedmd-slurm-gcp-v5-login] * [schedmd-slurm-gcp-v5-node-group] -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5 -[slurm-gcp-packer]: https://github.com/SchedMD/slurm-gcp/tree/v5/packer -[slurm-gcp-images]: https://github.com/SchedMD/slurm-gcp/blob/v5/docs/images.md -[slurm-gcp-published-images]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/images.md#published-image-family +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 +[slurm-gcp-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5/packer +[slurm-gcp-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md +[slurm-gcp-published-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family [gcloud-compute-images]: https://cloud.google.com/sdk/gcloud/reference/compute/images/create [vm-instance]: ../modules/compute/vm-instance diff --git a/examples/README.md b/examples/README.md index 6acd823bde..15fc1bb50f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,21 +13,21 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] + * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] + * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml-) ![community-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml-) ![community-badge] - * [hpc-intel-select-slurm.yaml](#hpc-intel-select-slurmyaml-) ![community-badge] * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] - * [quantum-circuit-simulator.yaml](#quantum-circuit-simulatoryaml-) ![community-badge] + * 
[hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml-) ![community-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] - * [omnia-cluster.yaml](#omnia-clusteryaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-local-ssd.yaml](#hpc-slurm-local-ssdyaml--) ![community-badge] ![experimental-badge] * [hpc-gke.yaml](#hpc-gkeyaml--) ![community-badge] ![experimental-badge] * [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge] @@ -41,18 +41,16 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-slurm-chromedesktop.yaml](#hpc-slurm-chromedesktopyaml--) ![community-badge] ![experimental-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-legacy.yaml](#hpc-slurm-legacyyaml--) ![community-badge] ![deprecated-badge] - * [hpc-slurm-legacy-sharedvpc.yaml](#hpc-slurm-legacy-sharedvpcyaml--) ![community-badge] ![deprecated-badge] + * [omnia-cluster.yaml](#omnia-clusteryaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [Blueprint Schema](#blueprint-schema) * [Writing an HPC Blueprint](#writing-an-hpc-blueprint) * [Blueprint Boilerplate](#blueprint-boilerplate) * [Top Level Parameters](#top-level-parameters) * [Deployment Variables](#deployment-variables) * [Deployment Groups](#deployment-groups) -* [Variables](#variables) - * [Blueprint Variables](#blueprint-variables) - * [Literal Variables](#literal-variables) - * [Escape Variables](#escape-variables) +* [Variables and expressions](#variables-and-expressions) + * [Blueprint expressions](#blueprint-expressions) + * [Escape expressions](#escape-expressions) ## Instructions @@ -117,13 +115,11 @@ the experimental badge (![experimental-badge]). ### [hpc-slurm.yaml] ![core-badge] -> **Warning**: The variables `enable_reconfigure`, -> `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to -> `true`, require additional dependencies **to be installed on the system deploying the infrastructure**. +> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -264,6 +260,19 @@ to 256 [hpc-enterprise-slurm.yaml]: ./hpc-enterprise-slurm.yaml +### [hpc-slurm6-tpu.yaml] ![community-badge] ![experimental-badge] + +> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. +> +> ```shell +> # Install Python3 and run +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt +> ``` + +Creates an auto-scaling Slurm cluster with TPU nodes. 
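To make the TPU example above a little more concrete, here is a minimal sketch of how a TPU nodeset might be wired into a partition using the v6 modules referenced elsewhere in this document. The module paths follow the v6 naming convention, but the setting names and values shown are assumptions for illustration only — consult `hpc-slurm6-tpu.yaml` and the module READMEs for the exact options.

```yaml
  # Illustrative fragment, not the full blueprint: a TPU nodeset feeding a
  # dedicated partition, which would then be passed to the v6 controller via `use`.
  - id: tpu_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
    use: [network]
    settings:
      node_type: v3-8            # assumed TPU type; see the module README
      node_count_dynamic_max: 1  # assumed; adjust for your workload

  - id: tpu_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [tpu_nodeset]
    settings:
      partition_name: tpu
```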
+ +[hpc-slurm6-tpu.yaml]: ../community/examples/hpc-slurm6-tpu.yaml + ### [ml-slurm.yaml] ![core-badge] This blueprint provisions an HPC cluster running the Slurm scheduler with the @@ -569,6 +578,25 @@ For this example the following is needed in the selected region: [cae-slurm.yaml]: ../examples/cae/cae-slurm.yaml +### [hpc-build-slurm-image.yaml] ![community-badge] + +This blueprint demonstrates how to use HPC Toolkit to build a Slurm image on top +of an existing image, `hpc-rocky-linux-8` in the case of this example. + +The blueprint contains 3 groups: + +1. The first group creates a network and generates the scripts that will install + Slurm. This uses the Ansible Playbook contained in the + [Slurm on GCP](https://github.com/GoogleCloudPlatform/slurm-gcp) repo. +2. The second group executes the build using Packer to run the scripts from the + first group. This can take ~30 min and will generate a custom Slurm image in + your project. +3. The third group deploys a demo cluster that uses the newly built image. For a + real world use case the demo cluster can be swapped out for a more powerful + slurm cluster from other examples. + +[hpc-build-slurm-image.yaml]: ../community/examples/hpc-build-slurm-image.yaml + ### [hpc-slurm-ubuntu2004.yaml] ![community-badge] > **Warning**: The variables `enable_reconfigure`, @@ -577,11 +605,11 @@ For this example the following is needed in the selected region: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt > ``` Similar to the [hpc-slurm.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. -[Other operating systems] are supported by SchedMD for the the Slurm on GCP project and images are listed [here](https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed in this page been tested by the Cloud HPC Toolkit team. +[Other operating systems] are supported by SchedMD for the the Slurm on GCP project and images are listed [here](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed in this page been tested by the Cloud HPC Toolkit team. The cluster will support 2 partitions named `debug` and `compute`. The `debug` partition is the default partition and runs on smaller @@ -590,7 +618,7 @@ specifying in the `srun` command via the `--partition` flag. The `compute` partition runs on compute optimized nodes of type `cs-standard-60`. The `compute` partition may require additional quota before using. -[Other operating systems]: https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems +[Other operating systems]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems [hpc-slurm-ubuntu2004.yaml]: ../community/examples/hpc-slurm-ubuntu2004.yaml #### Quota Requirements for hpc-slurm-ubuntu2004.yaml @@ -609,17 +637,6 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for `compute` partition_ -### [hpc-intel-select-slurm.yaml] ![community-badge] - -This example provisions a Slurm cluster automating the [steps to comply to the -Intel Select Solutions for Simulation & Modeling Criteria][intelselect]. 
It is -more extensively discussed in a dedicated [README for Intel -examples][intel-examples-readme]. - -[hpc-intel-select-slurm.yaml]: ../community/examples/intel/hpc-intel-select-slurm.yaml -[intel-examples-readme]: ../community/examples/intel/README.md -[intelselect]: https://cloud.google.com/compute/docs/instances/create-intel-select-solution-hpc-clusters - ### [pfs-daos.yaml] ![community-badge] This example provisions a DAOS cluster with [managed instance groups][migs] for the servers and for clients. It is more extensively discussed in a dedicated [README for Intel @@ -648,27 +665,6 @@ examples][amd-examples-readme]. [AOCC]: https://developer.amd.com/amd-aocc/ [amd-examples-readme]: ../community/examples/AMD/README.md -### [quantum-circuit-simulator.yaml] ![community-badge] - -This blueprint provisions a [N1 series VM with NVIDIA T4 GPU accelerator][t4] -and compiles [qsim], a [Google Quantum AI][gqai]-developed tool that simulates -quantum circuits using CPUs and GPUs. The installation of qsim, the [CUDA -Toolkit][cudatk], and the [cuQuantum SDK][cqsdk] is fully automated but takes a -significant time (approx. 20 minutes). Once complete, a qsim example can be run -by connecting to the VM by SSH and running - -```shell -conda activate qsim -python /var/tmp/qsim-example.py -``` - -[gqai]: https://quantumai.google/ -[quantum-circuit-simulator.yaml]: ../community/examples/quantum-circuit-simulator.yaml -[t4]: https://cloud.google.com/compute/docs/gpus#nvidia_t4_gpus -[qsim]: https://quantumai.google/qsim -[cqsdk]: https://developer.nvidia.com/cuquantum-sdk -[cudatk]: https://developer.nvidia.com/cuda-toolkit - ### [client-google-cloud-storage.yaml] ![community-badge] ![experimental-badge] [client-google-cloud-storage.yaml]: ../community/examples/client-google-cloud-storage.yaml @@ -791,7 +787,9 @@ node scaling study of the Lignocellulose benchmark for Gromacs. [hpc-slurm-ramble-gromacs.yaml]: ../community/examples/hpc-slurm-ramble-gromacs.yaml -### [omnia-cluster.yaml] ![community-badge] ![experimental-badge] +### [omnia-cluster.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] + +_This blueprint has been deprecated and will be removed on August 1, 2024._ Creates a simple [Dell Omnia][omnia-github] provisioned cluster with an omnia-manager node that acts as the slurm manager and 2 omnia-compute nodes on @@ -912,7 +910,7 @@ tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs. For more information see: -* [Slurm on Google Cloud High Throughput documentation](https://github.com/SchedMD/slurm-gcp/blob/master/docs/htc.md) +* [Slurm on Google Cloud High Throughput documentation](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md) * [General Slurm High Throughput documentation](https://slurm.schedmd.com/high_throughput.html) [htc-slurm.yaml]: ../community/examples/htc-slurm.yaml @@ -981,62 +979,19 @@ See [README](../community/examples/flux-framework/README.md) [flux-cluster.yaml]: ../community/examples/flux-framework/flux-cluster.yaml -### [hpc-slurm-legacy.yaml] ![community-badge] ![deprecated-badge] - -Creates a Slurm cluster with tiered file systems for higher performance. It -connects to the default VPC of the project and creates two partitions and a -login node. - -File systems: - -* The homefs mounted at `/home` is a default "BASIC_HDD" tier filestore with - 1 TiB of capacity -* The projectsfs is mounted at `/projects` and is a high scale SSD filestore - instance with 10TiB of capacity. 
-* The scratchfs is mounted at `/scratch` and is a - [DDN Exascaler Lustre](../community/modules/file-system/DDN-EXAScaler/README.md) - file system designed for high IO performance. The capacity is ~10TiB. - -> **Warning**: The DDN Exascaler Lustre file system has a license cost as -> described in the pricing section of the -> [DDN EXAScaler Cloud Marketplace Solution](https://console.developers.google.com/marketplace/product/ddnstorage/). - -There are two partitions in this example: `low_cost` and `compute`. The -`low_cost` partition uses `n2-standard-4` VMs. This partition can be used for -debugging and workloads that do not require high performance. - -Similar to the small example, there is a -[compute partition](#compute-partition) that should be used for any performance -analysis. - -#### Quota Requirements for hpc-slurm-legacy.yaml - -For this example the following is needed in the selected region: - -* Cloud Filestore API: Basic HDD (Standard) capacity (GB) per region: **1,024 GB** -* Cloud Filestore API: High Scale SSD capacity (GB) per region: **10,240 GiB** - _min - quota request is 61,440 GiB_ -* Compute Engine API: Persistent Disk SSD (GB): **~14,050 GB** -* Compute Engine API: Persistent Disk Standard (GB): **~396 GB static + 20 - GB/node** up to 4596 GB -* Compute Engine API: N2 CPUs: **158** -* Compute Engine API: C2 CPUs: **8** for controller node and **60/node** active - in `compute` partition up to 12,008 -* Compute Engine API: Affinity Groups: **one for each job in parallel** - _only - needed for `compute` partition_ -* Compute Engine API: Resource policies: **one for each job in parallel** - - _only needed for `compute` partition_ - -[hpc-slurm-legacy.yaml]: ../community/examples/hpc-slurm-legacy.yaml - -### [hpc-slurm-legacy-sharedvpc.yaml] ![community-badge] ![deprecated-badge] +### [hpc-slurm-sharedvpc.yaml] ![community-badge] This blueprint demonstrates the use of the Slurm and Filestore modules in -the service project of an existing Shared VPC. Before attempting to deploy the +the service project of an existing Shared VPC. Before attempting to deploy the blueprint, one must first complete [initial setup for provisioning Filestore in -a Shared VPC service project][fs-shared-vpc]. - -[hpc-slurm-legacy-sharedvpc.yaml]: ../community/examples/hpc-slurm-legacy-sharedvpc.yaml +a Shared VPC service project][fs-shared-vpc]. Depending on how the shared VPC +was created one may have to perform a few additional manual steps to configure +the VPC. One may need to create firewall rules allowing SSH to be able to access +the controller and login nodes. Also since this blueprint doesn't use external +IPs for compute nodes, one must needs to [set up cloud nat][cloudnat] and +[set up iap][iap]. + +[hpc-slurm-sharedvpc.yaml]: ../community/examples/hpc-slurm-sharedvpc.yaml [fs-shared-vpc]: https://cloud.google.com/filestore/docs/shared-vpc ## Blueprint Schema @@ -1242,19 +1197,20 @@ default in the [modules](../modules/README.md) folder. To learn more about how to refer to a module in a blueprint file, please consult the [modules README file.](../modules/README.md) -## Variables +## Variables and expressions Variables can be used to refer both to values defined elsewhere in the blueprint and to the output and structure of other modules. -### Blueprint Variables +### Blueprint expressions -Variables in a blueprint file can refer to deployment variables or the outputs -of other modules. 
For deployment and module variables, the syntax is as follows: +Expressions in a blueprint file can refer to deployment variables or the outputs +of other modules. The entire expression is wrapped in `$()`. The syntax is as follows: ```yaml vars: zone: us-central1-a + num_nodes: 2 deployment_groups: - group: primary @@ -1268,43 +1224,23 @@ deployment_groups: settings: key1: $(vars.zone) key2: $(resource1.name) + # access nested fields + key3: $(resource1.nodes[0].private_ip) + # arithmetic expression + key4: $(vars.num_nodes + 5) + # string interpolation + key5: $(resource1.name)_$(vars.zone) + # multiline string interpolation + key6: | + #!/bin/bash + echo "Hello $(vars.project_id) from $(vars.region)" + # use a function, supported by Terraform + key7: $(jsonencode(resource1.config)) ``` -The variable is referred to by the source, either vars for deploment variables -or the module ID for module variables, followed by the name of the value being -referenced. The entire variable is then wrapped in “$()”. - -Currently, string interpolation with variables is not supported. - -### Literal Variables +### Escape expressions -Literal variables should only be used by those familiar -with the underlying module technology (Terraform or Packer); -Literal variables are occasionally needed when calling a function or other complex statements. For example, to JSON-encode network storage metadata: - -```yaml -metadata: - network_storage: ((jsonencode([module.appfs.network_storage]))) -``` - -Here the network1 module is referenced, the terraform module name is the same as -the ID in the blueprint file. From the module we can refer to it's underlying -variables as deep as we need, in this case the self_link for it's -primary_subnetwork. - -The entire text of the variable is wrapped in double parentheses indicating that -everything inside will be provided as is to the module. - -Whenever possible, blueprint variables are preferred over literal variables. -`ghpc` will perform basic validation making sure all blueprint variables are -defined before creating a deployment, making debugging quicker and easier. - -### Escape Variables - -Under circumstances where the variable notation conflicts with the content of a setting or string, for instance when defining a startup-script runner that uses a subshell like in the example below, a non-quoted backslash (`\`) can be used as an escape character. It preserves the literal value of the next character that follows: - -* `\$(not.bp_var)` evaluates to `$(not.bp_var)`. -* `\((not.literal_var))` evaluates to `((not.literal_var))`. +Under circumstances where the expression notation conflicts with the content of a setting or string, for instance when defining a startup-script runner that uses a subshell like in the example below, a non-quoted backslash (`\`) can be used as an escape character. It preserves the literal value of the next character that follows: `\$(not.bp_var)` evaluates to `$(not.bp_var)`. ```yaml deployment_groups: @@ -1312,12 +1248,6 @@ deployment_groups: modules: - id: resource1 source: path/to/module/1 - settings: - key1: \((not.literal_var)) ## Evaluates to "((not.literal_var))". - ... - - id: resource2 - source: path/to/module/2 - ... 
settings: key1: | #!/bin/bash diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 5da17f2777..7d2f6ee2cf 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -36,14 +36,14 @@ vars: # zone: europe-west4-b region: us-central1 zone: us-central1-a - # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family + # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public crd_instance_image: - family: slurm-gcp-5-9-debian-11 # must be Debian for CRD + family: slurm-gcp-5-10-debian-11 # must be Debian for CRD project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index fefe24cc2b..57acb9cf8c 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -23,9 +23,9 @@ vars: zone: us-central1-a gpu_zones: [us-central1-a, us-central1-b, us-central1-c, us-central1-f] slurm_image: - # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family + # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public # Set to true for active cluster reconfiguration. # Note that setting this option requires additional dependencies to be installed locally. @@ -89,16 +89,12 @@ deployment_groups: source: modules/file-system/filestore use: [network1] settings: - filestore_tier: BASIC_SSD - size_gb: 2560 # smallest size for BASIC_SSD local_mount: /home - id: projectsfs source: modules/file-system/filestore use: [network1] settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 # smallest size for HIGH_SCALE_SSD local_mount: /projects # This file system has an associated license cost. diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index 439870b8fe..59edc0586b 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -18,7 +18,7 @@ blueprint_name: hpc-slurm vars: project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-small + deployment_name: hpc-slurm region: us-central1 zone: us-central1-a @@ -28,53 +28,54 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - - id: network1 + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc + - id: network source: modules/network/vpc - id: homefs source: modules/file-system/filestore - use: [network1] + use: [network] settings: local_mount: /home - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 4 machine_type: n2-standard-2 + enable_placement: false # the default is: true - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - debug_node_group + - debug_nodeset settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true is_default: true - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - compute_node_group + - compute_nodeset settings: partition_name: compute - - id: h3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: h3_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 machine_type: h3-standard-88 @@ -84,30 +85,29 @@ deployment_groups: bandwidth_tier: gvnic_enabled - id: h3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - h3_node_group + - h3_nodeset settings: partition_name: h3 + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - - network1 + - network - debug_partition - compute_partition - h3_partition - homefs + - slurm_login settings: disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 4eeae609b5..3a11e001b3 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -59,8 +59,8 @@ deployment_groups: - scripts_for_image settings: source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-9-hpc-centos-7 + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-5-10-hpc-centos-7 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git 
a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index aa06aaddbb..00f77b0afb 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -135,8 +135,8 @@ deployment_groups: # w/o new VPC omit_external_ip: false source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-9-debian-11 + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-5-10-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 640913aba4..95e58c612e 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -154,6 +154,9 @@ deployment_groups: machine_type: c2-standard-60 task_count: 2 mpi_mode: true + instance_image: + family: batch-centos-7-official + project: batch-custom-image - id: batch-login source: modules/scheduler/batch-login-node diff --git a/examples/serverless-batch.yaml b/examples/serverless-batch.yaml index f7777dd1c5..c459b584ee 100644 --- a/examples/serverless-batch.yaml +++ b/examples/serverless-batch.yaml @@ -52,8 +52,8 @@ deployment_groups: runnable: "cat /sw/hello.txt" machine_type: n2-standard-4 instance_image: - family: hpc-rocky-linux-8 - project: cloud-hpc-image-public + family: batch-centos-7-official + project: batch-custom-image - id: batch-login source: modules/scheduler/batch-login-node diff --git a/go.mod b/go.mod index d4ec929b54..3cb98ce4eb 100644 --- a/go.mod +++ b/go.mod @@ -14,9 +14,9 @@ require ( github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.0 - github.com/zclconf/go-cty v1.14.1 + github.com/zclconf/go-cty v1.14.2 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20231120223509-83a465c0220f // indirect + google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -25,10 +25,10 @@ require ( github.com/fatih/color v1.16.0 github.com/go-git/go-billy/v5 v5.5.0 github.com/google/go-cmp v0.6.0 - github.com/hashicorp/terraform-exec v0.19.0 + github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.154.0 + google.golang.org/api v0.161.0 ) require ( @@ -37,29 +37,29 @@ require ( github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/go-logr/logr v1.3.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/googleapis/gax-go/v2 v2.12.0 // indirect - github.com/hashicorp/terraform-json v0.17.1 // indirect + github.com/hashicorp/terraform-json v0.19.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 // indirect - go.opentelemetry.io/otel 
v1.21.0 // indirect - go.opentelemetry.io/otel/metric v1.21.0 // indirect - go.opentelemetry.io/otel/trace v1.21.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.47.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0 // indirect + go.opentelemetry.io/otel v1.22.0 // indirect + go.opentelemetry.io/otel/metric v1.22.0 // indirect + go.opentelemetry.io/otel/trace v1.22.0 // indirect golang.org/x/mod v0.14.0 // indirect - golang.org/x/sync v0.5.0 // indirect + golang.org/x/sync v0.6.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.15.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20231120223509-83a465c0220f // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240102182953-50ed04b92917 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240116215550-a9fa1716bcac // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) require ( - cloud.google.com/go v0.110.10 // indirect + cloud.google.com/go v0.111.0 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect cloud.google.com/go/iam v1.1.5 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect @@ -73,7 +73,7 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/s2a-go v0.1.7 // indirect - github.com/google/uuid v1.4.0 // indirect + github.com/google/uuid v1.5.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect github.com/hashicorp/go-safetemp v1.0.0 // indirect @@ -95,14 +95,13 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.17.0 // indirect - golang.org/x/net v0.19.0 // indirect - golang.org/x/oauth2 v0.15.0 // indirect - golang.org/x/sys v0.15.0 + golang.org/x/crypto v0.18.0 // indirect + golang.org/x/net v0.20.0 // indirect + golang.org/x/oauth2 v0.16.0 // indirect + golang.org/x/sys v0.16.0 golang.org/x/text v0.14.0 // indirect - golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.8 // indirect - google.golang.org/grpc v1.59.0 // indirect - google.golang.org/protobuf v1.31.0 // indirect + google.golang.org/grpc v1.60.1 // indirect + google.golang.org/protobuf v1.32.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index b0504fffbc..256e294865 100644 --- a/go.sum +++ b/go.sum @@ -30,8 +30,8 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.110.10 h1:LXy9GEO+timppncPIAZoOj3l58LIU9k+kn48AN7IO3Y= -cloud.google.com/go v0.110.10/go.mod h1:v1OoFqYxiBkUrruItNM3eT4lLByNjxmJSV/xDKJNnic= +cloud.google.com/go v0.111.0 h1:YHLKNupSD1KqjDbQ3+LVdQ81h/UJbJyZG203cEfnQgM= +cloud.google.com/go v0.111.0/go.mod h1:0mibmpKP1TyOOFYQY5izo0LnT+ecvOQ0Sg3OdmMiNRU= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod 
h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -269,8 +269,8 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= -github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-test/deep v1.0.3 h1:ZrJSEWsXzPOxaZnFteGEfooLba+ju3FYIbOrS+rQd68= @@ -354,8 +354,8 @@ github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4= -github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= +github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.0.0-20220520183353-fd19c99a87aa/go.mod h1:17drOmN3MwGY7t0e+Ei9b45FFGA3fBs3x36SsCg1hq8= github.com/googleapis/enterprise-certificate-proxy v0.1.0/go.mod h1:17drOmN3MwGY7t0e+Ei9b45FFGA3fBs3x36SsCg1hq8= github.com/googleapis/enterprise-certificate-proxy v0.2.0/go.mod h1:8C0jb7/mgJe/9KK8Lm7X9ctZC2t60YyIpYEI16jx0Qg= @@ -384,17 +384,17 @@ github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mO github.com/hashicorp/go-version v1.6.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/hc-install v0.6.0 h1:fDHnU7JNFNSQebVKYhHZ0va1bC6SrPQ8fpebsvNr2w4= +github.com/hashicorp/hc-install v0.6.2 h1:V1k+Vraqz4olgZ9UzKiAcbman9i9scg9GgSt/U3mw/M= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/hcl/v2 v2.19.1 h1://i05Jqznmb2EXqa39Nsvyan2o5XyMowW5fnCKW5RPI= github.com/hashicorp/hcl/v2 v2.19.1/go.mod h1:ThLC89FV4p9MPW804KVbe/cEXoQ8NZEh+JtMeeGErHE= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d h1:g6kHlvZrFPFKeWRj5q/zyJA5gu7rlJGPf17h8hX7LHY= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d/go.mod h1:l8HcFPm9cQh6Q0KSWoYPiePqMvRFenybP1CH2MjKdlg= -github.com/hashicorp/terraform-exec v0.19.0 h1:FpqZ6n50Tk95mItTSS9BjeOVUb4eg81SpgVtZNNtFSM= -github.com/hashicorp/terraform-exec v0.19.0/go.mod 
h1:tbxUpe3JKruE9Cuf65mycSIT8KiNPZ0FkuTE3H4urQg= -github.com/hashicorp/terraform-json v0.17.1 h1:eMfvh/uWggKmY7Pmb3T85u86E2EQg6EQHgyRwf3RkyA= -github.com/hashicorp/terraform-json v0.17.1/go.mod h1:Huy6zt6euxaY9knPAFKjUITn8QxUFIe9VuSzb4zn/0o= +github.com/hashicorp/terraform-exec v0.20.0 h1:DIZnPsqzPGuUnq6cH8jWcPunBfY+C+M8JyYF3vpnuEo= +github.com/hashicorp/terraform-exec v0.20.0/go.mod h1:ckKGkJWbsNqFKV1itgMnE0hY9IYf1HoiekpuN0eWoDw= +github.com/hashicorp/terraform-json v0.19.0 h1:e9DBKC5sxDfiJT7Zoi+yRIwqLVtFur/fwK/FuE6AWsA= +github.com/hashicorp/terraform-json v0.19.0/go.mod h1:qdeBs11ovMzo5puhrRibdD6d2Dq6TyE/28JiU4tIQxk= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -493,8 +493,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/zclconf/go-cty v1.2.0/go.mod h1:hOPWgoHbaTUnI5k4D2ld+GRpFJSCe6bCM7m1q/N4PQ8= -github.com/zclconf/go-cty v1.14.1 h1:t9fyA35fwjjUMcmL5hLER+e/rEPqrbCK1/OSE4SI9KA= -github.com/zclconf/go-cty v1.14.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zclconf/go-cty v1.14.2 h1:kTG7lqmBou0Zkx35r6HJHUQTvaRPr5bIAf3AoHS0izI= +github.com/zclconf/go-cty v1.14.2/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b h1:FosyBZYxY34Wul7O/MSKey3txpPYyCqVO5ZyceuQJEI= github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b/go.mod h1:ZRKQfBXbGkpdV6QMzT3rU1kSTAnfu1dO8dPKjYprgj8= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= @@ -506,16 +506,17 @@ go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1 h1:SpGay3w+nEwMpfVnbqOLH5gY52/foP8RE8UzTZ1pdSE= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1/go.mod h1:4UoMYEZOC0yN/sPGH76KPkkU7zgiEWYWL9vwmbnTJPE= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 h1:aFJWCqJMNjENlcleuuOkGAPH82y0yULBScfXcIEdS24= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1/go.mod h1:sEGXWArGqc3tVa+ekntsN65DmVbVeW+7lTKTjZF3/Fo= -go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc= -go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= -go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4= -go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM= -go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc= -go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.47.0 h1:UNQQKPfTDe1J81ViolILjTKPr9WetKW6uei2hFgJmFs= 
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.47.0/go.mod h1:r9vWsPS/3AQItv3OSlEJ/E4mbrhUbbw18meOjArPtKQ= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0 h1:sv9kVfal0MK0wBMCOGr+HeJm9v803BkJxGrk2au7j08= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0/go.mod h1:SK2UL73Zy1quvRPonmOmRDiWk1KBV3LyIeeIxcEApWw= +go.opentelemetry.io/otel v1.22.0 h1:xS7Ku+7yTFvDfDraDIJVpw7XPyuHlB9MCiqqX5mcJ6Y= +go.opentelemetry.io/otel v1.22.0/go.mod h1:eoV4iAi3Ea8LkAEI9+GFT44O6T/D0GWAVFyZVCC6pMI= +go.opentelemetry.io/otel/metric v1.22.0 h1:lypMQnGyJYeuYPhOM/bgjbFM6WE44W1/T45er4d8Hhg= +go.opentelemetry.io/otel/metric v1.22.0/go.mod h1:evJGjVpZv0mQ5QBRJoBF64yMuOf4xCWdXjK8pzFvliY= +go.opentelemetry.io/otel/sdk v1.19.0 h1:6USY6zH+L8uMH8L3t1enZPR3WFEmSTADlqldyHtJi3o= +go.opentelemetry.io/otel/trace v1.22.0 h1:Hg6pPujv0XG9QaVbGOBVHunyuLcCC3jN7WEhPx83XD0= +go.opentelemetry.io/otel/trace v1.22.0/go.mod h1:RbbHXVqKES9QhzZq/fE5UnOSILqRt40a21sPw2He1xo= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -526,8 +527,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= -golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -621,8 +622,8 @@ golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= -golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -648,8 +649,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod 
h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.15.0 h1:s8pnnxNVzjWyrvYdFUQq5llS1PX2zhPXmccZv99h7uQ= -golang.org/x/oauth2 v0.15.0/go.mod h1:q48ptWNTY5XWf+JNten23lcvHpLJ0ZSxF5ttTHKVCAM= +golang.org/x/oauth2 v0.16.0 h1:aDkGMBSYxElaoP81NpoUoz2oo2R2wHdZpGToUxfyQrQ= +golang.org/x/oauth2 v0.16.0/go.mod h1:hqZ+0LWXsiVoZpeld6jVt06P3adbS2Uu911W1SsJv2o= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -665,8 +666,8 @@ golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -738,15 +739,15 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ 
-879,8 +880,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.154.0 h1:X7QkVKZBskztmpPKWQXgjJRPA2dJYrL6r+sYPRLj050= -google.golang.org/api v0.154.0/go.mod h1:qhSMkM85hgqiokIYsrRyKxrjfBeIhgl4Z2JmeRkYylc= +google.golang.org/api v0.161.0 h1:oYzk/bs26WN10AV7iU7MVJVXBH8oCPS2hHyBiEeFoSU= +google.golang.org/api v0.161.0/go.mod h1:0mu0TpK33qnydLvWqbImq2b1eQ5FHRSDCBzAxX9ZHyw= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -991,12 +992,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20231120223509-83a465c0220f h1:Vn+VyHU5guc9KjB5KrjI2q0wCOWEOIh0OEsleqakHJg= -google.golang.org/genproto v0.0.0-20231120223509-83a465c0220f/go.mod h1:nWSwAFPb+qfNJXsoeO3Io7zf4tMSfN8EA8RlDA04GhY= -google.golang.org/genproto/googleapis/api v0.0.0-20231120223509-83a465c0220f h1:2yNACc1O40tTnrsbk9Cv6oxiW8pxI/pXj0wRtdlYmgY= -google.golang.org/genproto/googleapis/api v0.0.0-20231120223509-83a465c0220f/go.mod h1:Uy9bTZJqmfrw2rIBxgGLnamc78euZULUBrLZ9XTITKI= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4 h1:DC7wcm+i+P1rN3Ff07vL+OndGg5OhNddHyTA+ocPqYE= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4/go.mod h1:eJVxU6o+4G1PSczBr85xmyvSNYAKvAYgkub40YGomFM= +google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 h1:nz5NESFLZbJGPFxDT/HCn+V1mZ8JGNoY4nUpmW/Y2eg= +google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917/go.mod h1:pZqR+glSb11aJ+JQcczCvgf47+duRuzNSKqE8YAQnV0= +google.golang.org/genproto/googleapis/api v0.0.0-20240102182953-50ed04b92917 h1:rcS6EyEaoCO52hQDupoSfrxI3R6C2Tq741is7X8OvnM= +google.golang.org/genproto/googleapis/api v0.0.0-20240102182953-50ed04b92917/go.mod h1:CmlNWB9lSezaYELKS5Ym1r44VrrbPUa7JTvw+6MbpJ0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240116215550-a9fa1716bcac h1:nUQEQmH/csSvFECKYRv6HWEyypysidKl2I6Qpsglq/0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240116215550-a9fa1716bcac/go.mod h1:daQN87bsDqDoe316QbbvX60nMoJQa4r6Ds0ZuoAe5yA= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1032,8 +1033,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc 
v1.59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk= -google.golang.org/grpc v1.59.0/go.mod h1:aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98= +google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= +google.golang.org/grpc v1.60.1/go.mod h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= @@ -1050,8 +1051,8 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= +google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/modules/README.md b/modules/README.md index 769920aa4e..9448af47e6 100644 --- a/modules/README.md +++ b/modules/README.md @@ -54,15 +54,12 @@ Modules that are still in development and less stable are labeled with the pool][htcondor-setup]. * **[pbspro-execution]** ![community-badge] ![experimental-badge] : Creates execution hosts for use in a PBS Professional cluster. -* **[SchedMD-slurm-on-gcp-partition]** ![community-badge] ![deprecated-badge] : Creates a partition - to be used by a [slurm-controller][schedmd-slurm-on-gcp-controller]. * **[notebook]** ![community-badge] ![experimental-badge] : Creates a Vertex AI Notebook. Primarily used for [FSI - MonteCarlo Tutorial][fsi-montecarlo-on-batch-tutorial]. [vm-instance]: compute/vm-instance/README.md [gke-node-pool]: ../community/modules/compute/gke-node-pool/README.md [gke-job-template]: ../community/modules/compute/gke-job-template/README.md -[schedmd-slurm-on-gcp-partition]: ../community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md [schedmd-slurm-gcp-v5-partition]: ../community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md [schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md [schedmd-slurm-gcp-v6-partition]: ../community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -77,7 +74,7 @@ Modules that are still in development and less stable are labeled with the * **[slurm-cloudsql-federation]** ![community-badge] ![experimental-badge] : Creates a [Google SQL Instance](https://cloud.google.com/sql/) meant to be - integrated with a [slurm-controller][schedmd-slurm-on-gcp-controller]. + integrated with a [slurm-controller][schedmd-slurm-gcp-v6-controller]. * **[bigquery-dataset]** ![community-badge] ![experimental-badge] : Creates a BQ dataset. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-montecarlo-on-batch-tutorial]. * **[bigquery-table]** ![community-badge] ![experimental-badge] : Creates a BQ @@ -200,10 +197,6 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca a client host for submitting jobs to a PBS Professional cluster. * **[pbspro-server]** ![community-badge] ![experimental-badge] : Creates a server host for operating a PBS Professional cluster. -* **[SchedMD-slurm-on-gcp-controller]** ![community-badge] ![deprecated-badge] : Creates a Slurm - controller node using [slurm-gcp]. -* **[SchedMD-slurm-on-gcp-login-node]** ![community-badge] ![deprecated-badge] : Creates a Slurm - login node using [slurm-gcp]. [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md @@ -216,11 +209,8 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[schedmd-slurm-on-gcp-controller]: ../community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md -[schedmd-slurm-on-gcp-login-node]: ../community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v4.2.1 -[slurm-gcp-version-5]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-version-6]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md @@ -235,9 +225,10 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca a startup script to install HTCondor and exports a list of required APIs * **[kubernetes-operations]** ![community-badge] ![experimental-badge] : Performs pre-defined operations on Kubernetes resources. -* **[omnia-install]** ![community-badge] ![experimental-badge] : Installs Slurm - via [Dell Omnia](https://github.com/dellhpc/omnia) onto a cluster of VMs - instances. +* **[omnia-install]** ![community-badge] ![experimental-badge] ![deprecated-badge] : + Installs Slurm via [Dell Omnia](https://github.com/dellhpc/omnia) onto a + cluster of VM instances. _This module has been deprecated and will be removed + on August 1, 2024_. * **[pbspro-preinstall]** ![community-badge] ![experimental-badge] : Creates a Cloud Storage bucket with PBS Pro RPM packages for use by PBS clusters. * **[pbspro-install]** ![community-badge] ![experimental-badge] : Creates a @@ -275,6 +266,11 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [spack-execute]: ../community/modules/scripts/spack-execute/README.md [wait-for-startup]: ../community/modules/scripts/wait-for-startup/README.md +> **_NOTE:_** Slurm V4 is deprecated. In case, you want to use V4 modules, please use +[ghpc-v1.27.0](https://github.com/GoogleCloudPlatform/hpc-toolkit/releases/tag/v1.27.0) +source code and build ghpc binary from this. This source code also contains +deprecated examples using V4 modules for your reference. 
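For readers following the note above, a minimal build sketch may help. This is an illustrative sequence only, not part of the patch: it assumes a workstation with `git`, Go, and `make` available, and that the repository's default `make` target builds the `ghpc` binary as it does in current releases; the tag name comes from the linked v1.27.0 release.

```bash
# Check out the last release that still ships the deprecated Slurm V4 modules
# and examples, then build ghpc from that source tree.
git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git
cd hpc-toolkit
git checkout v1.27.0

# The default make target builds the ghpc binary in the repository root.
make

# Smoke-test the freshly built binary.
./ghpc --help
```

Blueprints that still reference the V4 modules can then be created with this older binary, following the deprecated examples bundled in that tag.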
+ ## Module Fields ### ID (Required) @@ -301,7 +297,7 @@ repository: * Hosted on [GitHub](https://developer.hashicorp.com/terraform/language/modules/sources#github) * Google Cloud Storage [Buckets](https://developer.hashicorp.com/terraform/language/modules/sources#gcs-bucket) * Generic [git repositories](https://developer.hashicorp.com/terraform/language/modules/sources#generic-git-repository) - + when modules are in a subdirectory of the git repository, a special double-slash `//` notation can be required as described below @@ -437,7 +433,7 @@ are supported, `git::https://` for HTTPS or `git::git@github.com` for SSH. Additional formatting and features after `git::` are identical to that of the [GitHub Modules](#github-modules) described above. -##### Google Cloud Storage Modules +#### Google Cloud Storage Modules To use a Terraform module available in a Google Cloud Storage bucket, set the source to a URL with the special `gcs::` prefix, followed by a [GCS bucket object URL](https://cloud.google.com/storage/docs/request-endpoints#typical). diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 4728f60438..75558be86f 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,7 +185,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index 506722d115..1febd98c9d 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 0efb9dc3c4..7695d3f93a 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.1" } required_version = ">= 1.2.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index a026f56db5..eaee1766ff 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.27.0" + module_name = 
"blueprints/terraform/hpc-toolkit:filestore/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 169c30796b..8d0af26510 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index fd9da26b42..f211424223 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/network/vpc/main.tf b/modules/network/vpc/main.tf index ce40c0f399..a5322c8f02 100644 --- a/modules/network/vpc/main.tf +++ b/modules/network/vpc/main.tf @@ -15,8 +15,9 @@ */ locals { - network_name = var.network_name == null ? "${var.deployment_name}-net" : var.network_name - subnetwork_name = var.subnetwork_name == null ? "${var.deployment_name}-primary-subnet" : var.subnetwork_name + autoname = replace(var.deployment_name, "_", "-") + network_name = var.network_name == null ? "${local.autoname}-net" : var.network_name + subnetwork_name = var.subnetwork_name == null ? "${local.autoname}-primary-subnet" : var.subnetwork_name # define a default subnetwork for cases in which no explicit subnetworks are # defined in var.subnetworks diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 86f1251e8b..b9b33ccdea 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -31,6 +31,9 @@ images to internal projects. [shell]: #input_shell_scripts [ansible]: #input_ansible_playbooks [hpcimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm +[Image Builder]: ../../../examples/image-builder.yaml +[startup-script]: ../../../modules/scripts/startup-script +[examples README]: ../../../examples/README.md#image-builderyaml- [startup-metadata]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux ## Example blueprints @@ -48,16 +51,6 @@ The [Image Builder] blueprint demonstrates a solution that builds an image using Please review the [examples README] for usage instructions. -### Intel-Optimized Slurm Cluster - -The [Intel-Optimized] Slurm Cluster [blueprint](../../../community/examples/intel/hpc-intel-select-slurm.yaml) -adds the Intel compliance software on top of a Slurm on GCP image. - -[Image Builder]: ../../../examples/image-builder.yaml -[startup-script]: ../../../modules/scripts/startup-script -[examples README]: ../../../examples/README.md#image-builderyaml- -[Intel-Optimized]: ../../../community/examples/intel/README.md#intel-optimized-slurm-cluster - ## Order of execution The startup script specified in metadata executes in parallel with the other diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index f24b294c9c..8d343212c5 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -135,7 +135,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-job-template/main.tf b/modules/scheduler/batch-job-template/main.tf index 50d93462ae..f4e562b9f8 100644 --- a/modules/scheduler/batch-job-template/main.tf +++ b/modules/scheduler/batch-job-template/main.tf @@ -25,7 +25,7 @@ locals { tasks_per_node = var.task_count_per_node != null ? var.task_count_per_node : (var.mpi_mode ? 1 : null) job_template_contents = templatefile( - "${path.module}/templates/batch-job-base.json.tftpl", + "${path.module}/templates/batch-job-base.yaml.tftpl", { synchronized = var.mpi_mode runnable = var.runnable @@ -40,7 +40,7 @@ locals { ) job_id = var.job_id != null ? var.job_id : var.deployment_name - job_filename = var.job_filename != null ? var.job_filename : "cloud-batch-${local.job_id}.json" + job_filename = var.job_filename != null ? var.job_filename : "cloud-batch-${local.job_id}.yaml" job_template_output_path = "${path.root}/${local.job_filename}" subnetwork_name = var.subnetwork != null ? var.subnetwork.name : "default" diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index 506722d115..1febd98c9d 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl b/modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl deleted file mode 100644 index cb30e5abbf..0000000000 --- a/modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl +++ /dev/null @@ -1,49 +0,0 @@ -{ - "taskGroups": [{ - "taskSpec": { - "runnables": [%{ if synchronized } - { - "barrier": {} - },%{ endif } - { - "script": { - "text": ${jsonencode(runnable)} - } - }%{ if synchronized }, - { - "barrier": {} - }%{ endif } - ], - "volumes":[ - %{~ for index, vol in nfs_volumes ~} - { - "nfs":{ - "server":"${vol.server_ip}", - "remote_path": "${vol.remote_mount}" - }, - %{~ if vol.mount_options != "" && vol.mount_options != null ~} - "mount_options": "${vol.mount_options}", - %{~ endif ~} - "mount_path": "${vol.local_mount}" - }%{~ if index != (length(nfs_volumes) -1) },%{ endif } - %{~ endfor ~} - ] - }, - "taskCount":${task_count},%{ if tasks_per_node != null } - "taskCountPerNode": ${tasks_per_node},%{ endif } - "requireHostsFile": ${require_hosts_file}, - "permissiveSsh": ${permissive_ssh} - }]%{ if instance_template != null }, - "allocationPolicy": { - "instances": [{ - "instanceTemplate": "${instance_template}" - }] - }%{ endif }%{ if log_policy == "CLOUD_LOGGING" }, - "logsPolicy": { - "destination": 
"CLOUD_LOGGING" - }%{ endif }%{ if log_policy == "PATH" }, - "logsPolicy": { - "destination": "PATH", - "logsPath": ## Add logging path here - }%{ endif } -} diff --git a/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl b/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl new file mode 100644 index 0000000000..177c2ce29f --- /dev/null +++ b/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl @@ -0,0 +1,45 @@ +taskGroups: + - taskSpec: + runnables: + %{~ if synchronized ~} + - barrier: + name: "wait-for-node-startup" + %{~ endif ~} + - script: + text: ${indent(12, chomp(yamlencode(runnable)))} + %{~ if synchronized ~} + - barrier: + name: "wait-for-workload-to-complete" + %{~ endif ~} + %{~ if length(nfs_volumes) > 0 ~} + volumes: + %{~ for index, vol in nfs_volumes ~} + - nfs: + server: "${vol.server_ip}" + remotePath: "${vol.remote_mount}" + %{~ if vol.mount_options != "" && vol.mount_options != null ~} + mountOptions: "${vol.mount_options}" + %{~ endif ~} + mountPath: "${vol.local_mount}" + %{~ endfor ~} + %{~ endif ~} + taskCount: ${task_count} + %{~ if tasks_per_node != null ~} + taskCountPerNode: ${tasks_per_node} + %{~ endif ~} + requireHostsFile: ${require_hosts_file} + permissiveSsh: ${permissive_ssh} +%{~ if instance_template != null } +allocationPolicy: + instances: + - instanceTemplate: "${instance_template}" +%{~ endif } +%{~ if log_policy == "CLOUD_LOGGING" } +logsPolicy: + destination: "CLOUD_LOGGING" +%{ endif } +%{~ if log_policy == "PATH" } +logsPolicy: + destination: "PATH" + logsPath: ## Add logging path here +%{ endif } diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index 43469c7257..86c14f93ef 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. 
| Name | Source | Version |
|------|--------|---------|
-| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 |
+| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 |

## Resources

diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf
index f5f013cb9a..9f2cba6181 100644
--- a/modules/scheduler/batch-login-node/main.tf
+++ b/modules/scheduler/batch-login-node/main.tf
@@ -94,7 +94,7 @@ locals {
}

module "login_startup_script" {
-  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2"
+  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1"
  labels = local.labels
  project_id = var.project_id
  deployment_name = var.deployment_name
diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf
index 6844e61f9c..8859e492a0 100644
--- a/modules/scheduler/batch-login-node/versions.tf
+++ b/modules/scheduler/batch-login-node/versions.tf
@@ -22,7 +22,7 @@ terraform {
  }
}
provider_meta "google" {
-  module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.27.0"
+  module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.28.1"
}

required_version = ">= 0.14.0"
diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md
index dbad630070..36730850b9 100644
--- a/modules/scripts/startup-script/README.md
+++ b/modules/scripts/startup-script/README.md
@@ -104,12 +104,12 @@ and therefore must have access to GCS.
> `https://www.googleapis.com/auth/devstorage.read_only`.
>
> This is set as a default scope in the [vm-instance],
-> [SchedMD-slurm-on-gcp-login-node] and [SchedMD-slurm-on-gcp-controller]
+> [schedMD-slurm-on-gcp-login-node] and [schedMD-slurm-on-gcp-controller]
> modules

[vm-instance]: ../../compute/vm-instance/README.md
-[SchedMD-slurm-on-gcp-login-node]: ../../../community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md
-[SchedMD-slurm-on-gcp-controller]: ../../../community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md
+[schedMD-slurm-on-gcp-login-node]: ../../../community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md
+[schedMD-slurm-on-gcp-controller]: ../../../community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md

### Tracking startup script execution

@@ -130,6 +130,60 @@ To view outputs from a Linux startup script, run:
sudo journalctl -u google-startup-scripts.service
```

+### Monitoring Agent Installation
+
+This `startup-script` module has several options for installing a Google
+monitoring agent. There are two relevant settings: `install_stackdriver_agent`
+and `install_cloud_ops_agent`.
+
+The _Stackdriver Agent_, also called the _Legacy Cloud Monitoring Agent_, provides
+better performance under some HPC workloads. While official documentation
+recommends using the _Cloud Ops Agent_, it is recommended to use
+`install_stackdriver_agent` when performance is important.
+
+If an image or machine already has Cloud Ops Agent installed and you would like
+to instead use the Stackdriver Agent, the following script will remove the Cloud
+Ops Agent and install the Stackdriver Agent.
+ +```bash +# Remove Cloud Ops Agent +sudo systemctl stop google-cloud-ops-agent.service +sudo systemctl disable google-cloud-ops-agent.service +curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh +sudo bash add-google-cloud-ops-agent-repo.sh --uninstall +sudo bash add-google-cloud-ops-agent-repo.sh --remove-repo + +# Install Stackdriver Agent +curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh +sudo bash add-monitoring-agent-repo.sh --also-install +curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh +sudo bash add-logging-agent-repo.sh --also-install +sudo service stackdriver-agent start +``` + +You can test if one of the agents is running using the following commands: + +```bash +# For Cloud Ops Agent +$ sudo systemctl is-active google-cloud-ops-agent"*" +active +active +active +active + +# For Legacy Monitoring and Logging Agents +$ sudo service stackdriver-agent status +stackdriver-agent is running [ OK ] +$ sudo service google-fluentd status +google-fluentd is running [ OK ] +``` + +For official documentation see troubleshooting docs: + +- [Cloud Ops Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/troubleshoot-install-startup) +- [Legacy Monitoring Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/monitoring/troubleshooting) +- [Legacy Logging Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/logging/troubleshooting) + ### Example ```yaml @@ -171,7 +225,7 @@ they are able to do so by using the `gcs_bucket_path` as shown in the below exam source: ./modules/scripts/startup-script settings: gcs_bucket_path: gs://user-test-bucket/folder1/folder2 - install_cloud_ops_agent: true + install_stackdriver_agent: true - id: compute-cluster source: ./modules/compute/vm-instance @@ -238,7 +292,8 @@ No modules. | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | `null` | no | | [http\_proxy](#input\_http\_proxy) | Web (http and https) proxy configuration for pip, apt, and yum/dnf | `string` | `""` | no | | [install\_ansible](#input\_install\_ansible) | Run Ansible installation script if either set to true or unset and runner of type 'ansible-local' are used. | `bool` | `null` | no | -| [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | +| [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | +| [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. | `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. | `map(string)` | n/a | yes | | [prepend\_ansible\_installer](#input\_prepend\_ansible\_installer) | DEPRECATED. Use `install_ansible=false` to prevent ansible installation. 
| `bool` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | diff --git a/modules/scripts/startup-script/files/install_ansible.sh b/modules/scripts/startup-script/files/install_ansible.sh index ba75665eb9..7a619d5cc3 100644 --- a/modules/scripts/startup-script/files/install_ansible.sh +++ b/modules/scripts/startup-script/files/install_ansible.sh @@ -186,7 +186,7 @@ main() { fi # Create pip virtual environment for HPC Toolkit - ${python_path} -m venv "${venv_path}" + ${python_path} -m venv "${venv_path}" --copies venv_python_path=${venv_path}/bin/python3 # Upgrade pip if necessary diff --git a/modules/scripts/startup-script/files/install_cloud_ops_agent.sh b/modules/scripts/startup-script/files/install_monitoring_agent.sh similarity index 55% rename from modules/scripts/startup-script/files/install_cloud_ops_agent.sh rename to modules/scripts/startup-script/files/install_monitoring_agent.sh index fbafe17110..eb4bf899b8 100644 --- a/modules/scripts/startup-script/files/install_cloud_ops_agent.sh +++ b/modules/scripts/startup-script/files/install_monitoring_agent.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +set -e -o pipefail LEGACY_MONITORING_PACKAGE='stackdriver-agent' +LEGACY_MONITORING_SCRIPT_URL='https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh' LEGACY_LOGGING_PACKAGE='google-fluentd' +LEGACY_LOGGING_SCRIPT_URL='https://dl.google.com/cloudagents/add-logging-agent-repo.sh' + OPSAGENT_PACKAGE='google-cloud-ops-agent' OPSAGENT_SCRIPT_URL='https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh' +ops_or_legacy="${1:-legacy}" + fail() { echo >&2 "[$(date +'%Y-%m-%dT%H:%M:%S%z')] $*" exit 1 @@ -43,19 +49,30 @@ handle_debian() { grep "${OPSAGENT_PACKAGE} is installed" } - install_opsagent() { + install_with_retry() { MAX_RETRY=50 RETRY=0 - until [ ${RETRY} -eq ${MAX_RETRY} ] || curl -s "${OPSAGENT_SCRIPT_URL}" | bash -s -- --also-install; do + until [ ${RETRY} -eq ${MAX_RETRY} ] || curl -s "${1}" | bash -s -- --also-install; do RETRY=$((RETRY + 1)) - echo "WARNING: Cloud ops installation failed on try ${RETRY} of ${MAX_RETRY}" + echo "WARNING: Installation of ${1} failed on try ${RETRY} of ${MAX_RETRY}" sleep 5 done if [ $RETRY -eq $MAX_RETRY ]; then - echo "ERROR: Cloud ops installation was not successful after ${MAX_RETRY} attempts." + echo "ERROR: Installation of ${1} was not successful after ${MAX_RETRY} attempts." 
exit 1 fi } + + install_opsagent() { + install_with_retry "${OPSAGENT_SCRIPT_URL}" + } + + install_stackdriver_agent() { + install_with_retry "${LEGACY_MONITORING_SCRIPT_URL}" + install_with_retry "${LEGACY_LOGGING_SCRIPT_URL}" + service stackdriver-agent start + service google-fluentd start + } } handle_redhat() { @@ -79,7 +96,14 @@ handle_redhat() { } install_opsagent() { - curl -s https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh | bash -s -- --also-install + curl -s "${OPSAGENT_SCRIPT_URL}" | bash -s -- --also-install + } + + install_stackdriver_agent() { + curl -sS "${LEGACY_MONITORING_SCRIPT_URL}" | bash -s -- --also-install + curl -sS "${LEGACY_LOGGING_SCRIPT_URL}" | bash -s -- --also-install + service stackdriver-agent start + service google-fluentd start } } @@ -92,11 +116,29 @@ main() { fail "Unsupported platform." fi - if is_legacy_installed || is_opsagent_installed; then - fail "Legacy or Ops Agent is already installed." + # Handle cases that agent is already installed + if [[ -z "$(is_legacy_monitoring_installed)" && -n $(is_legacy_logging_installed) ]] || + [[ -n "$(is_legacy_monitoring_installed)" && -z $(is_legacy_logging_installed) ]]; then + fail "Bad state: legacy agent is partially installed" + elif [[ "${ops_or_legacy}" == "legacy" ]] && is_legacy_installed; then + echo "Legacy agent is already installed" + exit 0 + elif [[ "${ops_or_legacy}" != "legacy" ]] && is_opsagent_installed; then + echo "Ops agent is already installed" + exit 0 + elif is_legacy_installed || is_opsagent_installed; then + fail "Agent is already installed but does not match requested agent of ${ops_or_legacy}" fi - install_opsagent + # install agent + if [[ "${ops_or_legacy}" == "legacy" ]]; then + echo "Installing legacy monitoring agent (stackdriver)" + install_stackdriver_agent + else + echo "Installing cloud ops agent" + echo "WARNING: cloud ops agent may have a performance impact. Consider using legacy monitoring agent (stackdriver)." + install_opsagent + fi } main diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 152d63ed67..42cc8791ac 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -20,11 +20,16 @@ locals { } locals { - ops_agent_installer = var.install_cloud_ops_agent ? [{ - type = "shell" - source = "${path.module}/files/install_cloud_ops_agent.sh" - destination = "install_cloud_ops_agent_automatic.sh" - }] : [] + monitoring_agent_installer = ( + var.install_cloud_ops_agent || var.install_stackdriver_agent ? + [{ + type = "shell" + source = "${path.module}/files/install_monitoring_agent.sh" + destination = "install_monitoring_agent_automatic.sh" + args = var.install_cloud_ops_agent ? "ops" : "legacy" # install legacy (stackdriver) + }] : + [] + ) warnings = [ { @@ -84,7 +89,7 @@ locals { runners = concat( local.warnings, local.proxy_runner, - local.ops_agent_installer, + local.monitoring_agent_installer, local.ansible_installer, local.configure_ssh_runners, var.runners @@ -167,6 +172,13 @@ resource "google_storage_bucket_object" "scripts" { create = "10m" update = "10m" } + + lifecycle { + precondition { + condition = !(var.install_cloud_ops_agent && var.install_stackdriver_agent) + error_message = "Only one of var.install_stackdriver_agent or var.install_cloud_ops_agent can be set. Stackdriver is recommended for best performance." 
+ } + } } resource "local_file" "debug_file" { diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 17d609aba7..e4d56a5999 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -113,7 +113,13 @@ EOT } variable "install_cloud_ops_agent" { - description = "Run Google Ops Agent installation script if set to true." + description = "Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true." + type = bool + default = false +} + +variable "install_stackdriver_agent" { + description = "Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance." type = bool default = false } diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 42379ee359..eb900c7936 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.28.1" } required_version = ">= 0.14.0" diff --git a/pkg/config/config.go b/pkg/config/config.go index c5bab10c64..ffff6bf26f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -24,6 +24,7 @@ import ( "strings" "github.com/agext/levenshtein" + "github.com/hashicorp/hcl/v2" "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" "gopkg.in/yaml.v3" @@ -89,11 +90,10 @@ func (g DeploymentGroup) Kind() ModuleKind { // Module return the module with the given ID func (bp *Blueprint) Module(id ModuleID) (*Module, error) { var mod *Module - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { if m.ID == id { mod = m } - return nil }) if mod == nil { return nil, UnknownModuleError{id} @@ -101,25 +101,19 @@ func (bp *Blueprint) Module(id ModuleID) (*Module, error) { return mod, nil } -// SuggestModuleIDHint return a correct spelling of given ModuleID id if one -// is close enough (based on maxHintDist) -func (bp Blueprint) SuggestModuleIDHint(id ModuleID) (string, bool) { - clMod := "" - minDist := -1 - bp.WalkModules(func(m *Module) error { - dist := levenshtein.Distance(string(m.ID), string(id), nil) - if minDist == -1.0 || dist < minDist { - minDist = dist - clMod = string(m.ID) +func hintSpelling(s string, dict []string, err error) error { + best, minDist := "", maxHintDist+1 + for _, w := range dict { + d := levenshtein.Distance(s, w, nil) + if d < minDist { + best, minDist = w, d } - return nil - }) - - if clMod != "" && minDist <= maxHintDist { - return clMod, true } + if minDist <= maxHintDist { + return HintError{fmt.Sprintf("did you mean %q?", best), err} + } + return err - return "", false } // ModuleGroup returns the group containing the module @@ -252,6 +246,10 @@ type Blueprint struct { Vars Dict DeploymentGroups []DeploymentGroup `yaml:"deployment_groups"` TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` + + // Preserves the original values of `Vars` (as defined by the user), + // while `Vars` can mutate (add `labels`, evaluate values). 
+ origVars Dict } // DeploymentConfig is a container for the imported YAML data and supporting data for @@ -262,7 +260,7 @@ type DeploymentConfig struct { // ExpandConfig expands the yaml config in place func (dc *DeploymentConfig) ExpandConfig() error { - dc.Config.setGlobalLabels() + dc.Config.origVars = NewDict(dc.Config.Vars.Items()) // copy dc.Config.addKindToModules() if vars, err := dc.Config.evalVars(); err != nil { @@ -271,17 +269,29 @@ func (dc *DeploymentConfig) ExpandConfig() error { dc.Config.Vars = vars } + dc.expandBackends() + dc.combineLabels() + if err := validateBlueprint(dc.Config); err != nil { return err } - return dc.expand() -} + if err := dc.applyUseModules(); err != nil { + return err + } + + dc.applyGlobalVariables() -func (bp *Blueprint) setGlobalLabels() { - if !bp.Vars.Has("labels") { - bp.Vars.Set("labels", cty.EmptyObjectVal) + if err := validateInputsAllModules(dc.Config); err != nil { + return err + } + + if err := validateModulesAreUsed(dc.Config); err != nil { + return err } + + dc.Config.populateOutputs() + return nil } // ListUnusedModules provides a list modules that are in the @@ -308,7 +318,7 @@ func (m Module) ListUnusedModules() ModuleIDs { // GetUsedDeploymentVars returns a list of deployment vars used in the given value func GetUsedDeploymentVars(val cty.Value) []string { res := []string{} - for _, ref := range valueReferences(val) { + for ref := range valueReferences(val) { if ref.GlobalVar { res = append(res, ref.Name) } @@ -320,32 +330,28 @@ func GetUsedDeploymentVars(val cty.Value) []string { func (bp Blueprint) ListUnusedVariables() []string { // Gather all scopes where variables are used ns := map[string]cty.Value{ - "vars": bp.Vars.AsObject(), + "vars": bp.origVars.AsObject(), } - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { ns["module_"+string(m.ID)] = m.Settings.AsObject() - return nil }) for _, v := range bp.Validators { ns["validator_"+v.Validator] = v.Inputs.AsObject() } - // these variables are required or automatically added; var used = map[string]bool{ - "labels": true, - "deployment_name": true, + "deployment_name": true, // required => always used } for _, v := range GetUsedDeploymentVars(cty.ObjectVal(ns)) { used[v] = true } unused := []string{} - for k := range bp.Vars.Items() { + for _, k := range bp.origVars.Keys() { if _, ok := used[k]; !ok { unused = append(unused, k) } } - return unused } @@ -398,11 +404,10 @@ func (dc DeploymentConfig) ExportBlueprint(outputFilename string) error { // addKindToModules sets the kind to 'terraform' when empty. 
func (bp *Blueprint) addKindToModules() { - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { if m.Kind == UnknownKind { m.Kind = TerraformKind } - return nil }) } @@ -440,7 +445,7 @@ func checkModulesAndGroups(bp Blueprint) error { // validateModuleUseReferences verifies that any used modules exist and // are in the correct group -func validateModuleUseReferences(p modulePath, mod Module, bp Blueprint) error { +func validateModuleUseReferences(p ModulePath, mod Module, bp Blueprint) error { errs := Errors{} for iu, used := range mod.Use { errs.At(p.Use.At(iu), validateModuleReference(bp, mod, used)) @@ -449,17 +454,15 @@ func validateModuleUseReferences(p modulePath, mod Module, bp Blueprint) error { } func checkBackend(b TerraformBackend) error { - const errMsg = "can not use variables in terraform_backend block, got '%s=%s'" - // TerraformBackend.Type is typed as string, "simple" variables and HCL literals stay "as is". - if hasVariable(b.Type) { - return fmt.Errorf(errMsg, "type", b.Type) - } - if _, is := IsYamlExpressionLiteral(cty.StringVal(b.Type)); is { - return fmt.Errorf(errMsg, "type", b.Type) + err := errors.New("can not use expressions in terraform_backend block") + val, perr := parseYamlString(b.Type) + + if _, is := IsExpressionValue(val); is || perr != nil { + return err } return cty.Walk(b.Configuration.AsObject(), func(p cty.Path, v cty.Value) (bool, error) { if _, is := IsExpressionValue(v); is { - return false, fmt.Errorf("can not use variables in terraform_backend block") + return false, err } return true, nil }) @@ -487,7 +490,7 @@ func validateBlueprint(bp Blueprint) error { // SkipValidator marks validator(s) as skipped, // if no validator is present, adds one, marked as skipped. -func (dc *DeploymentConfig) SkipValidator(name string) error { +func (dc *DeploymentConfig) SkipValidator(name string) { if dc.Config.Validators == nil { dc.Config.Validators = []Validator{} } @@ -501,7 +504,6 @@ func (dc *DeploymentConfig) SkipValidator(name string) error { if !skipped { dc.Config.Validators = append(dc.Config.Validators, Validator{Validator: name, Skip: true}) } - return nil } // InputValueError signifies a problem with the blueprint name. 
@@ -636,12 +638,13 @@ func IsProductOfModuleUse(v cty.Value) []ModuleID { } // WalkModules walks all modules in the blueprint and calls the walker function -func (bp *Blueprint) WalkModules(walker func(*Module) error) error { +func (bp *Blueprint) WalkModules(walker func(ModulePath, *Module) error) error { for ig := range bp.DeploymentGroups { g := &bp.DeploymentGroups[ig] for im := range g.Modules { + p := Root.Groups.At(ig).Modules.At(im) m := &g.Modules[im] - if err := walker(m); err != nil { + if err := walker(p, m); err != nil { return err } } @@ -649,13 +652,21 @@ func (bp *Blueprint) WalkModules(walker func(*Module) error) error { return nil } +func (bp *Blueprint) WalkModulesSafe(walker func(ModulePath, *Module)) { + bp.WalkModules(func(p ModulePath, m *Module) error { + walker(p, m) + return nil + }) +} + // validate every module setting in the blueprint containing a reference -func validateModuleSettingReferences(p modulePath, m Module, bp Blueprint) error { +func validateModuleSettingReferences(p ModulePath, m Module, bp Blueprint) error { errs := Errors{} for k, v := range m.Settings.Items() { - for _, r := range valueReferences(v) { - // TODO: add a cty.Path suffix to the errors path for better location - errs.At(p.Settings.Dot(k), validateModuleSettingReference(bp, m, r)) + for r, rp := range valueReferences(v) { + errs.At( + p.Settings.Dot(k).Cty(rp), + validateModuleSettingReference(bp, m, r)) } } return errs.OrNil() @@ -672,50 +683,66 @@ func checkPackerGroups(groups []DeploymentGroup) error { return errs.OrNil() } -func (bp *Blueprint) evalVars() (Dict, error) { - // 0 - unvisited - // 1 - on stack - // 2 - done - used := map[string]int{} - res := Dict{} +func varsTopologicalOrder(vars Dict) ([]string, error) { + // 0, 1, 2 - unvisited, on stack, exited + used := map[string]int{} // default is 0 - unvisited + res := []string{} - // walk vars in reverse topological order, and evaluate them + // walk vars in reverse topological order var dfs func(string) error dfs = func(n string) error { used[n] = 1 // put on stack - v := bp.Vars.Get(n) - for _, ref := range valueReferences(v) { + v := vars.Get(n) + for ref, rp := range valueReferences(v) { + // TODO: instead of ref.Name render as a full reference + repr, p := ref.Name, Root.Vars.Dot(n).Cty(rp) + if !ref.GlobalVar { - return BpError{ - Root.Vars.Dot(n), - fmt.Errorf("non-global variable %q referenced in expression", ref.Name), - } + return BpError{p, fmt.Errorf("non-global variable %q referenced in expression", repr)} } + if used[ref.Name] == 1 { - return BpError{ - Root.Vars.Dot(n), - fmt.Errorf("cyclic dependency detected: %q -> %q", n, ref.Name), - } + return BpError{p, fmt.Errorf("cyclic dependency detected: %q -> %q", n, repr)} } + if used[ref.Name] == 0 { if err := dfs(ref.Name); err != nil { return err } } } - - used[n] = 2 // remove from stack and evaluate - ev, err := evalValue(v, Blueprint{Vars: res}) - res.Set(n, ev) - return err + used[n] = 2 // remove from stack and add to result + res = append(res, n) + return nil } - for n := range bp.Vars.Items() { + for n := range vars.Items() { if used[n] == 0 { // unvisited if err := dfs(n); err != nil { - return Dict{}, err + return nil, err } } } return res, nil } + +func (bp *Blueprint) evalVars() (Dict, error) { + order, err := varsTopologicalOrder(bp.Vars) + if err != nil { + return Dict{}, err + } + + res := map[string]cty.Value{} + ctx := hcl.EvalContext{ + Variables: map[string]cty.Value{}, + Functions: functions()} + for _, n := range order { + 
ctx.Variables["var"] = cty.ObjectVal(res) + ev, err := eval(bp.Vars.Get(n), &ctx) + if err != nil { + return Dict{}, BpError{Root.Vars.Dot(n), err} + } + res[n] = ev + } + return NewDict(res), nil +} diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 80602b5269..f582b50234 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -179,14 +179,8 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { testModuleInfo0 := modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{ - { - Name: "deployment_name", - Type: "string", - }, - { - Name: altProjectIDSetting, - Type: "string", - }, + {Name: "deployment_name", Type: cty.String}, + {Name: altProjectIDSetting, Type: cty.String}, }, Outputs: []modulereader.OutputInfo{ { @@ -220,13 +214,8 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { testModuleInfo2 := modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{ - { - Name: "deployment_name", - Type: "string", - }, - { - Name: matchingIntergroupName, - }, + {Name: "deployment_name", Type: cty.String}, + {Name: matchingIntergroupName}, }, Outputs: []modulereader.OutputInfo{}, } @@ -236,7 +225,7 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { Kind: TerraformKind, Source: testModuleSource0, Settings: NewDict(map[string]cty.Value{ - altProjectIDSetting: GlobalRef("project_id").AsExpression().AsValue(), + altProjectIDSetting: GlobalRef("project_id").AsValue(), }), Outputs: []modulereader.OutputInfo{ {Name: matchingIntergroupName}, @@ -250,7 +239,7 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { Source: testModuleSource1, Settings: NewDict(map[string]cty.Value{ matchingIntragroupName1: cty.StringVal("explicit-intra-value"), - matchingIntragroupName2: ModuleRef(mod0.ID, matchingIntragroupName2).AsExpression().AsValue(), + matchingIntragroupName2: ModuleRef(mod0.ID, matchingIntragroupName2).AsValue(), }), Use: ModuleIDs{mod0.ID}, } @@ -354,18 +343,27 @@ func (s *zeroSuite) TestListUnusedModules(c *C) { } } -func (s *MySuite) TestListUnusedVariables(c *C) { - dc := s.getDeploymentConfigForTest() - dc.applyGlobalVariables() - - unusedVars := dc.Config.ListUnusedVariables() - c.Assert(unusedVars, DeepEquals, []string{"project_id"}) - - dc = s.getMultiGroupDeploymentConfig() - dc.applyGlobalVariables() - - unusedVars = dc.Config.ListUnusedVariables() - c.Assert(unusedVars, DeepEquals, []string{"unused_key"}) +func (s *zeroSuite) TestListUnusedVariables(c *C) { + bp := Blueprint{ + Vars: NewDict(map[string]cty.Value{ + "deployment_name": cty.StringVal("green"), + "flathead_screw": cty.NumberIntVal(1), + "pony": cty.NumberIntVal(2), + "stripes": cty.NumberIntVal(3), + "zebra": MustParseExpression("var.pony + var.stripes").AsValue(), + }), + DeploymentGroups: []DeploymentGroup{{Modules: []Module{{ + Settings: NewDict(map[string]cty.Value{ + "circus": GlobalRef("pony").AsValue(), + }), + }}}}, + Validators: []Validator{{ + Inputs: NewDict(map[string]cty.Value{ + "savannah": GlobalRef("zebra").AsValue(), + })}}} + bp.origVars = NewDict(bp.Vars.Items()) + + c.Check(bp.ListUnusedVariables(), DeepEquals, []string{"flathead_screw"}) } func (s *zeroSuite) TestAddKindToModules(c *C) { @@ -683,22 +681,22 @@ func (s *zeroSuite) TestCheckBackends(c *C) { { // FAIL. Variable in defaults type b := TerraformBackend{Type: "$(vartype)"} - c.Check(check(b), ErrorMatches, ".*type.*vartype.*") + c.Check(check(b), NotNil) } { // FAIL. 
Variable in group backend type b := TerraformBackend{Type: "$(vartype)"} - c.Check(check(dummy, b), ErrorMatches, ".*type.*vartype.*") + c.Check(check(dummy, b), NotNil) } { // FAIL. Deployment variable in defaults type b := TerraformBackend{Type: "$(vars.type)"} - c.Check(check(b), ErrorMatches, ".*type.*vars\\.type.*") + c.Check(check(b), NotNil) } { // FAIL. HCL literal b := TerraformBackend{Type: "((var.zen))"} - c.Check(check(b), ErrorMatches, ".*type.*zen.*") + c.Check(check(b), NotNil) } { // OK. Not a variable @@ -708,13 +706,13 @@ func (s *zeroSuite) TestCheckBackends(c *C) { { // FAIL. Mid-string variable in defaults type b := TerraformBackend{Type: "hugs_$(vartype)_hugs"} - c.Check(check(b), ErrorMatches, ".*type.*vartype.*") + c.Check(check(b), NotNil) } { // FAIL. Variable in defaults configuration b := TerraformBackend{Type: "gcs"} - b.Configuration.Set("bucket", GlobalRef("trenta").AsExpression().AsValue()) - c.Check(check(b), ErrorMatches, ".*can not use variables.*") + b.Configuration.Set("bucket", GlobalRef("trenta").AsValue()) + c.Check(check(b), NotNil) } { // OK. handles nested configuration @@ -723,23 +721,23 @@ func (s *zeroSuite) TestCheckBackends(c *C) { Set("bucket", cty.StringVal("trenta")). Set("complex", cty.ObjectVal(map[string]cty.Value{ "alpha": cty.StringVal("a"), - "beta": GlobalRef("boba").AsExpression().AsValue(), + "beta": GlobalRef("boba").AsValue(), })) - c.Check(check(b), ErrorMatches, ".*can not use variables.*") + c.Check(check(b), NotNil) } } func (s *zeroSuite) TestSkipValidator(c *C) { { dc := DeploymentConfig{Config: Blueprint{Validators: nil}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "zebra", Skip: true}}) } { dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -748,7 +746,7 @@ func (s *zeroSuite) TestSkipValidator(c *C) { dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}, {Validator: "zebra"}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -757,7 +755,7 @@ func (s *zeroSuite) TestSkipValidator(c *C) { dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -767,7 +765,7 @@ func (s *zeroSuite) TestSkipValidator(c *C) { {Validator: "zebra"}, {Validator: "pony"}, {Validator: "zebra"}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "zebra", Skip: true}, {Validator: "pony"}, @@ -842,23 +840,23 @@ func (s *zeroSuite) TestValidateModuleSettingReference(c *C) { // FAIL. get global hint mod := ModuleID("var") unkModErr := UnknownModuleError{mod} - c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{"Did you mean \"vars\"?", unkModErr}), Equals, true) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{`did you mean "vars"?`, unkModErr}), Equals, true) // FAIL. 
get module ID hint mod = ModuleID("pkp") unkModErr = UnknownModuleError{mod} - c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("Did you mean \"%s\"?", string(pkr.ID)), unkModErr}), Equals, true) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("did you mean %q?", string(pkr.ID)), unkModErr}), Equals, true) // FAIL. get no hint mod = ModuleID("test") unkModErr = UnknownModuleError{mod} - c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("Did you mean \"%s\"?", string(pkr.ID)), unkModErr}), Equals, false) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("did you mean %q?", string(pkr.ID)), unkModErr}), Equals, false) c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), unkModErr), Equals, true) } func (s *zeroSuite) TestValidateModuleSettingReferences(c *C) { m := Module{ID: "m"} - m.Settings.Set("white", GlobalRef("zebra").AsExpression().AsValue()) + m.Settings.Set("white", GlobalRef("zebra").AsValue()) bp := Blueprint{} p := Root.Groups.At(0).Modules.At(0) @@ -942,4 +940,19 @@ func (s *zeroSuite) TestEvalVars(c *C) { c.Error(err, " should be BpError") } } + + { // Non-computable + vars := NewDict(map[string]cty.Value{ + "uro": MustParseExpression("DoesHalt(var.bo)").AsValue(), + "bo": cty.StringVal("01_10"), + }) + _, err := (&Blueprint{Vars: vars}).evalVars() + var berr BpError + if errors.As(err, &berr) { + c.Check(berr.Error(), Matches, ".*unsupported function.*DoesHalt.*") + c.Check(berr.Path.String(), Equals, "vars.uro") + } else { + c.Error(err, " should be BpError") + } + } } diff --git a/pkg/config/dict.go b/pkg/config/dict.go index 48dac765d1..6b0c349ed0 100644 --- a/pkg/config/dict.go +++ b/pkg/config/dict.go @@ -18,6 +18,7 @@ import ( "fmt" "github.com/zclconf/go-cty/cty" + "golang.org/x/exp/maps" ) // Dict maps string key to cty.Value. 
@@ -76,6 +77,10 @@ func (d *Dict) Items() map[string]cty.Value { return m } +func (d *Dict) Keys() []string { + return maps.Keys(d.m) +} + // AsObject returns Dict as cty.ObjectVal func (d *Dict) AsObject() cty.Value { return cty.ObjectVal(d.Items()) @@ -92,7 +97,7 @@ func (d Dict) IsZero() bool { func (d Dict) Eval(bp Blueprint) (Dict, error) { var res Dict for k, v := range d.Items() { - r, err := evalValue(v, bp) + r, err := bp.Eval(v) if err != nil { return Dict{}, fmt.Errorf("error while trying to evaluate %#v: %w", k, err) } diff --git a/pkg/config/dict_test.go b/pkg/config/dict_test.go index 9626eea62e..2c6cdcde58 100644 --- a/pkg/config/dict_test.go +++ b/pkg/config/dict_test.go @@ -89,7 +89,7 @@ func TestEval(t *testing.T) { } d := NewDict(map[string]cty.Value{ "abyss": cty.ObjectVal(map[string]cty.Value{ - "white": GlobalRef("zebra").AsExpression().AsValue(), + "white": GlobalRef("zebra").AsValue(), "green": cty.StringVal("grass"), })}) want := NewDict(map[string]cty.Value{ diff --git a/pkg/config/errors.go b/pkg/config/errors.go index d4dcdeb248..3af20a874d 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -35,6 +35,20 @@ func (e BpError) Unwrap() error { return e.Err } +// PosError is an error wrapper to augment Position +type PosError struct { + Pos Pos + Err error +} + +func (e PosError) Error() string { + return fmt.Sprintf("line %d column %d: %s", e.Pos.Line, e.Pos.Column, e.Err) +} + +func (e PosError) Unwrap() error { + return e.Err +} + // HintError wraps another error to suggest other values type HintError struct { Hint string @@ -149,11 +163,9 @@ const ( errMsgFileLoadError = string("failed to read the input yaml") errMsgYamlMarshalError = string("failed to export the configuration to a blueprint yaml file") errMsgYamlSaveError = string("failed to write the expanded yaml") - errMsgMissingSetting = string("a required setting is missing from a module") errMsgInvalidVar = string("invalid variable definition in") errMsgVarNotFound = string("could not find source of variable") errMsgIntergroupOrder = string("references to outputs from other groups must be to earlier groups") - errMsgNoOutput = string("output not found for a variable") errMsgCannotUsePacker = string("Packer modules cannot be used by other modules") errMsgDuplicateGroup = string("group names must be unique") errMsgDuplicateID = string("module IDs must be unique") diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 665d37b616..261159b9ec 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -17,13 +17,11 @@ package config import ( "errors" "fmt" - "regexp" - "strings" "hpc-toolkit/pkg/modulereader" - "github.com/agext/levenshtein" "github.com/zclconf/go-cty/cty" + "github.com/zclconf/go-cty/cty/convert" "golang.org/x/exp/maps" "golang.org/x/exp/slices" ) @@ -33,48 +31,15 @@ const ( deploymentLabel string = "ghpc_deployment" ) -var ( - // Checks if a variable exists only as a substring, ex: - // Matches: "a$(vars.example)", "word $(vars.example)", "word$(vars.example)", "$(vars.example)" - // Doesn't match: "\$(vars.example)", "no variable in this string" - anyVariableExp *regexp.Regexp = regexp.MustCompile(`(^|[^\\])\$\((.*?)\)`) - simpleVariableExp *regexp.Regexp = regexp.MustCompile(`^\$\((.*)\)$`) -) - -// expand expands variables and strings in the yaml config. Used directly by -// ExpandConfig for the create and expand commands. 
-func (dc *DeploymentConfig) expand() error { - dc.expandBackends() - dc.combineLabels() - - if err := dc.applyUseModules(); err != nil { - return err - } - - if err := dc.applyGlobalVariables(); err != nil { - return err - } - - if err := validateInputsAllModules(dc.Config); err != nil { - return err - } - - dc.Config.populateOutputs() - return nil -} - func validateInputsAllModules(bp Blueprint) error { errs := Errors{} - for ig, g := range bp.DeploymentGroups { - for im, m := range g.Modules { - p := Root.Groups.At(ig).Modules.At(im) - errs.Add(validateModuleInputs(p, m, bp)) - } - } + bp.WalkModulesSafe(func(p ModulePath, m *Module) { + errs.Add(validateModuleInputs(p, *m, bp)) + }) return errs.OrNil() } -func validateModuleInputs(mp modulePath, m Module, bp Blueprint) error { +func validateModuleInputs(mp ModulePath, m Module, bp Blueprint) error { mi := m.InfoOrDie() errs := Errors{} for _, input := range mi.Inputs { @@ -82,17 +47,62 @@ func validateModuleInputs(mp modulePath, m Module, bp Blueprint) error { if !m.Settings.Has(input.Name) { if input.Required { - errs.At(ip, fmt.Errorf("%s: Module ID: %s Setting: %s", - errMsgMissingSetting, m.ID, input.Name)) + errs.At(ip, fmt.Errorf("a required setting %q is missing from a module %q", input.Name, m.ID)) } continue } - // TODO: Check set value and input dtypes convertability + errs.At(ip, checkInputValueMatchesType(m.Settings.Get(input.Name), input, bp)) } return errs.OrNil() } +func attemptEvalModuleInput(val cty.Value, bp Blueprint) (cty.Value, bool) { + v, err := bp.Eval(val) + // there could be a legitimate reasons for it. + // e.g. use of modules output or unsupported (by ghpc) functions + // TODO: + // * substitute module outputs with an UnknownValue + // * skip if uses functions with side-effects, e.g. `file` + // * add implementation of all pure terraform functions + // * add positive selection for eval-errors to bubble up + return v, err == nil +} + +func checkInputValueMatchesType(val cty.Value, input modulereader.VarInfo, bp Blueprint) error { + v, ok := attemptEvalModuleInput(val, bp) + if !ok || input.Type == cty.NilType { + return nil // skip, can do nothing + } + // cty does panic on some edge cases, e.g. (cty.NilVal) + // we don't anticipate any of those, but just in case, catch panic and swallow it + defer func() { recover() }() + // TODO: consider returning error (not panic) or logging warning + if _, err := convert.Convert(v, input.Type); err != nil { + return fmt.Errorf("unsuitable value for %q: %w", input.Name, err) + } + return nil +} + +func validateModulesAreUsed(bp Blueprint) error { + used := map[ModuleID]bool{} + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { + for ref := range valueReferences(m.Settings.AsObject()) { + used[ref.Module] = true + } + }) + + errs := Errors{} + bp.WalkModulesSafe(func(p ModulePath, m *Module) { + if m.InfoOrDie().Metadata.Ghpc.HasToBeUsed && !used[m.ID] { + errs.At(p.ID, HintError{ + "you need to add it to the `use`-block of downstream modules", + fmt.Errorf("module %q was not used", m.ID)}) + } + }) + return errs.OrNil() +} + func (dc *DeploymentConfig) expandBackends() { // 1. 
DEFAULT: use TerraformBackend configuration (if supplied) in each // resource group @@ -121,8 +131,8 @@ func (dc *DeploymentConfig) expandBackends() { } } -func getModuleInputMap(inputs []modulereader.VarInfo) map[string]string { - modInputs := make(map[string]string) +func getModuleInputMap(inputs []modulereader.VarInfo) map[string]cty.Type { + modInputs := make(map[string]cty.Type) for _, input := range inputs { modInputs[input.Name] = input.Type } @@ -178,14 +188,12 @@ func useModule(mod *Module, use Module) { // skip settings that are not of list type, but already have a value // these were probably added by a previous call to this function - isList := strings.HasPrefix(inputType, "list") + isList := inputType.IsListType() if alreadySet && !isList { continue } - v := AsProductOfModuleUse( - ModuleRef(use.ID, setting).AsExpression().AsValue(), - use.ID) + v := AsProductOfModuleUse(ModuleRef(use.ID, setting).AsValue(), use.ID) if !isList { mod.Settings.Set(setting, v) @@ -198,7 +206,7 @@ func useModule(mod *Module, use Module) { // applyUseModules applies variables from modules listed in the "use" field // when/if applicable func (dc *DeploymentConfig) applyUseModules() error { - return dc.Config.WalkModules(func(m *Module) error { + return dc.Config.WalkModules(func(_ ModulePath, m *Module) error { for _, u := range m.Use { used, err := dc.Config.Module(u) if err != nil { // should never happen @@ -234,9 +242,8 @@ func (dc *DeploymentConfig) combineLabels() { gl := mergeMaps(defaults, vars.Get(labels).AsValueMap()) vars.Set(labels, cty.ObjectVal(gl)) - dc.Config.WalkModules(func(mod *Module) error { + dc.Config.WalkModulesSafe(func(_ ModulePath, mod *Module) { combineModuleLabels(mod, *dc) - return nil }) } @@ -246,7 +253,7 @@ func combineModuleLabels(mod *Module, dc DeploymentConfig) { return // no op } - ref := GlobalRef(labels).AsExpression().AsValue() + ref := GlobalRef(labels).AsValue() set := mod.Settings.Get(labels) if !set.IsNull() { @@ -271,7 +278,7 @@ func mergeMaps(ms ...map[string]cty.Value) map[string]cty.Value { return r } -func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { +func (bp Blueprint) applyGlobalVarsInModule(mod *Module) { mi := mod.InfoOrDie() for _, input := range mi.Inputs { // Module setting exists? Nothing more needs to be done. @@ -281,8 +288,7 @@ func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { // If it's not set, is there a global we can use? if bp.Vars.Has(input.Name) { - ref := GlobalRef(input.Name) - mod.Settings.Set(input.Name, ref.AsExpression().AsValue()) + mod.Settings.Set(input.Name, GlobalRef(input.Name).AsValue()) continue } @@ -290,14 +296,13 @@ func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { mod.Settings.Set(input.Name, cty.StringVal(string(mod.ID))) } } - return nil } // applyGlobalVariables takes any variables defined at the global level and // applies them to module settings if not already set. 
-func (dc *DeploymentConfig) applyGlobalVariables() error { - return dc.Config.WalkModules(func(mod *Module) error { - return dc.Config.applyGlobalVarsInModule(mod) +func (dc *DeploymentConfig) applyGlobalVariables() { + dc.Config.WalkModulesSafe(func(_ ModulePath, m *Module) { + dc.Config.applyGlobalVarsInModule(m) }) } @@ -313,10 +318,11 @@ func AutomaticOutputName(outputName string, moduleID ModuleID) string { func validateModuleReference(bp Blueprint, from Module, toID ModuleID) error { to, err := bp.Module(toID) if err != nil { - if hint, ok := bp.SuggestModuleIDHint(toID); ok { - return HintError{fmt.Sprintf("Did you mean \"%s\"?", hint), err} - } - return err + mods := []string{} + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { + mods = append(mods, string(m.ID)) + }) + return hintSpelling(string(toID), mods, err) } if to.Kind == PackerKind { @@ -341,15 +347,20 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error // simplest case to evaluate is a deployment variable's existence if r.GlobalVar { if !bp.Vars.Has(r.Name) { - return fmt.Errorf("module %#v references unknown global variable %#v", mod.ID, r.Name) + err := fmt.Errorf("module %#v references unknown global variable %#v", mod.ID, r.Name) + return hintSpelling(r.Name, bp.Vars.Keys(), err) } return nil } if err := validateModuleReference(bp, mod, r.Module); err != nil { var unkModErr UnknownModuleError - if errors.As(err, &unkModErr) && levenshtein.Distance(string(unkModErr.ID), "vars", nil) <= 2 { - return HintError{"Did you mean \"vars\"?", unkModErr} + if errors.As(err, &unkModErr) { + hints := []string{"vars"} + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { + hints = append(hints, string(m.ID)) + }) + return hintSpelling(string(unkModErr.ID), hints, unkModErr) } return err } @@ -358,21 +369,17 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error if err != nil { return err } - found := slices.ContainsFunc(mi.Outputs, func(o modulereader.OutputInfo) bool { return o.Name == r.Name }) - if !found { - return fmt.Errorf("%s: module %s did not have output %s", errMsgNoOutput, tm.ID, r.Name) - } - return nil -} -// isSimpleVariable checks if the entire string is just a single variable -func isSimpleVariable(str string) bool { - return simpleVariableExp.MatchString(str) -} + outputs := []string{} + for _, o := range mi.Outputs { + outputs = append(outputs, o.Name) + } -// hasVariable checks to see if any variable exists in a string -func hasVariable(str string) bool { - return anyVariableExp.MatchString(str) + if !slices.Contains(outputs, r.Name) { + err := fmt.Errorf("module %q does not have output %q", tm.ID, r.Name) + return hintSpelling(r.Name, outputs, err) + } + return nil } // FindAllIntergroupReferences finds all intergroup references within the group @@ -390,7 +397,7 @@ func (dg DeploymentGroup) FindAllIntergroupReferences(bp Blueprint) []Reference func FindIntergroupReferences(v cty.Value, mod Module, bp Blueprint) []Reference { g := bp.ModuleGroupOrDie(mod.ID) res := []Reference{} - for _, r := range valueReferences(v) { + for r := range valueReferences(v) { if !r.GlobalVar && bp.ModuleGroupOrDie(r.Module).Name != g.Name { res = append(res, r) } @@ -401,15 +408,14 @@ func FindIntergroupReferences(v cty.Value, mod Module, bp Blueprint) []Reference // find all intergroup references and add them to source Module.Outputs func (bp *Blueprint) populateOutputs() { refs := map[Reference]bool{} - bp.WalkModules(func(m *Module) error { + 
bp.WalkModulesSafe(func(_ ModulePath, m *Module) { rs := FindIntergroupReferences(m.Settings.AsObject(), *m, *bp) for _, r := range rs { refs[r] = true } - return nil }) - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { for r := range refs { if r.Module != m.ID { continue // find IGC references pointing to this module @@ -424,7 +430,6 @@ func (bp *Blueprint) populateOutputs() { }) } - return nil }) } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 4b91c0f80f..c27c0ed39f 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -23,11 +23,6 @@ import ( . "gopkg.in/check.v1" ) -func (s *MySuite) TestExpand(c *C) { - dc := s.getDeploymentConfigForTest() - c.Check(dc.expand(), IsNil) -} - func (s *MySuite) TestExpandBackends(c *C) { dc := s.getDeploymentConfigForTest() deplName := dc.Config.Vars.Get("deployment_name").AsString() @@ -77,11 +72,8 @@ func (s *zeroSuite) TestUseModule(c *C) { ID: "UsedModule", Source: "usedSource", } - varInfoNumber := modulereader.VarInfo{ - Name: "val1", - Type: "number", - } - ref := ModuleRef("UsedModule", "val1").AsExpression().AsValue() + varInfoNumber := modulereader.VarInfo{Name: "val1", Type: cty.Number} + ref := ModuleRef("UsedModule", "val1").AsValue() { // Pass: No Inputs, No Outputs mod := Module{ID: "lime", Source: "modSource"} @@ -152,7 +144,7 @@ func (s *zeroSuite) TestUseModule(c *C) { { // Pass: Single Input/Output match, input is list, not already set mod := Module{ID: "lime", Source: "limeTree"} setTestModuleInfo(mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{Name: "val1", Type: "list"}}, + Inputs: []modulereader.VarInfo{{Name: "val1", Type: cty.List(cty.Number)}}, }) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{{Name: "val1"}}, @@ -169,7 +161,7 @@ func (s *zeroSuite) TestUseModule(c *C) { mod := Module{ID: "lime", Source: "limeTree"} mod.Settings.Set("val1", AsProductOfModuleUse(cty.TupleVal([]cty.Value{ref}), "other")) setTestModuleInfo(mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{Name: "val1", Type: "list"}}, + Inputs: []modulereader.VarInfo{{Name: "val1", Type: cty.List(cty.Number)}}, }) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{{Name: "val1"}}, @@ -187,7 +179,7 @@ func (s *zeroSuite) TestUseModule(c *C) { mod := Module{ID: "lime", Source: "limeTree"} mod.Settings.Set("val1", cty.TupleVal([]cty.Value{ref})) setTestModuleInfo(mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{Name: "val1", Type: "list"}}, + Inputs: []modulereader.VarInfo{{Name: "val1", Type: cty.List(cty.Number)}}, }) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{{Name: "val1"}}, @@ -220,9 +212,7 @@ func (s *MySuite) TestApplyUseModules(c *C) { setTestModuleInfo(using, modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{{ - Name: "potato", - Type: "number", - }}}) + Name: "potato", Type: cty.Number}}}) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{ {Name: "potato"}}}) @@ -242,7 +232,7 @@ func (s *MySuite) TestApplyUseModules(c *C) { m := &dc.Config.DeploymentGroups[1].Modules[0] c.Assert(m.Settings, DeepEquals, Dict{}) c.Assert(dc.applyUseModules(), IsNil) - ref := ModuleRef("TestModule0", "test_inter_0").AsExpression().AsValue() + ref := ModuleRef("TestModule0", "test_inter_0").AsValue() c.Assert(m.Settings.Items(), DeepEquals, map[string]cty.Value{ "test_inter_0": 
AsProductOfModuleUse(ref, "TestModule0")}) } @@ -301,7 +291,7 @@ func (s *zeroSuite) TestCombineLabels(c *C) { "ghpc_deployment": cty.StringVal("golden"), })) - labelsRef := GlobalRef("labels").AsExpression().AsValue() + labelsRef := GlobalRef("labels").AsValue() lime := dc.Config.DeploymentGroups[0] // Labels are set @@ -325,93 +315,22 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { dc := s.getDeploymentConfigForTest() mod := &dc.Config.DeploymentGroups[0].Modules[0] - // Test no inputs, none required - c.Check(dc.applyGlobalVariables(), IsNil) - // Test no inputs, one required, doesn't exist in globals setTestModuleInfo(*mod, modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{{ Name: "gold", - Type: "string", + Type: cty.String, Required: true, }}, }) // Test no input, one required, exists in globals dc.Config.Vars.Set("gold", cty.StringVal("val")) - c.Check(dc.applyGlobalVariables(), IsNil) + dc.applyGlobalVariables() c.Assert( mod.Settings.Get("gold"), DeepEquals, - GlobalRef("gold").AsExpression().AsValue()) - - // Test one input, one required - mod.Settings.Set("reqVar", cty.StringVal("val")) - c.Assert(dc.applyGlobalVariables(), IsNil) - - // Test one input, none required, exists in globals - setTestModuleInfo(*mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{ - Name: "gold", - Type: "string", - Required: false, - }}, - }) - c.Assert(dc.applyGlobalVariables(), IsNil) -} - -func (s *zeroSuite) TestIsSimpleVariable(c *C) { - // True: Correct simple variable - got := isSimpleVariable("$(some_text)") - c.Assert(got, Equals, true) - // False: Missing $ - got = isSimpleVariable("(some_text)") - c.Assert(got, Equals, false) - // False: Missing ( - got = isSimpleVariable("$some_text)") - c.Assert(got, Equals, false) - // False: Missing ) - got = isSimpleVariable("$(some_text") - c.Assert(got, Equals, false) - // False: Contains Prefix - got = isSimpleVariable("prefix-$(some_text)") - c.Assert(got, Equals, false) - // False: Contains Suffix - got = isSimpleVariable("$(some_text)-suffix") - c.Assert(got, Equals, false) - // False: Contains prefix and suffix - got = isSimpleVariable("prefix-$(some_text)-suffix") - c.Assert(got, Equals, false) - // False: empty string - got = isSimpleVariable("") - c.Assert(got, Equals, false) -} - -func (s *zeroSuite) TestHasVariable(c *C) { - // True: simple variable - got := hasVariable("$(some_text)") - c.Assert(got, Equals, true) - // True: has prefix - got = hasVariable("prefix-$(some_text)") - c.Assert(got, Equals, true) - // True: has suffix - got = hasVariable("$(some_text)-suffix") - c.Assert(got, Equals, true) - // True: Two variables - got = hasVariable("$(some_text)$(some_more)") - c.Assert(got, Equals, true) - // True: two variable with other text - got = hasVariable("prefix-$(some_text)-$(some_more)-suffix") - c.Assert(got, Equals, true) - // False: missing $ - got = hasVariable("(some_text)") - c.Assert(got, Equals, false) - // False: missing ( - got = hasVariable("$some_text)") - c.Assert(got, Equals, false) - // False: missing ) - got = hasVariable("$(some_text") - c.Assert(got, Equals, false) + GlobalRef("gold").AsValue()) } func (s *zeroSuite) TestValidateModuleReference(c *C) { diff --git a/pkg/config/expression.go b/pkg/config/expression.go index 85a9e58217..dda754b563 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -15,6 +15,7 @@ package config import ( + "bytes" "fmt" "regexp" "strings" @@ -22,6 +23,7 @@ import ( "github.com/hashicorp/hcl/v2" "github.com/hashicorp/hcl/v2/hclsyntax" 
"github.com/hashicorp/hcl/v2/hclwrite" + "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/function" "github.com/zclconf/go-cty/cty/function/stdlib" @@ -55,89 +57,54 @@ func (r Reference) AsExpression() Expression { return MustParseExpression(fmt.Sprintf("module.%s.%s", r.Module, r.Name)) } -// MakeStringInterpolationError generates an error message guiding the user to proper escape syntax -func MakeStringInterpolationError(s string) error { - matchall := anyVariableExp.FindAllString(s, -1) - hint := "" - for _, element := range matchall { - // the regex match will include the first matching character - // this might be (1) "^" or (2) any character EXCEPT "\" - // if (2), we have to remove the first character from the match - if element[0:2] != "$(" { - element = strings.Replace(element, element[0:1], "", 1) - } - hint += "\\" + element + " will be rendered as " + element + "\n" - } - return fmt.Errorf( - "variables \"$(...)\" within strings are not yet implemented. remove them or add a backslash to render literally. \n%s", hint) -} - -// Takes `$(expression)` and returns `expression` -func extractSimpleVarExpression(s string) (string, error) { - if !hasVariable(s) { - return "", fmt.Errorf("%#v is not a variable", s) - } - if !isSimpleVariable(s) { - return "", MakeStringInterpolationError(s) - } - contents := simpleVariableExp.FindStringSubmatch(s) - if len(contents) != 2 { // Should always be (match, contents) here - return "", fmt.Errorf("%s %s, failed to extract contents: %v", errMsgInvalidVar, s, contents) - } - return contents[1], nil +func (r Reference) AsValue() cty.Value { + return r.AsExpression().AsValue() } // Takes traversal in "blueprint namespace" (e.g. `vars.zone` or `homefs.mount`) -// and transforms it to `Expression`. -func simpleTraversalToExpression(t hcl.Traversal) (Expression, error) { +// and transforms it to "terraform namespace" (e.g. `var.zone` or `module.homefs.mount`). +func bpTraversalToTerraform(t hcl.Traversal) (hcl.Traversal, error) { if len(t) < 2 { return nil, fmt.Errorf(expectedVarFormat) } - attr, ok := t[1].(hcl.TraverseAttr) + _, ok := t[1].(hcl.TraverseAttr) if !ok { return nil, fmt.Errorf(expectedVarFormat) } - var ref Reference if t.RootName() == "vars" { - t[0] = hcl.TraverseRoot{Name: "var"} - ref = GlobalRef(attr.Name) + root := hcl.TraverseRoot{Name: "var"} + return append([]hcl.Traverser{root}, t[1:]...), nil } else { - mod := t.RootName() - t[0] = hcl.TraverseAttr{Name: mod} root := hcl.TraverseRoot{Name: "module"} - t = append(hcl.Traversal{root}, t...) 
- ref = ModuleRef(ModuleID(mod), attr.Name) + mod := hcl.TraverseAttr{Name: t.RootName()} + return append([]hcl.Traverser{root, mod}, t[1:]...), nil } - - return &BaseExpression{ - e: &hclsyntax.ScopeTraversalExpr{Traversal: t}, - toks: hclwrite.TokensForTraversal(t), - rs: []Reference{ref}, - }, nil } -// SimpleVarToExpression takes a string `$(...)` and transforms it to `Expression` -func SimpleVarToExpression(s string) (Expression, error) { - s, err := extractSimpleVarExpression(s) - if err != nil { - return nil, err - } - hexp, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) +// BlueprintExpressionLiteralToExpression takes a content of `$(...)`-literal and transforms it to `Expression` +func BlueprintExpressionLiteralToExpression(s string) (Expression, error) { + bpExp, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) if diag.HasErrors() { return nil, diag } - - switch texp := hexp.(type) { - case *hclsyntax.ScopeTraversalExpr: - exp, err := simpleTraversalToExpression(texp.Traversal) + toks, err := parseHcl(s) + if err != nil { + return nil, err + } + for _, t := range bpExp.Variables() { + new, err := bpTraversalToTerraform(t) if err != nil { - return nil, fmt.Errorf("failed to parse variable %q: %w", s, err) + return nil, err } - return exp, nil - default: - return nil, fmt.Errorf("only traversal expressions are supported, got %q", s) + + toks = replaceTokens( + toks, + hclwrite.TokensForTraversal(t), + hclwrite.TokensForTraversal(new)) } + + return ParseExpression(string(toks.Bytes())) } // TraversalToReference takes HCL traversal and returns `Reference` @@ -176,24 +143,10 @@ func TraversalToReference(t hcl.Traversal) (Reference, error) { } } -// IsYamlExpressionLiteral checks if passed value of type cty.String -// and its content starts with "((" and ends with "))". -// Returns trimmed string and result of test. -func IsYamlExpressionLiteral(v cty.Value) (string, bool) { - if v.Type() != cty.String { - return "", false - } - s := v.AsString() - if len(s) < 4 || s[:2] != "((" || s[len(s)-2:] != "))" { - return "", false - } - return s[2 : len(s)-2], true -} - // Expression is a representation of expressions in Blueprint type Expression interface { - // Eval evaluates the expression in the context of Blueprint - Eval(bp Blueprint) (cty.Value, error) + // Eval evaluates the expression in the given context + Eval(ctx *hcl.EvalContext) (cty.Value, error) // Tokenize returns Tokens to be used for marshalling HCL Tokenize() hclwrite.Tokens // References return Reference for all variables used in the expression @@ -207,17 +160,29 @@ type Expression interface { key() expressionKey } -// ParseExpression returns Expression -func ParseExpression(s string) (Expression, error) { - e, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) +func parseHcl(s string) (hclwrite.Tokens, error) { + sToks, diag := hclsyntax.LexExpression([]byte(s), "", hcl.Pos{}) if diag.HasErrors() { return nil, diag } - sToks, _ := hclsyntax.LexExpression([]byte(s), "", hcl.Pos{}) wToks := make(hclwrite.Tokens, len(sToks)) for i, st := range sToks { wToks[i] = &hclwrite.Token{Type: st.Type, Bytes: st.Bytes} } + return wToks, nil +} + +// ParseExpression returns Expression +// Expects expression in "terraform namespace" (e.g. 
`var.zone` or `module.homefs.mount`) +func ParseExpression(s string) (Expression, error) { + e, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) + if diag.HasErrors() { + return nil, diag + } + toks, err := parseHcl(s) + if err != nil { + return nil, err + } ts := e.Variables() rs := make([]Reference, len(ts)) @@ -227,7 +192,7 @@ func ParseExpression(s string) (Expression, error) { return nil, err } } - return BaseExpression{e: e, toks: wToks, rs: rs}, nil + return BaseExpression{e: e, toks: toks, rs: rs}, nil } // MustParseExpression is "errorless" version of ParseExpression @@ -248,17 +213,25 @@ type BaseExpression struct { rs []Reference } -// Eval evaluates the expression in the context of Blueprint -func (e BaseExpression) Eval(bp Blueprint) (cty.Value, error) { - ctx := hcl.EvalContext{ - Variables: map[string]cty.Value{"var": bp.Vars.AsObject()}, - Functions: functions(), +func handleEvalErr(diag hcl.Diagnostics) error { + if !diag.HasErrors() { + return nil } - v, diag := e.e.Value(&ctx) - if diag.HasErrors() { - return cty.NilVal, diag + err := diag.Errs()[0] + if match := regexp.MustCompile(`There is no function named "(\w+)"`).FindStringSubmatch(err.Error()); match != nil { + sf := strings.Join(maps.Keys(functions()), ", ") + return HintError{ + Err: fmt.Errorf("unsupported function %q", match[1]), + Hint: fmt.Sprintf("this context only supports following functions: %v", sf)} } - return v, nil + return err + +} + +// Eval evaluates the expression in the context of Blueprint +func (e BaseExpression) Eval(ctx *hcl.EvalContext) (cty.Value, error) { + v, diag := e.e.Value(ctx) + return v, handleEvalErr(diag) } // Tokenize returns Tokens to be used for marshalling HCL @@ -340,18 +313,6 @@ func HasMark[T any](val cty.Value) (T, bool) { return tgt, found } -func escapeBlueprintVariables(s string) string { - // Convert \$(not.variable) to $(not.variable) - re := regexp.MustCompile(`\\\$\(`) - return re.ReplaceAllString(s, `$(`) -} - -func escapeLiteralVariables(s string) string { - // Convert \((not.variable)) to ((not.variable)) - re := regexp.MustCompile(`\\\(\(`) - return re.ReplaceAllString(s, `((`) -} - // TokensForValue is a modification of hclwrite.TokensForValue. // The only difference in behavior is handling "HCL literal" strings. 
func TokensForValue(val cty.Value) hclwrite.Tokens { @@ -363,20 +324,8 @@ func TokensForValue(val cty.Value) hclwrite.Tokens { if e, is := IsExpressionValue(val); is { return e.Tokenize() } - val, _ = val.Unmark() // remove marks, as we don't need them anymore - if s, is := IsYamlExpressionLiteral(val); is { // return it "as is" - return hclwrite.TokensForIdentifier(s) - } - + val, _ = val.Unmark() // remove marks, as we don't need them anymore ty := val.Type() - if ty == cty.String { - s := val.AsString() - // The order of application matters, for an edge cases like: `\$\((` -> `$((` - s = escapeLiteralVariables(s) - s = escapeBlueprintVariables(s) - return hclwrite.TokensForValue(cty.StringVal(s)) - } - if ty.IsListType() || ty.IsSetType() || ty.IsTupleType() { tl := []hclwrite.Tokens{} for it := val.ElementIterator(); it.Next(); { @@ -397,7 +346,6 @@ func TokensForValue(val cty.Value) hclwrite.Tokens { tl = append(tl, hclwrite.ObjectAttrTokens{Name: kt, Value: vt}) } return hclwrite.TokensForObject(tl) - } return hclwrite.TokensForValue(val) // rely on hclwrite implementation } @@ -422,24 +370,232 @@ func functions() map[string]function.Function { } } -func valueReferences(v cty.Value) []Reference { - r := map[Reference]bool{} - cty.Walk(v, func(_ cty.Path, v cty.Value) (bool, error) { +func valueReferences(v cty.Value) map[Reference]cty.Path { + r := map[Reference]cty.Path{} + cty.Walk(v, func(p cty.Path, v cty.Value) (bool, error) { if e, is := IsExpressionValue(v); is { for _, ref := range e.References() { - r[ref] = true + r[ref] = p } } return true, nil }) - return maps.Keys(r) + return r } -func evalValue(v cty.Value, bp Blueprint) (cty.Value, error) { +func (bp *Blueprint) Eval(v cty.Value) (cty.Value, error) { + ctx := hcl.EvalContext{ + Variables: map[string]cty.Value{ + "var": bp.Vars.AsObject()}, + Functions: functions()} + return eval(v, &ctx) +} + +func eval(v cty.Value, ctx *hcl.EvalContext) (cty.Value, error) { return cty.Transform(v, func(p cty.Path, v cty.Value) (cty.Value, error) { if e, is := IsExpressionValue(v); is { - return e.Eval(bp) + return e.Eval(ctx) } return v, nil }) } + +type pToken struct { + s string + e Expression +} + +func tokenizeBpLine(s string) ([]pToken, error) { + line := s // copy + toks := []pToken{} + var exp Expression + var err error + bsRe := regexp.MustCompile(`\\*$`) // to count number of backslashes at the end + + for len(s) > 0 { + i := strings.Index(s, "$(") + if i == -1 { // plain string until the end + toks, s = append(toks, pToken{s: s}), "" // add everything + break // and terminate + } + p := s[:i] + s = s[i+2:] // split as `p$(s` + bs := len(bsRe.FindString(p)) // get number of trailing backslashes + p = p[:len(p)-bs+bs/2] // keep (smaller) half of backslashes + toks = append(toks, pToken{s: p}) // add tokens up to "$(" + + if bs%2 == 1 { // escaped $( + toks = append(toks, pToken{s: "$("}) // add "$(" + } else { // found beginning of expression + offset := len(line) - len(s) + exp, s, err = greedyParseHcl(s) // parse after "$(" + if err != nil { + return nil, prepareParseHclErr(err, line, offset) + } + toks = append(toks, pToken{e: exp}) // add expression + } + } + return toks, nil +} + +// One can't translate HCL diagnostics position to the global YAML position, +// due to lack of information about YAML string-style (e.g. double quoted, plain, folded etc), +// therefore start position of the string in YAML document and indentation. +// Render error in a scope of a single line of the string instead. 
+func prepareParseHclErr(err error, line string, offset int) error { + var col int + if diag, is := err.(hcl.Diagnostics); is { + derr, _ := diag.Errs()[0].(*hcl.Diagnostic) + col = offset + derr.Subject.Start.Column + err = fmt.Errorf("%s; %s", derr.Summary, derr.Detail) + } else { + col = offset // point at the beginning of expression + } + return fmt.Errorf("%s\n %s\n %s^", err, line, strings.Repeat(" ", col)) +} + +func tokenizeBpString(s string) ([]pToken, error) { + toks := []pToken{} + + // can't use `bufio.NewScanner` as it doesn't preserve trailing empty lines + lines := regexp.MustCompile("\r?\n").Split(s, -1) + for _, line := range lines { + if len(toks) > 0 { + toks = append(toks, pToken{s: "\n"}) + } + ltoks, err := tokenizeBpLine(line) + if err != nil { + return nil, err + } + toks = append(toks, ltoks...) + } + return toks, nil +} + +func compactTokens(toks []pToken) []pToken { + res := []pToken{} + for _, t := range toks { + if t.e != nil { + res = append(res, t) // add as is + } else { + if t.s == "" { + continue // skip + } + if len(res) > 0 && res[len(res)-1].e == nil { + res[len(res)-1].s += t.s // merge with previous + } else { + res = append(res, t) // add as is + } + } + } + return res +} + +func parseBpLit(s string) (cty.Value, error) { + toks, err := tokenizeBpString(s) + if err != nil { + return cty.NilVal, err + } + toks = compactTokens(toks) + if len(toks) == 0 { + return cty.StringVal(""), nil + } + if len(toks) == 1 { + if toks[0].e != nil { + return toks[0].e.AsValue(), nil + } else { + return cty.StringVal(toks[0].s), nil + } + } + + exp, err := buildStringInterpolation(toks) + if err != nil { + return cty.NilVal, err + } + return exp.AsValue(), nil +} + +// greedyParseHcl tries to parse prefix of `s` as a valid HCL expression. +// It iterates over all closing brackets and tries to parse expression up to them. +// The shortest expression is returned. E.g: +// "var.hi) $(var.there)" -> "var.hi" +// "try(var.this) + one(var.time)) tail" -> "try(var.this) + one(var.time)" +func greedyParseHcl(s string) (Expression, string, error) { + err := errors.New("no closing parenthesis") + for i := 0; i < len(s); i++ { + if s[i] != ')' { + continue + } + _, diag := hclsyntax.ParseExpression([]byte(s[:i]), "", hcl.Pos{}) + if !diag.HasErrors() { // found an expression + exp, err := BlueprintExpressionLiteralToExpression(s[:i]) + return exp, s[i+1:], err + } + err = diag // save error, try to find another closing bracket + } + return nil, s, err +} + +func buildStringInterpolation(pts []pToken) (Expression, error) { + toks := hclwrite.Tokens{&hclwrite.Token{ + Type: hclsyntax.TokenOQuote, + Bytes: []byte(`"`)}, + } + + for _, pt := range pts { + if pt.e != nil { + toks = append(toks, &hclwrite.Token{ + Type: hclsyntax.TokenTemplateInterp, + Bytes: []byte(`${`)}) + toks = append(toks, pt.e.Tokenize()...) + toks = append(toks, &hclwrite.Token{ + Type: hclsyntax.TokenTemplateSeqEnd, + Bytes: []byte(`}`)}) + } else { + stoks := hclwrite.TokensForValue(cty.StringVal(pt.s)) + stoks = stoks[1 : len(stoks)-1] // remove quotes + toks = append(toks, stoks...) 
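// net effect (illustrative): a blueprint literal such as `gold $(vars.here)` is
// rebuilt here as the HCL template "gold ${var.here}"; see TestParseBpLit in
// expression_test.go for more end-to-end cases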
+ } + } + + toks = append(toks, &hclwrite.Token{ + Type: hclsyntax.TokenCQuote, + Bytes: []byte(`"`)}) + return ParseExpression(string(toks.Bytes())) +} + +func trimEOF(ts hclwrite.Tokens) hclwrite.Tokens { + if len(ts) > 0 && ts[len(ts)-1].Type == hclsyntax.TokenEOF { + return ts[:len(ts)-1] + } + return ts +} + +func replaceTokens(body, old, new hclwrite.Tokens) hclwrite.Tokens { + old, new = trimEOF(old), trimEOF(new) + if len(old) == 0 { + return body + } + + r := hclwrite.Tokens{} + + p := hclwrite.Tokens{} // matching prefix of `old` + for _, t := range body { + c := old[len(p)] + p = append(p, t) + if t.Type != c.Type || !bytes.Equal(t.Bytes, c.Bytes) { // t != c + r = append(r, p...) // stop comparison and flash prefix + p = hclwrite.Tokens{} + } + if len(p) == len(old) { // gathered enough tokens + p = hclwrite.Tokens{} + r = append(r, new...) + } + } + return append(r, p...) +} + +func ReplaceSubExpressions(body, old, new Expression) (Expression, error) { + r := replaceTokens(body.Tokenize(), old.Tokenize(), new.Tokenize()) + return ParseExpression(string(r.Bytes())) +} diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index be6206a1ee..15d3998d9c 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -15,6 +15,7 @@ package config import ( + "fmt" "testing" "github.com/google/go-cmp/cmp" @@ -71,41 +72,14 @@ func TestTraversalToReference(t *testing.T) { } } -func TestIsYamlHclLiteral(t *testing.T) { - type test struct { - input string - want string - check bool - } - tests := []test{ - {"((var.green))", "var.green", true}, - {"((${var.green}))", "${var.green}", true}, - {"(( 7 + a }))", " 7 + a }", true}, - {"(var.green)", "", false}, - {"((var.green)", "", false}, - {"$(var.green)", "", false}, - {"${var.green}", "", false}, - } - for _, tc := range tests { - t.Run(tc.input, func(t *testing.T) { - got, check := IsYamlExpressionLiteral(cty.StringVal(tc.input)) - if diff := cmp.Diff(tc.want, got); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - if diff := cmp.Diff(tc.check, check); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestSimpleVarToExpression(t *testing.T) { +func TestParseBpLit(t *testing.T) { type test struct { input string want string err bool } tests := []test{ + // Single expression, without string interpolation {"$(vars.green)", "var.green", false}, {"$(vars.green[3])", "var.green[3]", false}, {"$(vars.green.sleeve)", "var.green.sleeve", false}, @@ -119,24 +93,67 @@ func TestSimpleVarToExpression(t *testing.T) { {"$(box.green.sleeve[3])", "module.box.green.sleeve[3]", false}, {`$(box.green["sleeve"])`, `module.box.green["sleeve"]`, false}, + // String interpolation + {`1gold was here`, `"1gold was here"`, false}, + {`2gold $(vars.here)`, `"2gold ${var.here}"`, false}, + {`3gold $(vars.here) but $(vars.gone)`, `"3gold ${var.here} but ${var.gone}"`, false}, + {`4gold +$(vars.here)`, `"4gold\n${var.here}"`, false}, // quoted strings may not be split over multiple lines + + {`5gold +was here`, `"5gold\nwas here"`, false}, + {"6gold $(vars.here", ``, true}, // missing close parenthesis + + {`#!/bin/bash +echo "Hello $(vars.project_id) from $(vars.region)"`, `"#!/bin/bash\necho \"Hello ${var.project_id} from ${var.region}\""`, false}, + {`#!/bin/bash +echo "Hello $(vars.project_id)" +`, `"#!/bin/bash\necho \"Hello ${var.project_id}\"\n"`, false}, + {"", `""`, false}, + {`$(try(vars.this) + one(vars.time))`, "try(var.this)+one(var.time)", false}, + + // Escaping + {`q 
$(vars.t)`, `"q ${var.t}"`, false}, // no escaping + {`q \$(vars.t)`, `"q $(vars.t)"`, false}, // escaped `$(` + {`q \\$(vars.t)`, `"q \\${var.t}"`, false}, // escaped `\` + {`q \\\$(vars.t)`, `"q \\$(vars.t)"`, false}, // escaped both `\` and `$(` + {`q \\\\$(vars.t)`, `"q \\\\${var.t}"`, false}, // escaped `\\` + {`q \\\\\$(vars.t)`, `"q \\\\$(vars.t)"`, false}, // escaped both `\\` and `$(` + + // Translation of complex expressions BP -> Terraform + {"$(vars.green + amber.blue)", "var.green+module.amber.blue", false}, + {"$(5 + vars.blue)", "5+var.blue", false}, + {"$(5)", "5", false}, + {`$("${vars.green}_${vars.sleeve}")`, `"${var.green}_${var.sleeve}"`, false}, + {"$(fun(vars.green))", "fun(var.green)", false}, + + // Untranslatable expressions {"$(vars)", "", true}, {"$(sleeve)", "", true}, - {"gold $(var.here)", "", true}, {"$(box[3])", "", true}, // can't index module {`$(box["green"])`, "", true}, // can't index module {"$(vars[3]])", "", true}, // can't index vars {`$(vars["green"])`, "", true}, // can't index module + } for _, tc := range tests { t.Run(tc.input, func(t *testing.T) { - exp, err := SimpleVarToExpression(tc.input) + v, err := parseBpLit(tc.input) if tc.err != (err != nil) { t.Errorf("got unexpected error: %s", err) } if err != nil { return } - got := string(exp.Tokenize().Bytes()) + var got string + if v.Type() == cty.String { + got = string(hclwrite.TokensForValue(v).Bytes()) + } else if exp, is := IsExpressionValue(v); is { + got = string(exp.Tokenize().Bytes()) + } else { + t.Fatalf("got value of unexpected type: %#v", v) + } + if diff := cmp.Diff(tc.want, got); diff != "" { t.Errorf("diff (-want +got):\n%s", diff) } @@ -154,6 +171,7 @@ func TestTokensForValueNoLiteral(t *testing.T) { "ba": cty.NumberIntVal(56), })}), "pony.zebra": cty.NilVal, + "zanzibar": cty.NullVal(cty.DynamicPseudoType), }) want := hclwrite.NewEmptyFile() want.Body().AppendUnstructuredTokens(hclwrite.TokensForValue(val)) @@ -166,33 +184,13 @@ func TestTokensForValueNoLiteral(t *testing.T) { } } -func TestTokensForValueWithLiteral(t *testing.T) { - val := cty.ObjectVal(map[string]cty.Value{ - "tan": cty.TupleVal([]cty.Value{ - cty.StringVal("((var.kilo + 8))"), // HCL literal - MustParseExpression("var.tina + 4").AsValue(), // HclExpression value - })}) - want := ` -{ - tan = [var.kilo + 8, var.tina + 4] -}`[1:] - - gotF := hclwrite.NewEmptyFile() - gotF.Body().AppendUnstructuredTokens(TokensForValue(val)) - got := hclwrite.Format(gotF.Bytes()) // format to normalize whitespace - - if diff := cmp.Diff(want, string(got)); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } -} - func TestFlattenFunctionCallExpression(t *testing.T) { bp := Blueprint{Vars: NewDict(map[string]cty.Value{ "three": cty.NumberIntVal(3), })} expr := FunctionCallExpression("flatten", cty.TupleVal([]cty.Value{ cty.TupleVal([]cty.Value{cty.NumberIntVal(1), cty.NumberIntVal(2)}), - GlobalRef("three").AsExpression().AsValue(), + GlobalRef("three").AsValue(), })) want := cty.TupleVal([]cty.Value{ @@ -200,7 +198,7 @@ func TestFlattenFunctionCallExpression(t *testing.T) { cty.NumberIntVal(2), cty.NumberIntVal(3)}) - got, err := expr.Eval(bp) + got, err := bp.Eval(expr.AsValue()) if err != nil { t.Errorf("got unexpected error: %s", err) } @@ -220,7 +218,7 @@ func TestMergeFunctionCallExpression(t *testing.T) { "one": cty.NumberIntVal(1), "two": cty.NumberIntVal(3), }), - GlobalRef("fix").AsExpression().AsValue(), + GlobalRef("fix").AsValue(), ) want := cty.ObjectVal(map[string]cty.Value{ @@ -228,7 +226,7 @@ func 
TestMergeFunctionCallExpression(t *testing.T) { "two": cty.NumberIntVal(2), }) - got, err := expr.Eval(bp) + got, err := bp.Eval(expr.AsValue()) if err != nil { t.Errorf("got unexpected error: %s", err) } @@ -236,3 +234,39 @@ func TestMergeFunctionCallExpression(t *testing.T) { t.Errorf("diff (-want +got):\n%s", diff) } } + +func TestReplaceTokens(t *testing.T) { + type test struct { + body string + old string + new string + want string + } + tests := []test{ + {"var.green", "var.green", "var.blue", "var.blue"}, + {"var.green + var.green", "var.green", "var.blue", "var.blue+var.blue"}, + {"vars.green + 5", "vars.green", "var.green", "var.green+5"}, + {"var.green + var.blue", "vars.gold", "var.silver", "var.green+var.blue"}, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("s/%s/%s/%s", tc.old, tc.new, tc.body), func(t *testing.T) { + b, err := parseHcl(tc.body) + if err != nil { + t.Fatal(err) + } + o, err := parseHcl(tc.old) + if err != nil { + t.Fatal(err) + } + n, err := parseHcl(tc.new) + if err != nil { + t.Fatal(err) + } + + got := replaceTokens(b, o, n) + if diff := cmp.Diff(tc.want, string(got.Bytes())); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/config/path.go b/pkg/config/path.go index fef57d7b56..c22de0aeb7 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -156,10 +156,10 @@ type groupPath struct { basePath Name basePath `path:".group"` Backend backendPath `path:".terraform_backend"` - Modules arrayPath[modulePath] `path:".modules"` + Modules arrayPath[ModulePath] `path:".modules"` } -type modulePath struct { +type ModulePath struct { basePath Source basePath `path:".source"` Kind basePath `path:".kind"` @@ -179,9 +179,6 @@ type outputPath struct { // Root is a starting point for creating a Blueprint Path var Root rootPath -// internalPath is to be used to report problems outside of Blueprint schema (e.g. 
YAML parsing error position) -var internalPath = mapPath[basePath]{basePath{nil, "__internal_path__"}} - func init() { initPath(&Root, nil, "") } diff --git a/pkg/config/path_test.go b/pkg/config/path_test.go index adc702f666..b17a0dc219 100644 --- a/pkg/config/path_test.go +++ b/pkg/config/path_test.go @@ -72,9 +72,6 @@ func TestPath(t *testing.T) { {r.Backend.Type, "terraform_backend_defaults.type"}, {r.Backend.Configuration, "terraform_backend_defaults.configuration"}, {r.Backend.Configuration.Dot("goo"), "terraform_backend_defaults.configuration.goo"}, - - {internalPath, "__internal_path__"}, - {internalPath.Dot("a"), "__internal_path__.a"}, } for _, tc := range tests { t.Run(tc.want, func(t *testing.T) { @@ -103,8 +100,6 @@ func TestPathParent(t *testing.T) { {r.Vars.Dot("red").Cty(cp.IndexInt(6)), r.Vars.Dot("red")}, {r.Vars.Dot("red").Cty(cp.IndexInt(6).IndexString("gg")), r.Vars.Dot("red").Cty(cp.IndexInt(6))}, {r.Vars.Dot("red").Cty(cp.IndexInt(6).IndexString("gg").Index(cty.True)), r.Vars.Dot("red").Cty(cp.IndexInt(6))}, - {internalPath, nil}, - {internalPath.Dot("gold"), internalPath}, } for _, tc := range tests { t.Run(tc.p.String(), func(t *testing.T) { diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 79435eec43..d3a8d06009 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" + "golang.org/x/exp/maps" ) const maxLabels = 64 @@ -82,7 +83,7 @@ func validateVars(vars Dict) error { return errs.OrNil() } -func validateModule(p modulePath, m Module, bp Blueprint) error { +func validateModule(p ModulePath, m Module, bp Blueprint) error { // Source/Kind validations are required to pass to perform other validations if m.Source == "" { return BpError{p.Source, EmptyModuleSource} @@ -113,7 +114,7 @@ func validateModule(p modulePath, m Module, bp Blueprint) error { OrNil() } -func validateOutputs(p modulePath, mod Module, info modulereader.ModuleInfo) error { +func validateOutputs(p ModulePath, mod Module, info modulereader.ModuleInfo) error { errs := Errors{} outputs := info.GetOutputsAsMap() @@ -133,7 +134,7 @@ type moduleVariables struct { } func validateSettings( - p modulePath, + p ModulePath, mod Module, info modulereader.ModuleInfo) error { @@ -163,7 +164,8 @@ func validateSettings( } // Setting not found if _, ok := cVars.Inputs[k]; !ok { - errs.At(sp, UnknownModuleSetting) + err := hintSpelling(k, maps.Keys(cVars.Inputs), UnknownModuleSetting) + errs.At(sp, err) continue // do not perform other validations } diff --git a/pkg/config/validator_test.go b/pkg/config/validator_test.go index 95d6e5498a..c79df42096 100644 --- a/pkg/config/validator_test.go +++ b/pkg/config/validator_test.go @@ -37,6 +37,12 @@ func (s *zeroSuite) TestValidateVars(c *C) { c.Check(validateVars(vars), NotNil) } + { // Fail: Null value + vars := Dict{base} + vars.Set("fork", cty.NullVal(cty.String)) + c.Check(validateVars(vars), NotNil) + } + { // Fail: labels not a map vars := Dict{base} vars.Set("labels", cty.StringVal("a_string")) diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index c3d912335b..05144fac42 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -22,6 +22,7 @@ import ( "os" "regexp" "strconv" + "strings" "github.com/hashicorp/hcl/v2/hclwrite" "github.com/pkg/errors" @@ -68,15 +69,7 @@ func importBlueprint(f string) (Blueprint, YamlCtx, error) { var bp Blueprint if err = decoder.Decode(&bp); err != nil { - errs := Errors{} - for i, yep := range parseYamlV3Error(err) { - 
path := internalPath.Dot(fmt.Sprintf("bp_schema_error_%d", i)) - if yep.pos.Line != 0 { - yamlCtx.pathToPos[yPath(path.String())] = yep.pos - } - errs.At(path, errors.New(yep.errMsg)) - } - return Blueprint{}, yamlCtx, errs + return Blueprint{}, yamlCtx, parseYamlV3Error(err) } return bp, yamlCtx, nil } @@ -148,15 +141,7 @@ func NewYamlCtx(data []byte) (YamlCtx, error) { // error may happen if YAML is not valid, regardless of Blueprint schema if err := yaml.Unmarshal(data, &c); err != nil { - errs := Errors{} - for i, yep := range parseYamlV3Error(err) { - path := internalPath.Dot(fmt.Sprintf("yaml_error_%d", i)) - if yep.pos.Line != 0 { - m[yPath(path.String())] = yep.pos - } - errs.At(path, errors.New(yep.errMsg)) - } - return YamlCtx{m, lines}, errs + return YamlCtx{m, lines}, parseYamlV3Error(err) } var walk func(n *yaml.Node, p yPath, posOf *yaml.Node) @@ -186,6 +171,10 @@ func NewYamlCtx(data []byte) (YamlCtx, error) { type nodeCapturer struct{ n *yaml.Node } +func nodeToPosErr(n *yaml.Node, err error) PosError { + return PosError{Pos{Line: n.Line, Column: n.Column}, err} +} + func (c *nodeCapturer) UnmarshalYAML(n *yaml.Node) error { c.n = n return nil @@ -199,7 +188,7 @@ func (mk *ModuleKind) UnmarshalYAML(n *yaml.Node) error { mk.kind = kind return nil } - return fmt.Errorf("line %d: kind must be \"packer\" or \"terraform\" or removed from YAML", n.Line) + return nodeToPosErr(n, errors.New(`kind must be "packer" or "terraform" or removed from YAML`)) } // MarshalYAML implements a custom marshaler from ModuleKind to YAML string @@ -211,7 +200,7 @@ func (mk ModuleKind) MarshalYAML() (interface{}, error) { func (ms *ModuleIDs) UnmarshalYAML(n *yaml.Node) error { var ids []ModuleID if err := n.Decode(&ids); err != nil { - return fmt.Errorf("line %d: `use` must be a list of module ids", n.Line) + return nodeToPosErr(n, errors.New("`use` must be a list of module ids")) } *ms = ids return nil @@ -219,14 +208,23 @@ func (ms *ModuleIDs) UnmarshalYAML(n *yaml.Node) error { // YamlValue is wrapper around cty.Value to handle YAML unmarshal. type YamlValue struct { - v cty.Value + v cty.Value // do not use this field directly, use Wrap() and Unwrap() instead } // Unwrap returns wrapped cty.Value. func (y YamlValue) Unwrap() cty.Value { + if y.v == cty.NilVal { + // we can't use 0-value of cty.Value (NilVal) + // instead it should be a proper null(any) value + return cty.NullVal(cty.DynamicPseudoType) + } return y.v } +func (y *YamlValue) Wrap(v cty.Value) { + y.v = v +} + // UnmarshalYAML implements custom YAML unmarshaling. 
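// For example (illustrative mapping of YAML scalars to cty values):
//
//    count: 3            -> cty.NumberIntVal(3)
//    run: ((var.x + 1))  -> expression value for `var.x + 1` (HCL literal)
//    run: $(vars.x)      -> expression value for `var.x` (blueprint expression)
//    run: \$(vars.x)     -> cty.StringVal("$(vars.x)") (escaped, kept as plain text)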
func (y *YamlValue) UnmarshalYAML(n *yaml.Node) error { var err error @@ -238,7 +236,7 @@ func (y *YamlValue) UnmarshalYAML(n *yaml.Node) error { case yaml.SequenceNode: err = y.unmarshalTuple(n) default: - err = fmt.Errorf("line %d: cannot decode node with unknown kind %d", n.Line, n.Kind) + err = nodeToPosErr(n, fmt.Errorf("cannot decode node with unknown kind %d", n.Kind)) } return err } @@ -250,30 +248,40 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { } ty, err := gocty.ImpliedType(s) if err != nil { - return fmt.Errorf("line %d: %w", n.Line, err) + return nodeToPosErr(n, err) } - if y.v, err = gocty.ToCtyValue(s, ty); err != nil { + v, err := gocty.ToCtyValue(s, ty) + if err != nil { return err } - if l, is := IsYamlExpressionLiteral(y.v); is { // HCL literal - var e Expression - if e, err = ParseExpression(l); err != nil { - // TODO: point to exact location within expression, see Diagnostic.Subject - return fmt.Errorf("line %d: %w", n.Line, err) - } - y.v = e.AsValue() - } else if y.v.Type() == cty.String && hasVariable(y.v.AsString()) { // "simple" variable - e, err := SimpleVarToExpression(y.v.AsString()) - if err != nil { - // TODO: point to exact location within expression, see Diagnostic.Subject + if v.Type() == cty.String { + if v, err = parseYamlString(v.AsString()); err != nil { return fmt.Errorf("line %d: %w", n.Line, err) } - y.v = e.AsValue() } + y.Wrap(v) return nil } +func isHCLLiteral(s string) bool { + return strings.HasPrefix(s, "((") && strings.HasSuffix(s, "))") +} + +func parseYamlString(s string) (cty.Value, error) { + if isHCLLiteral(s) { + if e, err := ParseExpression(s[2 : len(s)-2]); err != nil { + return cty.NilVal, err + } else { + return e.AsValue(), nil + } + } + if strings.HasPrefix(s, `\((`) && strings.HasSuffix(s, `))`) { + return cty.StringVal(s[1:]), nil // escaped HCL literal + } + return parseBpLit(s) +} + func (y *YamlValue) unmarshalObject(n *yaml.Node) error { var my map[string]YamlValue if err := n.Decode(&my); err != nil { @@ -281,9 +289,9 @@ func (y *YamlValue) unmarshalObject(n *yaml.Node) error { } mv := map[string]cty.Value{} for k, y := range my { - mv[k] = y.v + mv[k] = y.Unwrap() } - y.v = cty.ObjectVal(mv) + y.Wrap(cty.ObjectVal(mv)) return nil } @@ -294,9 +302,9 @@ func (y *YamlValue) unmarshalTuple(n *yaml.Node) error { } lv := []cty.Value{} for _, y := range ly { - lv = append(lv, y.v) + lv = append(lv, y.Unwrap()) } - y.v = cty.TupleVal(lv) + y.Wrap(cty.TupleVal(lv)) return nil } @@ -306,11 +314,11 @@ func (d *Dict) UnmarshalYAML(n *yaml.Node) error { if err := n.Decode(&v); err != nil { return err } - ty := v.v.Type() + ty := v.Unwrap().Type() if !ty.IsObjectType() { - return fmt.Errorf("line %d: must be a mapping, got %s", n.Line, ty.FriendlyName()) + return nodeToPosErr(n, fmt.Errorf("must be a mapping, got %s", ty.FriendlyName())) } - for k, w := range v.v.AsValueMap() { + for k, w := range v.Unwrap().AsValueMap() { d.Set(k, w) } return nil @@ -319,10 +327,25 @@ func (d *Dict) UnmarshalYAML(n *yaml.Node) error { // MarshalYAML implements custom YAML marshaling. 
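// Marshalling mirrors the unmarshalling above (illustrative):
//
//    expression value `var.foo`  -> ((var.foo))
//    plain string "((foo))"      -> \((foo))       (escaped so it is not re-read as an HCL literal)
//    plain string "hi $(vars.x)" -> hi \$(vars.x)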
func (d Dict) MarshalYAML() (interface{}, error) { o, _ := cty.Transform(d.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { + if v.IsNull() { + return v, nil + } if e, is := IsExpressionValue(v); is { s := string(hclwrite.Format(e.Tokenize().Bytes())) return cty.StringVal("((" + s + "))"), nil } + if v.Type() == cty.String { + // Need to escape back the non-expressions (both HCL and blueprint ones) + s := v.AsString() + if isHCLLiteral(s) { + // yaml: "\((foo))" -unmarshal-> cty: "((foo))" -marshall-> yaml: "\((foo))" + // NOTE: don't attempt to escape both HCL and blueprint expressions + // they don't get unmarshalled together, terminate here + return cty.StringVal(`\` + s), nil + } + // yaml: "\$(var.foo)" -unmarshal-> cty: "$(var.foo)" -marshall-> yaml: "\$(var.foo)" + return cty.StringVal(strings.ReplaceAll(s, `$(`, `\$(`)), nil + } return v, nil }) @@ -339,40 +362,37 @@ func (d Dict) MarshalYAML() (interface{}, error) { return g, nil } -type yamlErrWithPos struct { - pos Pos - errMsg string -} - // yaml.v3 errors are either TypeError - collection of error message or single error message. // Parse error messages to extract short error message and position. -func parseYamlV3Error(err error) []yamlErrWithPos { - res := []yamlErrWithPos{} +func parseYamlV3Error(err error) error { + errs := Errors{} switch err := err.(type) { case *yaml.TypeError: for _, s := range err.Errors { - res = append(res, parseYamlV3ErrorString(s)) + errs.Add(parseYamlV3ErrorString(s)) } + case PosError: + errs.Add(err) default: - res = append(res, parseYamlV3ErrorString(err.Error())) + errs.Add(parseYamlV3ErrorString(err.Error())) } - if len(res) == 0 { // should never happen - res = append(res, parseYamlV3ErrorString(err.Error())) + if !errs.Any() { // should never happen + errs.Add(parseYamlV3ErrorString(err.Error())) } - return res + return errs } // parseYamlV3Error attempts to extract position and nice error message from yaml.v3 error message. // yaml.v3 errors are unstructured, use string parsing to extract information. -// If no position can be extracted, returns (Pos{}, error.Error()). -// Else returns (Pos{Line: line_number}, error_message). -func parseYamlV3ErrorString(s string) yamlErrWithPos { - match := regexp.MustCompile(`^(yaml: )?(line (\d+): )?(.*)$`).FindStringSubmatch(s) +// If no position can be extracted, returns error without position. +// Else returns PosError{Pos{Line: line_number}, error_message}. 
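// For example (illustrative):
//
//    "yaml: line 12: mapping values are not allowed in this context"
//      -> PosError{Pos{Line: 12}, errors.New("mapping values are not allowed in this context")}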
+func parseYamlV3ErrorString(s string) error { + match := regexp.MustCompile(`^(yaml: )?(line (\d+): )?((.|\n)*)$`).FindStringSubmatch(s) if match == nil { - return yamlErrWithPos{Pos{}, s} + return errors.New(s) } lns, errMsg := match[3], match[4] ln, _ := strconv.Atoi(lns) // Atoi returns 0 on error, which is fine here - return yamlErrWithPos{Pos{Line: ln}, errMsg} + return PosError{Pos{Line: ln}, errors.New(errMsg)} } diff --git a/pkg/config/yaml_test.go b/pkg/config/yaml_test.go index f65f4d6cc3..46b704fcb8 100644 --- a/pkg/config/yaml_test.go +++ b/pkg/config/yaml_test.go @@ -260,7 +260,7 @@ func TestDictWrongTypeUnmarshalYAML(t *testing.T) { if err == nil { t.Errorf("expected error, got nil") } - if diff := cmp.Diff(err.Error(), "line 2: must be a mapping, got number"); diff != "" { + if diff := cmp.Diff(err.Error(), "line 2 column 1: must be a mapping, got number"); diff != "" { t.Errorf("diff (-want +got):\n%s", diff) } } @@ -337,10 +337,11 @@ b: null c: ~ d: "null" ` + anyNull := cty.NullVal(cty.DynamicPseudoType) want := cty.ObjectVal(map[string]cty.Value{ - "a": cty.NilVal, - "b": cty.NilVal, - "c": cty.NilVal, + "a": anyNull, + "b": anyNull, + "c": anyNull, "d": cty.StringVal("null"), }) diff --git a/pkg/inspect/modules_test.go b/pkg/inspect/modules_test.go index 8e5c1bbba6..e0558c1de6 100644 --- a/pkg/inspect/modules_test.go +++ b/pkg/inspect/modules_test.go @@ -23,6 +23,8 @@ import ( "strings" "testing" + "github.com/hashicorp/hcl/v2/ext/typeexpr" + "github.com/zclconf/go-cty/cty" "golang.org/x/exp/slices" ) @@ -121,7 +123,7 @@ func checkInputType(t *testing.T, mod modInfo, input string, expected string) { t.Errorf("%s does not have input %s", mod.Source, input) } expected = modulereader.NormalizeType(expected) - got := modulereader.NormalizeType(i.Type) + got := typeexpr.TypeString(i.Type) if expected != got { t.Errorf("%s %s has unexpected type expected:\n%#v\ngot:\n%#v", mod.Source, input, expected, got) @@ -148,7 +150,7 @@ func TestNetworkStorage(t *testing.T) { for _, mod := range notEmpty(query(hasInput("network_storage")), t) { i, _ := mod.Input("network_storage") - got := modulereader.NormalizeType(i.Type) + got := typeexpr.TypeString(i.Type) if got != obj && got != lst { t.Errorf("%s `network_storage` has unexpected type expected:\n%#v\nor\n%#v\ngot:\n%#v", mod.Source, obj, lst, got) @@ -189,9 +191,30 @@ func TestMetadataInjectModuleId(t *testing.T) { if !ok { t.Fatalf("has no input %q", gm.InjectModuleId) } - if in.Type != "string" { + if in.Type != cty.String { t.Errorf("%q type is not a string, but %q", gm.InjectModuleId, in.Type) } }) } } + +func TestOutputForbiddenNames(t *testing.T) { + nowhere := []string{} + allowed := map[string][]string{ + // Global blueprint variables we don't want to get overwritten. 
+ "project_id": {"community/modules/project/new-project"}, + "labels": nowhere, + "region": nowhere, + "zone": nowhere, + "deployment_name": nowhere, + } + for _, mod := range query(all()) { + t.Run(mod.Source, func(t *testing.T) { + for _, out := range mod.Outputs { + if where, ok := allowed[out.Name]; ok && !slices.Contains(where, mod.Source) { + t.Errorf("forbidden name for output %q", out.Name) + } + } + }) + } +} diff --git a/pkg/modulereader/hcl_utils.go b/pkg/modulereader/hcl_utils.go index 892f5c35e3..5119aba662 100644 --- a/pkg/modulereader/hcl_utils.go +++ b/pkg/modulereader/hcl_utils.go @@ -61,9 +61,13 @@ func getHCLInfo(source string) (ModuleInfo, error) { var vars []VarInfo var outs []OutputInfo for _, v := range module.Variables { + ty, err := GetCtyType(v.Type) + if err != nil { + return ModuleInfo{}, fmt.Errorf("failed to parse type of variable %q: %w", v.Name, err) + } vInfo := VarInfo{ Name: v.Name, - Type: v.Type, + Type: ty, Description: v.Description, Default: v.Default, Required: v.Required, @@ -82,15 +86,27 @@ func getHCLInfo(source string) (ModuleInfo, error) { return ret, nil } -// Transforms HCL type string into cty.Type -func getCtyType(hclType string) (cty.Type, error) { +// Transforms Terraform type string into cty.Type +func GetCtyType(hclType string) (cty.Type, error) { + if hclType == "" { // treat empty type as `any` + // see https://developer.hashicorp.com/terraform/language/values/variables#type-constraints + return cty.DynamicPseudoType, nil + } expr, diags := hclsyntax.ParseExpression([]byte(hclType), "", hcl.Pos{Line: 1, Column: 1}) if diags.HasErrors() { - return cty.Type{}, diags + return cty.NilType, diags } - typ, diags := typeexpr.TypeConstraint(expr) + + switch hcl.ExprAsKeyword(expr) { + case "list": + return cty.List(cty.DynamicPseudoType), nil + case "map": + return cty.Map(cty.DynamicPseudoType), nil + } + + typ, _, diags := typeexpr.TypeConstraintWithDefaults(expr) if diags.HasErrors() { - return cty.Type{}, diags + return cty.NilType, diags } return typ, nil } @@ -104,7 +120,7 @@ func getCtyType(hclType string) (cty.Type, error) { // // This method is fail-safe, if error arises passed type will be returned without changes. func NormalizeType(hclType string) string { - ctyType, err := getCtyType(hclType) + ctyType, err := GetCtyType(hclType) if err != nil { logging.Error("Failed to parse HCL type='%s', got %v", hclType, err) return hclType diff --git a/pkg/modulereader/hcl_utils_test.go b/pkg/modulereader/hcl_utils_test.go index 94a0123847..13f038a067 100644 --- a/pkg/modulereader/hcl_utils_test.go +++ b/pkg/modulereader/hcl_utils_test.go @@ -16,7 +16,9 @@ package modulereader import ( "os" + "testing" + "github.com/zclconf/go-cty/cty" . 
"gopkg.in/check.v1" ) @@ -50,3 +52,48 @@ func (s *zeroSuite) TestReadHclAtttributes(c *C) { _, err = ReadHclAttributes(fn.Name()) c.Assert(err, NotNil) } + +func TestReplaceTokens(t *testing.T) { + type test struct { + ty string + err bool + want cty.Type + } + tests := []test{ + {"", false, cty.DynamicPseudoType}, + + {"string", false, cty.String}, + + {"list", false, cty.List(cty.DynamicPseudoType)}, + {"list(string)", false, cty.List(cty.String)}, + {"list(any)", false, cty.List(cty.DynamicPseudoType)}, + + {"map", false, cty.Map(cty.DynamicPseudoType)}, + {"map(string)", false, cty.Map(cty.String)}, + {"map(any)", false, cty.Map(cty.DynamicPseudoType)}, + + {`object({sweet=string})`, false, + cty.Object(map[string]cty.Type{"sweet": cty.String})}, + {`object({sweet=optional(string)})`, false, + cty.ObjectWithOptionalAttrs(map[string]cty.Type{"sweet": cty.String}, []string{"sweet"})}, + {`object({sweet=optional(string, "caramel")})`, false, + cty.ObjectWithOptionalAttrs(map[string]cty.Type{"sweet": cty.String}, []string{"sweet"})}, + + {"for", true, cty.NilType}, + } + for _, tc := range tests { + t.Run(tc.ty, func(t *testing.T) { + got, err := GetCtyType(tc.ty) + if tc.err != (err != nil) { + t.Errorf("got unexpected error: %s", err) + } + if err != nil { + return + } + + if !got.Equals(tc.want) { + t.Errorf("\nwant: %#v\ngot: %#v", tc.want, got) + } + }) + } +} diff --git a/pkg/modulereader/metadata.go b/pkg/modulereader/metadata.go index 3748d3872d..310ae74120 100644 --- a/pkg/modulereader/metadata.go +++ b/pkg/modulereader/metadata.go @@ -47,6 +47,8 @@ type MetadataGhpc struct { // Optional, set to the string-typed module variable name. // If set, the blueprint module id will be set as a value of this variable. InjectModuleId string `yaml:"inject_module_id"` + // If set to true, the creation will fail if the module is not used. + HasToBeUsed bool `yaml:"has_to_be_used"` } // GetMetadata reads and parses `metadata.yaml` from module root. 
@@ -54,7 +56,6 @@ type MetadataGhpc struct { func GetMetadata(source string) (Metadata, error) { var err error var data []byte - // TODO: use bpmetadata.UnmarshalMetadata, it performs some additional checks filePath := filepath.Join(source, "metadata.yaml") switch { diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go index 08894b802e..2571d262fd 100644 --- a/pkg/modulereader/metadata_legacy.go +++ b/pkg/modulereader/metadata_legacy.go @@ -41,9 +41,6 @@ func defaultAPIList(source string) []string { // https://console.cloud.google.com/apis/dashboard and // https://console.cloud.google.com/apis/library staticAPIMap := map[string][]string{ - "community/modules/compute/SchedMD-slurm-on-gcp-partition": { - "compute.googleapis.com", - }, "community/modules/compute/htcondor-execute-point": { "compute.googleapis.com", "storage.googleapis.com", @@ -85,11 +82,10 @@ func defaultAPIList(source string) []string { "community/modules/project/service-enablement": { "serviceusage.googleapis.com", }, - "community/modules/scheduler/SchedMD-slurm-on-gcp-controller": { - "compute.googleapis.com", - }, - "community/modules/scheduler/SchedMD-slurm-on-gcp-login-node": { + "community/modules/scheduler/schedmd-slurm-gcp-v6-controller": { "compute.googleapis.com", + "iam.googleapis.com", + "storage.googleapis.com", }, "community/modules/compute/gke-node-pool": { "container.googleapis.com", diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 1390756e66..141157d16f 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -25,13 +25,14 @@ import ( "path" "github.com/hashicorp/go-getter" + "github.com/zclconf/go-cty/cty" "gopkg.in/yaml.v3" ) // VarInfo stores information about a module input variables type VarInfo struct { Name string - Type string + Type cty.Type Description string Default interface{} Required bool @@ -47,6 +48,7 @@ type OutputInfo struct { // UnmarshalYAML supports parsing YAML OutputInfo fields as a simple list of // strings or as a list of maps directly into OutputInfo struct +// TODO: unmarshal logic shouldn't be defined in this package, move to pkg/config func (mo *OutputInfo) UnmarshalYAML(value *yaml.Node) error { var name string const yamlErrorMsg string = "block beginning at line %d: %s" diff --git a/pkg/modulereader/resreader_test.go b/pkg/modulereader/resreader_test.go index a33a17da74..6dd6e9e12a 100644 --- a/pkg/modulereader/resreader_test.go +++ b/pkg/modulereader/resreader_test.go @@ -21,6 +21,7 @@ import ( "path/filepath" "testing" + "github.com/zclconf/go-cty/cty" . 
"gopkg.in/check.v1" "gopkg.in/yaml.v3" ) @@ -88,7 +89,7 @@ func (s *MySuite) TestGetModuleInfo_Embedded(c *C) { c.Check(mi, DeepEquals, ModuleInfo{ Inputs: []VarInfo{{ Name: "test_variable", - Type: "string", + Type: cty.String, Description: "This is just a test", Required: true}}, Outputs: []OutputInfo{{ @@ -136,7 +137,7 @@ func (s *MySuite) TestGetModuleInfo_Local(c *C) { c.Check(mi, DeepEquals, ModuleInfo{ Inputs: []VarInfo{{ Name: "test_variable", - Type: "string", + Type: cty.String, Description: "This is just a test", Required: true}}, Outputs: []OutputInfo{{ @@ -190,7 +191,7 @@ func (s *MySuite) TestGetInfo_TFReder(c *C) { info, err := reader.GetInfo(s.terraformDir) c.Assert(err, IsNil) c.Check(info, DeepEquals, ModuleInfo{ - Inputs: []VarInfo{{Name: "test_variable", Type: "string", Description: "This is just a test", Required: true}}, + Inputs: []VarInfo{{Name: "test_variable", Type: cty.String, Description: "This is just a test", Required: true}}, Outputs: []OutputInfo{{Name: "test_output", Description: "This is just a test"}}, }) @@ -201,7 +202,7 @@ func (s *MySuite) TestGetInfo_PackerReader(c *C) { exp := ModuleInfo{ Inputs: []VarInfo{{ Name: "test_variable", - Type: "string", + Type: cty.String, Description: "This is just a test", Required: true}}} @@ -264,3 +265,8 @@ func (s *zeroSuite) TestUnmarshalOutputInfo(c *C) { y = "{ name: foo, description: bar, sensitive: contingent }" c.Check(yaml.Unmarshal([]byte(y), &oinfo), NotNil) } + +func (s *zeroSuite) TestLegacyMetadata(c *C) { // dummy test for sake of coverage + mi := legacyMetadata("modules/arbuz/velikan") + c.Check(mi.Spec.Requirements.Services, DeepEquals, []string{}) +} diff --git a/pkg/modulewriter/hcl_utils.go b/pkg/modulewriter/hcl_utils.go index 74e0f33cb2..891d421867 100644 --- a/pkg/modulewriter/hcl_utils.go +++ b/pkg/modulewriter/hcl_utils.go @@ -15,9 +15,6 @@ package modulewriter import ( - "fmt" - "path/filepath" - "hpc-toolkit/pkg/config" "github.com/hashicorp/hcl/v2/hclwrite" @@ -26,10 +23,6 @@ import ( // WriteHclAttributes writes tfvars/pkvars.hcl files func WriteHclAttributes(vars map[string]cty.Value, dst string) error { - if err := createBaseFile(dst); err != nil { - return fmt.Errorf("error creating variables file %v: %v", filepath.Base(dst), err) - } - hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() for _, k := range orderKeys(vars) { @@ -37,11 +30,5 @@ func WriteHclAttributes(vars map[string]cty.Value, dst string) error { toks := config.TokensForValue(vars[k]) hclBody.SetAttributeRaw(k, toks) } - - hclBytes := hclFile.Bytes() - err := appendHCLToFile(dst, hclBytes) - if err != nil { - return fmt.Errorf("error writing HCL to %v: %v", filepath.Base(dst), err) - } - return err + return writeHclFile(dst, hclFile) } diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index b662fd32b2..6a1f310607 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -333,10 +333,7 @@ func prepArtifactsDir(artifactsDir string) error { defer f.Close() _, err = f.WriteString(artifactsWarning) - if err != nil { - return err - } - return nil + return err } func writeExpandedBlueprint(depDir string, dc config.DeploymentConfig) error { @@ -366,7 +363,6 @@ func writeDestroyInstructions(w io.Writer, dc config.DeploymentConfig, deploymen } if grp.Kind() == config.PackerKind { packerManifests = append(packerManifests, filepath.Join(grpPath, string(grp.Modules[0].ID), "packer-manifest.json")) - } } diff --git a/pkg/modulewriter/modulewriter_test.go 
b/pkg/modulewriter/modulewriter_test.go index 2c85fe36b6..af4031c75d 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -27,6 +27,8 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" + "github.com/hashicorp/hcl/v2/ext/typeexpr" "github.com/hashicorp/hcl/v2/hclwrite" "github.com/spf13/afero" "github.com/zclconf/go-cty/cty" @@ -56,8 +58,8 @@ func (s *MySuite) getDeploymentConfigForTest() config.DeploymentConfig { Kind: config.TerraformKind, ID: "testModule", Settings: config.NewDict(map[string]cty.Value{ - "deployment_name": cty.NilVal, - "project_id": cty.NilVal, + "deployment_name": cty.NullVal(cty.String), + "project_id": cty.NullVal(cty.String), }), Outputs: []modulereader.OutputInfo{ { @@ -234,89 +236,42 @@ func (s *MySuite) TestRestoreTfState(c *C) { c.Check(err, IsNil) } -func (s *zeroSuite) TestGetTypeTokens(c *C) { - // Success Integer - tok := getTypeTokens(cty.NumberIntVal(-1)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - tok = getTypeTokens(cty.NumberIntVal(0)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - tok = getTypeTokens(cty.NumberIntVal(1)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - // Success Float - tok = getTypeTokens(cty.NumberFloatVal(-99.9)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - tok = getTypeTokens(cty.NumberFloatVal(99.9)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - // Success String - tok = getTypeTokens(cty.StringVal("Lorum")) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("string"))) - - tok = getTypeTokens(cty.StringVal("")) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("string"))) - - // Success Bool - tok = getTypeTokens(cty.BoolVal(true)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("bool"))) - - tok = getTypeTokens(cty.BoolVal(false)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("bool"))) - - // Success tuple - tok = getTypeTokens(cty.TupleVal([]cty.Value{})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("list"))) - - tok = getTypeTokens(cty.TupleVal([]cty.Value{cty.StringVal("Lorum")})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("list"))) - - // Success list - tok = getTypeTokens(cty.ListVal([]cty.Value{cty.StringVal("Lorum")})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("list"))) - - // Success object - tok = getTypeTokens(cty.ObjectVal(map[string]cty.Value{})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) - - val := cty.ObjectVal(map[string]cty.Value{"Lorum": cty.StringVal("Ipsum")}) - tok = getTypeTokens(val) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) - - // Success Map - val = cty.MapVal(map[string]cty.Value{"Lorum": cty.StringVal("Ipsum")}) - tok = getTypeTokens(val) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) - - // Success any - tok = getTypeTokens(cty.NullVal(cty.DynamicPseudoType)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) +func TestGetTypeTokensRelaxed(t *testing.T) { + type test 
struct { + input cty.Type + want string + } + tests := []test{ + {cty.Number, "number"}, + {cty.String, "string"}, + {cty.Bool, "bool"}, + {cty.Tuple([]cty.Type{}), "list(any)"}, + {cty.Tuple([]cty.Type{cty.String}), "list(any)"}, + {cty.List(cty.String), "list(any)"}, + {cty.Object(map[string]cty.Type{}), "any"}, + {cty.Object(map[string]cty.Type{"Lorum": cty.String}), "any"}, + {cty.Map(cty.String), "any"}, + {cty.DynamicPseudoType, "any"}, + } + for _, tc := range tests { + t.Run(typeexpr.TypeString(tc.input), func(t *testing.T) { + got := string(getTypeTokens(relaxVarType(tc.input)).Bytes()) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } } -func (s *MySuite) TestCreateBaseFile(c *C) { +func (s *MySuite) TestWriteHclFile(c *C) { + hclF := hclwrite.NewEmptyFile() + hclF.Body().SetAttributeValue("zebra", cty.NumberIntVal(0)) + // Success baseFilename := "main.tf_TestCreateBaseFile" goodPath := filepath.Join(s.testDir, baseFilename) - err := createBaseFile(goodPath) - c.Assert(err, IsNil) + c.Assert(writeHclFile(goodPath, hclF), IsNil) + fi, err := os.Stat(goodPath) c.Assert(err, IsNil) c.Assert(fi.Name(), Equals, baseFilename) @@ -325,26 +280,11 @@ func (s *MySuite) TestCreateBaseFile(c *C) { b, _ := os.ReadFile(goodPath) c.Assert(strings.Contains(string(b), "Licensed under the Apache License"), Equals, true) + c.Assert(strings.Contains(string(b), "zebra"), Equals, true) // Error: not a correct path fakePath := filepath.Join("not/a/real/dir", "main.tf_TestCreateBaseFile") - err = createBaseFile(fakePath) - c.Assert(err, ErrorMatches, ".* no such file or directory") -} - -func (s *MySuite) TestAppendHCLToFile(c *C) { - // Setup - testFilename := "main.tf_TestAppendHCLToFile" - testPath := filepath.Join(s.testDir, testFilename) - _, err := os.Create(testPath) - c.Assert(err, IsNil) - hclFile := hclwrite.NewEmptyFile() - hclBody := hclFile.Body() - hclBody.SetAttributeValue("dummyAttributeName", cty.NumberIntVal(0)) - - // Success - err = appendHCLToFile(testPath, hclFile.Bytes()) - c.Assert(err, IsNil) + c.Assert(writeHclFile(fakePath, hclF), ErrorMatches, ".* no such file or directory") } func stringExistsInFile(str string, filename string) (bool, error) { @@ -421,7 +361,10 @@ func (s *MySuite) TestWriteOutputs(c *C) { // Success: Outputs added outputList := []modulereader.OutputInfo{ {Name: "output1"}, - {Name: "output2"}, + { + Name: "output2", + Sensitive: true, + }, } moduleWithOutputs := config.Module{Outputs: outputList, ID: "testMod"} testModules = []config.Module{moduleWithOutputs} @@ -437,7 +380,7 @@ func (s *MySuite) TestWriteOutputs(c *C) { // Failure: Bad path err = writeOutputs(testModules, "not/a/real/path") - c.Assert(err, ErrorMatches, "error creating outputs.tf file: .*") + c.Assert(err, ErrorMatches, ".*outputs.tf.*") } @@ -458,7 +401,7 @@ func (s *MySuite) TestWriteVariables(c *C) { // Failure: Bad path err = writeVariables(testVars, noIntergroupVars, "not/a/real/path") - c.Assert(err, ErrorMatches, "error creating variables.tf file: .*") + c.Assert(err, NotNil) // Success, common vars testVars["deployment_name"] = cty.StringVal("test_deployment") @@ -493,8 +436,7 @@ func (s *MySuite) TestWriteProviders(c *C) { c.Assert(exists, Equals, false) // Failure: Bad Path - err = writeProviders(testVars, "not/a/real/path") - c.Assert(err, ErrorMatches, "error creating providers.tf file: .*") + c.Assert(writeProviders(testVars, "not/a/real/path"), NotNil) // Success: All vars testVars["project_id"] = 
cty.StringVal("test_project") @@ -518,44 +460,39 @@ func (s *MySuite) TestWriteDeploymentGroup_PackerWriter(c *C) { deploymentio := deploymentio.GetDeploymentioLocal() testWriter := PackerWriter{} - // No Packer modules - deploymentName := "deployment_TestWriteModuleLevel_PackerWriter" - deploymentDir := filepath.Join(s.testDir, deploymentName) - if err := deploymentio.CreateDirectory(deploymentDir); err != nil { - c.Fatal(err) - } - groupDir := filepath.Join(deploymentDir, "packerGroup") - if err := deploymentio.CreateDirectory(groupDir); err != nil { - c.Fatal(err) - } - moduleDir := filepath.Join(groupDir, "testPackerModule") - if err := deploymentio.CreateDirectory(moduleDir); err != nil { - c.Fatal(err) - } + otherMod := config.Module{ID: "tortoise"} - testPackerModule := config.Module{ + mod := config.Module{ Kind: config.PackerKind, - ID: "testPackerModule", - } - testDeploymentGroup := config.DeploymentGroup{ - Name: "packerGroup", - Modules: []config.Module{testPackerModule}, + ID: "prince", + Settings: config.NewDict(map[string]cty.Value{ + "zebra": cty.StringVal("checker"), // const + "salmon": config.GlobalRef("golf").AsValue(), // var + "bear": config.Reference{Module: otherMod.ID, Name: "rome"}.AsValue(), // IGC + }), } - testDC := config.DeploymentConfig{ + dc := config.DeploymentConfig{ Config: config.Blueprint{ + Vars: config.NewDict(map[string]cty.Value{ + "golf": cty.NumberIntVal(17), + }), DeploymentGroups: []config.DeploymentGroup{ - testDeploymentGroup, + {Name: "bread", Modules: []config.Module{otherMod}}, + {Name: "green", Modules: []config.Module{mod}}, }, }, } - f, err := os.CreateTemp("", "tmpf") - if err != nil { - c.Fatal() + + dir := c.MkDir() + moduleDir := filepath.Join(dir, string(mod.ID)) + if err := deploymentio.CreateDirectory(moduleDir); err != nil { + c.Fatal(err) } - defer os.Remove(f.Name()) - testWriter.writeDeploymentGroup(testDC, 0, groupDir, f) - _, err = os.Stat(filepath.Join(moduleDir, packerAutoVarFilename)) + instructions := new(strings.Builder) + + c.Assert(testWriter.writeDeploymentGroup(dc, 1, dir, instructions), IsNil) + _, err := os.Stat(filepath.Join(moduleDir, packerAutoVarFilename)) c.Assert(err, IsNil) } @@ -568,7 +505,7 @@ func (s *MySuite) TestWritePackerAutoVars(c *C) { // fail writing to a bad path badDestPath := "not/a/real/path" err := writePackerAutovars(vars.Items(), badDestPath) - expErr := fmt.Sprintf("error creating variables file %s:.*", packerAutoVarFilename) + expErr := fmt.Sprintf(".*%s.*", packerAutoVarFilename) c.Assert(err, ErrorMatches, expErr) // success @@ -577,27 +514,6 @@ func (s *MySuite) TestWritePackerAutoVars(c *C) { } -func (s *zeroSuite) TestStringEscape(c *C) { - f := func(s string) string { - toks := config.TokensForValue(cty.StringVal(s)) - return string(toks.Bytes()) - } - // LiteralVariables - c.Check(f(`\((not.var))`), Equals, `"((not.var))"`) - c.Check(f(`abc\((not.var))abc`), Equals, `"abc((not.var))abc"`) - c.Check(f(`abc \((not.var)) abc`), Equals, `"abc ((not.var)) abc"`) - c.Check(f(`abc \((not.var1)) abc \((not.var2)) abc`), Equals, `"abc ((not.var1)) abc ((not.var2)) abc"`) - c.Check(f(`abc \\((escape.backslash))`), Equals, `"abc \\((escape.backslash))"`) - - // BlueprintVariables - c.Check(f(`\$(not.var)`), Equals, `"$(not.var)"`) - c.Check(f(`abc\$(not.var)abc`), Equals, `"abc$(not.var)abc"`) - c.Check(f(`abc \$(not.var) abc`), Equals, `"abc $(not.var) abc"`) - c.Check(f(`abc \$(not.var1) abc \$(not.var2) abc`), Equals, `"abc $(not.var1) abc $(not.var2) abc"`) - c.Check(f(`abc 
\\$(escape.backslash)`), Equals, `"abc \\$(escape.backslash)"`) - -} - func (s *zeroSuite) TestDeploymentSource(c *C) { { // git m := config.Module{Kind: config.TerraformKind, Source: "github.com/x/y.git"} @@ -662,12 +578,13 @@ func (s *zeroSuite) TestSubstituteIgcReferencesInModule(c *C) { config.MustParseExpression(`module.golf.red + 6 + module.golf.green`).AsValue(), config.MustParseExpression(`module.tennis.brown`).AsValue(), })) - m := SubstituteIgcReferencesInModule( + m, err := SubstituteIgcReferencesInModule( config.Module{Settings: d}, map[config.Reference]modulereader.VarInfo{ config.ModuleRef("golf", "red"): {Name: "pink"}, config.ModuleRef("golf", "green"): {Name: "lime"}, }) + c.Assert(err, IsNil) c.Check(m.Settings.Items(), DeepEquals, map[string]cty.Value{"fold": cty.TupleVal([]cty.Value{ cty.StringVal("zebra"), config.MustParseExpression(`var.pink + 6 + var.lime`).AsValue(), diff --git a/pkg/modulewriter/packerwriter.go b/pkg/modulewriter/packerwriter.go index d45fcdd767..81bd1985ef 100644 --- a/pkg/modulewriter/packerwriter.go +++ b/pkg/modulewriter/packerwriter.go @@ -61,21 +61,14 @@ func (w PackerWriter) writeDeploymentGroup( instructionsFile io.Writer, ) error { depGroup := dc.Config.DeploymentGroups[grpIdx] - igcInputs := map[string]bool{} for _, mod := range depGroup.Modules { pure := config.Dict{} for setting, v := range mod.Settings.Items() { - igcRefs := config.FindIntergroupReferences(v, mod, dc.Config) - if len(igcRefs) == 0 { + if len(config.FindIntergroupReferences(v, mod, dc.Config)) == 0 { pure.Set(setting, v) } - for _, r := range igcRefs { - n := config.AutomaticOutputName(r.Name, r.Module) - igcInputs[n] = true - } } - av, err := pure.Eval(dc.Config) if err != nil { return err diff --git a/pkg/modulewriter/tfversions.go b/pkg/modulewriter/tfversions.go deleted file mode 100644 index 6c1992034b..0000000000 --- a/pkg/modulewriter/tfversions.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2022 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package modulewriter - -const tfversions string = ` -terraform { - required_version = ">= 1.2" - - required_providers { - google = { - source = "hashicorp/google" - version = "~> 4.84.0" - } - google-beta = { - source = "hashicorp/google-beta" - version = "~> 4.84.0" - } - } -} -` diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index 3db6713d6d..22da6c2114 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -42,26 +42,17 @@ const ( // TFWriter writes terraform to the blueprint folder type TFWriter struct{} -// createBaseFile creates a baseline file for all terraform/hcl including a -// license and any other boilerplate -func createBaseFile(path string) error { - baseFile, err := os.Create(path) +func writeHclFile(path string, hclFile *hclwrite.File) error { + f, err := os.Create(path) if err != nil { - return err + return fmt.Errorf("error writing %q: %v", path, err) } - defer baseFile.Close() - _, err = baseFile.WriteString(license) - return err -} - -func appendHCLToFile(path string, hclBytes []byte) error { - file, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644) - if err != nil { - return err + defer f.Close() + if _, err := f.WriteString(license); err != nil { + return fmt.Errorf("error writing %q: %v", path, err) } - defer file.Close() - if _, err = file.Write(hclBytes); err != nil { - return err + if _, err := f.Write(hclwrite.Format(hclFile.Bytes())); err != nil { + return fmt.Errorf("error writing %q: %v", path, err) } return nil } @@ -101,16 +92,7 @@ func writeOutputs( if len(outputs) == 0 { return nil } - hclBytes := hclFile.Bytes() - outputsPath := filepath.Join(dst, "outputs.tf") - if err := createBaseFile(outputsPath); err != nil { - return fmt.Errorf("error creating outputs.tf file: %v", err) - } - err := appendHCLToFile(outputsPath, hclBytes) - if err != nil { - return fmt.Errorf("error writing HCL to outputs.tf file: %v", err) - } - return nil + return writeHclFile(filepath.Join(dst, "outputs.tf"), hclFile) } func writeTfvars(vars map[string]cty.Value, dst string) error { @@ -120,36 +102,28 @@ func writeTfvars(vars map[string]cty.Value, dst string) error { return err } -func getHclType(t cty.Type) string { +func relaxVarType(t cty.Type) cty.Type { if t.IsPrimitiveType() { - return typeexpr.TypeString(t) + return t } if t.IsListType() || t.IsTupleType() || t.IsSetType() { - return "list" + return cty.List(cty.DynamicPseudoType) // list of any } - return typeexpr.TypeString(cty.DynamicPseudoType) // any + return cty.DynamicPseudoType // any } -func getTypeTokens(v cty.Value) hclwrite.Tokens { - return simpleTokens(getHclType(v.Type())) +func getTypeTokens(ty cty.Type) hclwrite.Tokens { + return simpleTokens(typeexpr.TypeString(ty)) } func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, dst string) error { - // Create file - variablesPath := filepath.Join(dst, "variables.tf") - if err := createBaseFile(variablesPath); err != nil { - return fmt.Errorf("error creating variables.tf file: %v", err) - } - var inputs []modulereader.VarInfo for k, v := range vars { - typeStr := getHclType(v.Type()) - newInput := modulereader.VarInfo{ + inputs = append(inputs, modulereader.VarInfo{ Name: k, - Type: typeStr, + Type: relaxVarType(v.Type()), Description: fmt.Sprintf("Toolkit deployment variable: %s", k), - } - inputs = append(inputs, newInput) + }) } inputs = append(inputs, extraVars...) 
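// e.g. a deployment variable holding a list value is emitted roughly as (illustrative):
//
//    variable "allow" {
//      description = "Toolkit deployment variable: allow"
//      type        = list(any)
//    }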
slices.SortFunc(inputs, func(i, j modulereader.VarInfo) int { return strings.Compare(i.Name, j.Name) }) @@ -164,14 +138,10 @@ func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, hclBlock := hclBody.AppendNewBlock("variable", []string{k.Name}) blockBody := hclBlock.Body() blockBody.SetAttributeValue("description", cty.StringVal(k.Description)) - blockBody.SetAttributeRaw("type", simpleTokens(k.Type)) + blockBody.SetAttributeRaw("type", getTypeTokens(k.Type)) } - // Write file - if err := appendHCLToFile(variablesPath, hclFile.Bytes()); err != nil { - return fmt.Errorf("error writing HCL to variables.tf file: %v", err) - } - return nil + return writeHclFile(filepath.Join(dst, "variables.tf"), hclFile) } func writeMain( @@ -179,13 +149,6 @@ func writeMain( tfBackend config.TerraformBackend, dst string, ) error { - // Create file - mainPath := filepath.Join(dst, "main.tf") - if err := createBaseFile(mainPath); err != nil { - return fmt.Errorf("error creating main.tf file: %v", err) - } - - // Create HCL Body hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() @@ -220,25 +183,13 @@ func writeMain( moduleBody.SetAttributeRaw(setting, config.TokensForValue(value)) } } - // Write file - hclBytes := hclFile.Bytes() - hclBytes = hclwrite.Format(hclBytes) - if err := appendHCLToFile(mainPath, hclBytes); err != nil { - return fmt.Errorf("error writing HCL to main.tf file: %v", err) - } - return nil + + return writeHclFile(filepath.Join(dst, "main.tf"), hclFile) } var simpleTokens = hclwrite.TokensForIdentifier func writeProviders(vars map[string]cty.Value, dst string) error { - // Create file - providersPath := filepath.Join(dst, "providers.tf") - if err := createBaseFile(providersPath); err != nil { - return fmt.Errorf("error creating providers.tf file: %v", err) - } - - // Create HCL Body hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() @@ -256,26 +207,36 @@ func writeProviders(vars map[string]cty.Value, dst string) error { provBody.SetAttributeRaw("region", simpleTokens("var.region")) } } - - // Write file - hclBytes := hclFile.Bytes() - if err := appendHCLToFile(providersPath, hclBytes); err != nil { - return fmt.Errorf("error writing HCL to providers.tf file: %v", err) - } - return nil + return writeHclFile(filepath.Join(dst, "providers.tf"), hclFile) } func writeVersions(dst string) error { - // Create file - versionsPath := filepath.Join(dst, "versions.tf") - if err := createBaseFile(versionsPath); err != nil { - return fmt.Errorf("error creating versions.tf file: %v", err) + f := hclwrite.NewEmptyFile() + body := f.Body() + body.AppendNewline() + tfb := body.AppendNewBlock("terraform", []string{}).Body() + tfb.SetAttributeValue("required_version", cty.StringVal(">= 1.2")) + tfb.AppendNewline() + + type provider struct { + alias string + source string + version string } - // Write hard-coded version information - if err := appendHCLToFile(versionsPath, []byte(tfversions)); err != nil { - return fmt.Errorf("error writing HCL to versions.tf file: %v", err) + providers := []provider{ + {"google", "hashicorp/google", "~> 4.84.0"}, + {"google-beta", "hashicorp/google-beta", "~> 4.84.0"}, } - return nil + + pb := tfb.AppendNewBlock("required_providers", []string{}).Body() + + for _, p := range providers { + pb.SetAttributeValue(p.alias, cty.ObjectVal(map[string]cty.Value{ + "source": cty.StringVal(p.source), + "version": cty.StringVal(p.version), + })) + } + return writeHclFile(filepath.Join(dst, "versions.tf"), f) } func writeTerraformInstructions(w 
io.Writer, grpPath string, n config.GroupName, printExportOutputs bool, printImportInputs bool) { @@ -310,7 +271,10 @@ func (w TFWriter) writeDeploymentGroup( } // Write main.tf file - doctoredModules := substituteIgcReferences(g.Modules, intergroupVars) + doctoredModules, err := substituteIgcReferences(g.Modules, intergroupVars) + if err != nil { + return fmt.Errorf("error substituting intergroup references in deployment group %s: %w", g.Name, err) + } if err := writeMain(doctoredModules, g.TerraformBackend, groupPath); err != nil { return fmt.Errorf("error writing main.tf file for deployment group %s: %w", g.Name, err) } @@ -385,56 +349,58 @@ func orderKeys[T any](settings map[string]T) []string { } func getUsedDeploymentVars(group config.DeploymentGroup, bp config.Blueprint) map[string]cty.Value { - // labels must always be written as a variable as it is implicitly added - groupInputs := map[string]bool{ - "labels": true, + res := map[string]cty.Value{ + // labels must always be written as a variable as it is implicitly added + "labels": bp.Vars.Get("labels"), } - for _, mod := range group.Modules { for _, v := range config.GetUsedDeploymentVars(mod.Settings.AsObject()) { - groupInputs[v] = true - } - } - - filteredVars := make(map[string]cty.Value) - for key, val := range bp.Vars.Items() { - if groupInputs[key] { - filteredVars[key] = val + res[v] = bp.Vars.Get(v) } } - return filteredVars + return res } -func substituteIgcReferences(mods []config.Module, igcRefs map[config.Reference]modulereader.VarInfo) []config.Module { +func substituteIgcReferences(mods []config.Module, igcRefs map[config.Reference]modulereader.VarInfo) ([]config.Module, error) { doctoredMods := make([]config.Module, len(mods)) for i, mod := range mods { - doctoredMods[i] = SubstituteIgcReferencesInModule(mod, igcRefs) + dm, err := SubstituteIgcReferencesInModule(mod, igcRefs) + if err != nil { + return nil, err + } + doctoredMods[i] = dm } - return doctoredMods + return doctoredMods, nil } // SubstituteIgcReferencesInModule updates expressions in Module settings to use // special IGC var name instead of the module reference -func SubstituteIgcReferencesInModule(mod config.Module, igcRefs map[config.Reference]modulereader.VarInfo) config.Module { - v, _ := cty.Transform(mod.Settings.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { +func SubstituteIgcReferencesInModule(mod config.Module, igcRefs map[config.Reference]modulereader.VarInfo) (config.Module, error) { + v, err := cty.Transform(mod.Settings.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { e, is := config.IsExpressionValue(v) if !is { return v, nil } - ue := string(e.Tokenize().Bytes()) - for _, r := range e.References() { + refs := e.References() + for _, r := range refs { oi, exists := igcRefs[r] if !exists { continue } - s := fmt.Sprintf("module.%s.%s", r.Module, r.Name) - rs := fmt.Sprintf("var.%s", oi.Name) - ue = strings.ReplaceAll(ue, s, rs) + old := r.AsExpression() + new := config.GlobalRef(oi.Name).AsExpression() + var err error + if e, err = config.ReplaceSubExpressions(e, old, new); err != nil { + return cty.NilVal, err + } } - return config.MustParseExpression(ue).AsValue(), nil + return e.AsValue(), nil }) + if err != nil { + return config.Module{}, err + } mod.Settings = config.NewDict(v.AsValueMap()) - return mod + return mod, nil } // FindIntergroupVariables returns all unique intergroup references made by @@ -446,7 +412,7 @@ func FindIntergroupVariables(group config.DeploymentGroup, bp config.Blueprint) n := 
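
SubstituteIgcReferencesInModule now walks the module settings with cty.Transform and swaps sub-expressions structurally via config.ReplaceSubExpressions instead of doing a textual ReplaceAll on the tokenized expression, and the transform error is finally propagated rather than discarded. A minimal sketch of just the traversal primitive it relies on, with plain string leaves standing in for the toolkit's expression values:

package main

import (
	"fmt"
	"strings"

	"github.com/zclconf/go-cty/cty"
)

func main() {
	settings := cty.ObjectVal(map[string]cty.Value{
		"subnetwork": cty.StringVal("module.network.subnetwork_name"),
		"node_count": cty.NumberIntVal(2),
		"labels":     cty.ListVal([]cty.Value{cty.StringVal("module.network.id")}),
	})

	// cty.Transform visits every value in the tree and lets the callback
	// return a replacement; a non-nil error aborts the walk and is returned,
	// which is what lets the rewritten function report failures to its caller.
	out, err := cty.Transform(settings, func(p cty.Path, v cty.Value) (cty.Value, error) {
		if v.Type().Equals(cty.String) && !v.IsNull() {
			return cty.StringVal(strings.ReplaceAll(v.AsString(), "module.network", "var.network")), nil
		}
		return v, nil
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(out.GoString())
}
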
config.AutomaticOutputName(r.Name, r.Module) res[r] = modulereader.VarInfo{ Name: n, - Type: getHclType(cty.DynamicPseudoType), + Type: cty.DynamicPseudoType, Description: "Automatically generated input from previous groups (ghpc import-inputs --help)", Required: true, } diff --git a/pkg/shell/terraform.go b/pkg/shell/terraform.go index 2d51a078b7..a06a4b0a93 100644 --- a/pkg/shell/terraform.go +++ b/pkg/shell/terraform.go @@ -413,7 +413,10 @@ func ImportInputs(deploymentGroupDir string, artifactsDir string, expandedBluepr } igcVars := modulewriter.FindIntergroupVariables(g, bp) - newModule := modulewriter.SubstituteIgcReferencesInModule(config.Module{Settings: intergroupSettings}, igcVars) + newModule, err := modulewriter.SubstituteIgcReferencesInModule(config.Module{Settings: intergroupSettings}, igcVars) + if err != nil { + return err + } if err := mergeMapsWithoutLoss(inputs, bp.Vars.Items()); err != nil { return err diff --git a/pkg/validators/cloud.go b/pkg/validators/cloud.go index 957e034d7a..78a0585f2b 100644 --- a/pkg/validators/cloud.go +++ b/pkg/validators/cloud.go @@ -42,9 +42,9 @@ func getErrorReason(err googleapi.Error) (string, map[string]interface{}) { } func newDisabledServiceError(title string, name string, pid string) error { - return hint( - fmt.Errorf("%s service is disabled in project %s", title, pid), - fmt.Sprintf("%s can be enabled at https://console.cloud.google.com/apis/library/%s?project=%s", title, name, pid)) + return config.HintError{ + Hint: fmt.Sprintf("%s can be enabled at https://console.cloud.google.com/apis/library/%s?project=%s", title, name, pid), + Err: fmt.Errorf("%s service is disabled in project %s", title, pid)} } func handleServiceUsageError(err error, pid string) error { @@ -188,12 +188,11 @@ func testApisEnabled(bp config.Blueprint, inputs config.Dict) error { return err } apis := map[string]bool{} - bp.WalkModules(func(m *config.Module) error { + bp.WalkModulesSafe(func(_ config.ModulePath, m *config.Module) { services := m.InfoOrDie().Metadata.Spec.Requirements.Services for _, api := range services { apis[api] = true } - return nil }) return TestApisEnabled(p, maps.Keys(apis)) } diff --git a/pkg/validators/semantic.go b/pkg/validators/semantic.go index c0192465a1..b6fc18eae1 100644 --- a/pkg/validators/semantic.go +++ b/pkg/validators/semantic.go @@ -26,19 +26,14 @@ func testModuleNotUsed(bp config.Blueprint, inputs config.Dict) error { return err } errs := config.Errors{} - for ig, g := range bp.DeploymentGroups { - for im, m := range g.Modules { - ums := m.ListUnusedModules() - p := config.Root.Groups.At(ig).Modules.At(im).Use - - for iu, u := range m.Use { - if slices.Contains(ums, u) { - errs.At(p.At(iu), fmt.Errorf(unusedModuleMsg, m.ID, u)) - } + bp.WalkModulesSafe(func(p config.ModulePath, m *config.Module) { + ums := m.ListUnusedModules() + for iu, u := range m.Use { + if slices.Contains(ums, u) { + errs.At(p.Use.At(iu), fmt.Errorf(unusedModuleMsg, m.ID, u)) } } - } - + }) return errs.OrNil() } diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index d0076060ab..0b56f76660 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -15,6 +15,7 @@ package validators import ( + "errors" "fmt" "hpc-toolkit/pkg/config" "strings" @@ -27,21 +28,17 @@ const regionError = "region %s is not available in project ID %s or your credent const zoneError = "zone %s is not available in project ID %s or your credentials do not have permission to access it" const zoneInRegionError = "zone %s is not in region 
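
testApisEnabled above switches from WalkModules, whose callback had to return an error it never used, to WalkModulesSafe, whose callback cannot fail. The toy below mirrors only the shape of that visitor; the real Blueprint, Module, and ModulePath types live in the toolkit's pkg/config and differ in detail:

package main

import "fmt"

// Toy stand-ins for config.Blueprint and config.Module, only to show the
// shape of the new visitor.
type module struct {
	id       string
	services []string
}

type blueprint struct {
	groups [][]module
}

// walkModulesSafe gives the callback no way to fail, so read-only traversals
// (such as collecting required service APIs) need no error plumbing, unlike
// the old WalkModules(func(*Module) error) form.
func (bp blueprint) walkModulesSafe(fn func(path string, m *module)) {
	for gi := range bp.groups {
		for mi := range bp.groups[gi] {
			fn(fmt.Sprintf("groups[%d].modules[%d]", gi, mi), &bp.groups[gi][mi])
		}
	}
}

func main() {
	bp := blueprint{groups: [][]module{{
		{id: "network", services: []string{"compute.googleapis.com"}},
		{id: "cluster", services: []string{"compute.googleapis.com", "storage.googleapis.com"}},
	}}}

	apis := map[string]bool{}
	bp.walkModulesSafe(func(_ string, m *module) {
		for _, s := range m.services {
			apis[s] = true
		}
	})
	fmt.Println(len(apis), "unique services required") // prints: 2 unique services required
}
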
%s in project ID %s or your credentials do not have permissions to access it" const unusedModuleMsg = "module %q uses module %q, but matching setting and outputs were not found. This may be because the value is set explicitly or set by a prior used module" +const credentialsHint = "load application default credentials following instructions at https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/README.md#supplying-cloud-credentials-to-terraform" + +var ErrNoDefaultCredentials = errors.New("could not find application default credentials") func handleClientError(e error) error { if strings.Contains(e.Error(), "could not find default credentials") { - return hint( - fmt.Errorf("could not find application default credentials"), - "load application default credentials following instructions at https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/README.md#supplying-cloud-credentials-to-terraform") + return config.HintError{Hint: credentialsHint, Err: ErrNoDefaultCredentials} } return e } -// TODO: use HintError trait once its implemented -func hint(err error, h string) error { - return fmt.Errorf("%w\n%s", err, h) -} - const ( testApisEnabledName = "test_apis_enabled" testProjectExistsName = "test_project_exists" @@ -153,13 +150,13 @@ func inputsAsStrings(inputs config.Dict) (map[string]string, error) { // inspect the blueprint for global variables that exist and add an appropriate validators. func defaults(bp config.Blueprint) []config.Validator { projectIDExists := bp.Vars.Has("project_id") - projectRef := config.GlobalRef("project_id").AsExpression().AsValue() + projectRef := config.GlobalRef("project_id").AsValue() regionExists := bp.Vars.Has("region") - regionRef := config.GlobalRef("region").AsExpression().AsValue() + regionRef := config.GlobalRef("region").AsValue() zoneExists := bp.Vars.Has("zone") - zoneRef := config.GlobalRef("zone").AsExpression().AsValue() + zoneRef := config.GlobalRef("zone").AsValue() defaults := []config.Validator{ {Validator: testModuleNotUsedName}, diff --git a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go index d9389f19a6..cbceb71f04 100644 --- a/pkg/validators/validators_test.go +++ b/pkg/validators/validators_test.go @@ -32,37 +32,39 @@ func Test(t *testing.T) { } func (s *MySuite) TestCheckInputs(c *C) { + dummy := cty.NullVal(cty.String) + { // OK: Inputs is equal to required inputs without regard to ordering i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal}) + "in0": dummy, + "in1": dummy}) c.Check(checkInputs(i, []string{"in0", "in1"}), IsNil) c.Check(checkInputs(i, []string{"in1", "in0"}), IsNil) } { // FAIL: inputs are a proper subset of required inputs i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal}) + "in0": dummy, + "in1": dummy}) err := checkInputs(i, []string{"in0", "in1", "in2"}) c.Check(err, NotNil) } { // FAIL: inputs intersect with required inputs but are not a proper subset i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal, - "in3": cty.NilVal}) + "in0": dummy, + "in1": dummy, + "in3": dummy}) err := checkInputs(i, []string{"in0", "in1", "in2"}) c.Check(err, NotNil) } { // FAIL inputs are a proper superset of required inputs i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal, - "in2": cty.NilVal, - "in3": cty.NilVal}) + "in0": dummy, + "in1": dummy, + "in2": dummy, + "in3": dummy}) err := checkInputs(i, []string{"in0", "in1", "in2"}) c.Check(err, ErrorMatches, "only 3 
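
handleClientError now returns the structured config.HintError together with the exported sentinel ErrNoDefaultCredentials, so callers can branch on the cause with errors.Is instead of parsing the message that the old hint() helper glued together. A minimal sketch, assuming HintError unwraps to its inner error roughly as below (the actual implementation lives in pkg/config and may differ):

package main

import (
	"errors"
	"fmt"
	"strings"
)

// hintError mirrors the shape of config.HintError in the diff: an error plus
// a human-readable hint. Unwrap lets errors.Is see through to the sentinel.
type hintError struct {
	Hint string
	Err  error
}

func (e hintError) Error() string { return fmt.Sprintf("%v\n%s", e.Err, e.Hint) }
func (e hintError) Unwrap() error { return e.Err }

var errNoDefaultCredentials = errors.New("could not find application default credentials")

func handleClientError(e error) error {
	if strings.Contains(e.Error(), "could not find default credentials") {
		return hintError{
			Hint: "load application default credentials (see the Toolkit README)",
			Err:  errNoDefaultCredentials,
		}
	}
	return e
}

func main() {
	err := handleClientError(errors.New("oauth2: could not find default credentials"))
	fmt.Println(errors.Is(err, errNoDefaultCredentials)) // true: callers match the cause, not the text
	fmt.Println(err)                                     // message plus hint
}
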
inputs \\[in0 in1 in2\\] should be provided") } @@ -73,9 +75,9 @@ func (s *MySuite) TestDefaultValidators(c *C) { unusedVars := config.Validator{Validator: "test_deployment_variable_not_used"} apisEnabled := config.Validator{Validator: "test_apis_enabled"} - projectRef := config.GlobalRef("project_id").AsExpression().AsValue() - regionRef := config.GlobalRef("region").AsExpression().AsValue() - zoneRef := config.GlobalRef("zone").AsExpression().AsValue() + projectRef := config.GlobalRef("project_id").AsValue() + regionRef := config.GlobalRef("region").AsValue() + zoneRef := config.GlobalRef("zone").AsValue() projectExists := config.Validator{ Validator: testProjectExistsName, diff --git a/tools/cloud-build/README.md b/tools/cloud-build/README.md index e55f5d77e7..468dcb96a2 100644 --- a/tools/cloud-build/README.md +++ b/tools/cloud-build/README.md @@ -11,9 +11,10 @@ * `Dockerfile`: Defines the HPC Toolkit docker image used in testing. * `hpc-toolkit-builder.yaml`: Cloud build config for running regular builds of the HPC Toolkit docker image. -* `hpc-toolkit-pr-validation.yaml`: Cloud build config for the PR validition +* `hpc-toolkit-pr-validation.yaml`: Cloud build config for the PR validation tests. The PR validation run `make tests` and validates against all pre-commits on all files. +* `pr-ofe.yaml`: Cloud build config for sanity test installing the OFE virtual environment. * `project-cleanup.yaml`: Cloud build config that performs a regular cleanup of resources in the test project. * `provision`: Terraform module that sets up CloudBuild triggers and schedule. diff --git a/tools/cloud-build/babysit_tests.py b/tools/cloud-build/babysit_tests.py index 13edb7405d..25fa7267dc 100755 --- a/tools/cloud-build/babysit_tests.py +++ b/tools/cloud-build/babysit_tests.py @@ -76,6 +76,9 @@ def selector(build: Build) -> bool: "PR-test-hpc-slurm-chromedesktop", "PR-test-lustre-slurm", ]), + "slurm6": selector_by_name([ + "PR-test-slurm-gcp-v6-tpu", + ]), "spack": selector_by_name([ "PR-test-batch-mpi", "PR-test-spack-gromacs", diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml index af3383d517..3245b2a0f5 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml @@ -49,7 +49,7 @@ set -e -o pipefail gcloud compute images delete --project={{ project }} --quiet $(jq -r '.builds[-1].artifact_id' packer-manifest.json | cut -d ":" -f2) args: - chdir: "{{ workspace }}/{{ deployment_name }}/packer/custom-image" + chdir: "{{ workspace }}/{{ deployment_name }}/{{ packer_group_name }}/{{ packer_module_id }}" executable: /bin/bash - name: Trigger Cloud Build failure when: ghpc_destroy.failed or image_deletion.failed diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index e26ef26c35..69d04de793 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -199,6 +199,9 @@ executable: /bin/bash changed_when: False register: initial_node_count + until: initial_node_count.rc == 0 + retries: 60 + delay: 15 - name: Run Integration tests for HPC toolkit ansible.builtin.include_tasks: "{{ test }}" diff --git 
a/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml index 6e845bd9de..4b0084e628 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml @@ -44,6 +44,8 @@ - name: Run JAX verification register: jax_status failed_when: jax_status.rc != 0 + retries: 3 + delay: 100 ansible.builtin.command: | srun -N 1 -p tpu bash -c ' pip install --upgrade 'jax[tpu]>0.3.0' -f https://storage.googleapis.com/jax-releases/libtpu_releases.html diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml index de257020c9..5e9f3ae18b 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml @@ -28,7 +28,7 @@ - name: Submit batch job register: batch_submission changed_when: batch_submission.rc == 0 - ansible.builtin.command: gcloud alpha batch jobs submit {{ deployment_name }} --config=/home/batch-jobs/cloud-batch-{{ deployment_name }}.json --location={{ cli_deployment_vars.region }} --project={{ custom_vars.project }} + ansible.builtin.command: gcloud alpha batch jobs submit {{ deployment_name }} --config=/home/batch-jobs/cloud-batch-{{ deployment_name }}.yaml --location={{ cli_deployment_vars.region }} --project={{ custom_vars.project }} - name: Wait for job to run changed_when: false ansible.builtin.command: gcloud alpha batch jobs describe {{ deployment_name }} --location={{ cli_deployment_vars.region }} --project={{ custom_vars.project }} diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml index 00d3ca7676..c5c34701a7 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml @@ -19,24 +19,28 @@ vars: vm_name: "{{ remote_node }}" timeout_seconds: 600 + - name: Gather service facts become: true ansible.builtin.service_facts: -- name: Fail if ops agent is not running + +- name: Fail if stackdriver agent is not running ansible.builtin.assert: that: - - ansible_facts.services["google-cloud-ops-agent.service"].status == "enabled" - - ansible_facts.services["google-cloud-ops-agent-fluent-bit.service"].state == "running" - - ansible_facts.services["google-cloud-ops-agent-opentelemetry-collector.service"].state == "running" + - ansible_facts.services["stackdriver-agent"].status == "enabled" + - ansible_facts.services["stackdriver-agent"].state == "running" + - name: Check that monitoring dashboard has been created changed_when: false ansible.builtin.command: gcloud monitoring dashboards list --format="get(displayName)" run_once: true delegate_to: localhost register: dashboards + - name: Print dashboard information ansible.builtin.debug: var: dashboards + - name: Fail if the HPC Dashboard hasn't been created ansible.builtin.fail: msg: Failed to create dashboard diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml index b09d1a49a1..d5e435c04b 100644 --- 
a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml @@ -17,8 +17,8 @@ - name: Include wait for startup script ansible.builtin.include_tasks: "tasks/wait-for-startup-script.yml" vars: - vm_name: "{{ login_node }}" - timeout_seconds: 7200 + vm_name: "{{ image_name }}" + timeout_seconds: 21600 - name: Ensure spack is installed ansible.builtin.command: spack --version changed_when: False diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 3db7e9fa56..10d06b03f2 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-5-9-hpc-rocky-linux-8 + family: slurm-gcp-5-10-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -85,7 +85,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-5-9-ubuntu-2004-lts + # family: slurm-gcp-5-10-ubuntu-2004-lts # project: schedmd-slurm-public # instance_image_custom: true diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml index b7880219bf..3d227e9c69 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml @@ -110,7 +110,7 @@ deployment_groups: - id: wait-centos source: community/modules/scripts/wait-for-startup settings: - instance_name: ((module.workstation-centos.name[0])) + instance_name: $(workstation-centos.name[0]) timeout: 7200 - id: workstation-rocky @@ -128,7 +128,7 @@ deployment_groups: - id: wait-rocky source: community/modules/scripts/wait-for-startup settings: - instance_name: ((module.workstation-rocky.name[0])) + instance_name: $(workstation-rocky.name[0]) timeout: 7200 # - id: workstation-ubuntu @@ -145,5 +145,5 @@ deployment_groups: # - id: wait-ubuntu # source: community/modules/scripts/wait-for-startup # settings: - # instance_name: ((module.workstation-ubuntu.name[0])) + # instance_name: $(workstation-ubuntu.name[0]) # timeout: 7200 diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml deleted file mode 100644 index 5d5d39db71..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: lustre-new-vpc - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: lustre-new-vpc - region: us-west4 - zone: us-west4-c - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - # these runners are no longer necessary, but it is important that we test it still works - # even when added twice - - id: mount-exascaler - source: modules/scripts/startup-script - settings: - runners: - - $(scratchfs.install_ddn_lustre_client_runner) - - $(scratchfs.mount_runner) - - # Create a separate workstation to catch regressions in vm-instance - - id: workstation - source: ./modules/compute/vm-instance - use: - - network1 - - homefs - - mount-exascaler - settings: - name_prefix: test-workstation1 - add_deployment_name_before_prefix: true - machine_type: c2-standard-4 - - - id: wait0 - source: ./community/modules/scripts/wait-for-startup - settings: - instance_name: $(workstation.name[0]) - - # test installing lustre from pre-existing-network-storage - - id: pre-fs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: $(scratchfs.network_storage.server_ip) - remote_mount: $(scratchfs.network_storage.remote_mount) - local_mount: $(scratchfs.network_storage.local_mount) - fs_type: $(scratchfs.network_storage.fs_type) - - - id: mount-exascaler-from-pre-existing - source: modules/scripts/startup-script - settings: - runners: - - $(pre-fs.client_install_runner) - - $(pre-fs.mount_runner) - - - id: install-luster-from-pre-existing - source: modules/compute/vm-instance - use: - - network1 - - mount-exascaler-from-pre-existing - settings: - name_prefix: test-workstation2 - add_deployment_name_before_prefix: false - machine_type: n2-standard-4 - - - id: wait1 - source: ./community/modules/scripts/wait-for-startup - settings: - instance_name: $(install-luster-from-pre-existing.name[0]) - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - scratchfs - settings: - max_node_count: 2 - partition_name: compute - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - scratchfs - - compute_partition - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - scratchfs - - slurm_controller - settings: - login_machine_type: n2-standard-4 diff --git a/tools/cloud-build/daily-tests/blueprints/monitoring.yaml b/tools/cloud-build/daily-tests/blueprints/monitoring.yaml index c36a55425a..4704259c1c 100644 --- a/tools/cloud-build/daily-tests/blueprints/monitoring.yaml +++ b/tools/cloud-build/daily-tests/blueprints/monitoring.yaml @@ -21,6 +21,8 @@ vars: deployment_name: monitoring region: us-central1 zone: us-central1-c + add_deployment_name_before_prefix: true + machine_type: c2-standard-4 deployment_groups: - group: primary @@ -38,24 +40,47 @@ deployment_groups: - id: bucket-for-startup-script source: community/modules/file-system/cloud-storage-bucket - - id: startup + - id: startup-ops source: modules/scripts/startup-script use: [bucket-for-startup-script] settings: install_cloud_ops_agent: true - - id: workstation - source: ./modules/compute/vm-instance + - id: workstation-ops + source: modules/compute/vm-instance use: - network - homefs - - startup + - 
startup-ops settings: - machine_type: c2-standard-4 - metadata: - enable-oslogin: true + name_prefix: workstation-ops + + - id: startup-stack + source: modules/scripts/startup-script + use: [bucket-for-startup-script] + settings: + install_stackdriver_agent: true + + - id: workstation-stack + source: modules/compute/vm-instance + use: + - network + - homefs + - startup-stack + settings: + name_prefix: workstation-stackdriver + + - id: wait0 + source: community/modules/scripts/wait-for-startup + settings: + instance_name: $(workstation-ops.name[0]) + + - id: wait1 + source: community/modules/scripts/wait-for-startup + settings: + instance_name: $(workstation-stack.name[0]) - id: hpc-dash - source: ./modules/monitoring/dashboard + source: modules/monitoring/dashboard settings: title: $(vars.deployment_name) diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index e6a6b589e9..618393c2c9 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -62,7 +62,7 @@ steps: echo ' - id: wait' >> $${SG_EXAMPLE} echo ' source: community/modules/scripts/wait-for-startup' >> $${SG_EXAMPLE} echo ' settings:' >> $${SG_EXAMPLE} - echo ' instance_name: ((module.spack-builder.name[0]))' >> $${SG_EXAMPLE} + echo ' instance_name: $(spack-builder.name[0])' >> $${SG_EXAMPLE} echo ' timeout: 2400' >> $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 16809991aa..a861704b3d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -59,8 +59,8 @@ steps: echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} echo ' zone: us-central1-a' >> $${SG_EXAMPLE} - # avoids conflict if both gke tests are run at the same time - sed -i "s/gke-subnet/gke-storage-subnet/" $${SG_EXAMPLE} + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 5826223fb0..5281446e6a 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -64,6 +64,9 @@ steps: echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke.yml" diff --git a/tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml similarity index 88% rename from tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml rename to tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml index 6d53f54539..d71788e2a6 100644 --- a/tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml +++ 
b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml @@ -13,7 +13,7 @@ # limitations under the License. --- -timeout: 14400s # 4hr +timeout: 5400s # 1.5h steps: ## Test simple golang build - id: build_ghpc @@ -33,8 +33,8 @@ steps: args: - -c - echo "done fetching builder" -## Test Slurm High IO Example (Slurm on GCP v4) -- id: hpc-high-io-v4 + +- id: hpc-build-slurm-image waitFor: ["fetch_builder", "build_ghpc"] name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder entrypoint: /bin/bash @@ -48,6 +48,6 @@ steps: BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-high-io.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml" diff --git a/tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml b/tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml deleted file mode 100644 index 6d73ff0da8..0000000000 --- a/tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" - -## Test DDN Lustre with new VPC -- id: lustre-new-vpc - waitFor: ["fetch_builder", "build_ghpc"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" --extra-vars="@tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml" diff --git a/tools/cloud-build/daily-tests/builds/omnia.yaml b/tools/cloud-build/daily-tests/builds/omnia.yaml deleted file mode 100644 index ac112d3daa..0000000000 --- a/tools/cloud-build/daily-tests/builds/omnia.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" - -## Test Omnia Example -- id: omnia - waitFor: ["fetch_builder", "build_ghpc"] - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - OMNIA_EXAMPLE=community/examples/omnia-cluster.yaml - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/omnia.yml" diff --git a/tools/cloud-build/daily-tests/builds/quantum-circuit.yaml b/tools/cloud-build/daily-tests/builds/quantum-circuit.yaml deleted file mode 100644 index 2d72aee381..0000000000 --- a/tools/cloud-build/daily-tests/builds/quantum-circuit.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" - -# test quantum circuit simulator example (startup-script runs example code after -# compiling libraries) -- id: quantum-circuit - waitFor: ["fetch_builder", "build_ghpc"] - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/qsim.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml deleted file mode 100644 index 3896883092..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" -## Test Slurm v5 HPC Centos7 Example -- id: slurm-gcp-v5-hpc-centos7 - waitFor: ["fetch_builder", "build_ghpc"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml index 30daa86b8f..3139adc87c 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml @@ -36,7 +36,7 @@ steps: - echo "done fetching builder" ## Test Slurm v6 TPU example -- id: slurm-gcp-v6-tpu +- id: slurm6-tpu waitFor: ["fetch_builder", "build_ghpc"] name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder entrypoint: /bin/bash diff --git a/tools/cloud-build/daily-tests/tests/qsim.yml b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml similarity index 65% rename from tools/cloud-build/daily-tests/tests/qsim.yml rename to tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml index e6eca65caa..deb3d3eedc 100644 --- a/tools/cloud-build/daily-tests/tests/qsim.yml +++ b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml @@ -14,11 +14,14 @@ --- -test_name: quantum-circuit -deployment_name: "qsim-{{ build }}" -zone: us-central1-f +test_name: hpc-build-slurm-image +deployment_name: build-slurm-{{ build }} +zone: us-central1-c workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/quantum-circuit-simulator.yaml" +blueprint_yaml: "{{ workspace }}/community/examples/hpc-build-slurm-image.yaml" network: "{{ deployment_name }}-net" -remote_node: "{{ deployment_name }}-0" -post_deploy_tests: [] +packer_group_name: build-slurm +packer_module_id: slurm-custom-image +cli_deployment_vars: + network_name: "{{ network }}" + subnetwork_name: "{{ network }}-sub" diff --git a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml index 9b2b39d64a..2d1596cb0c 100644 --- a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml @@ -22,6 +22,7 @@ zone: europe-west1-d cli_deployment_vars: region: europe-west1 zone: "{{ zone }}" + zones: "[europe-west1-b,europe-west1-c,europe-west1-d]" workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/hpc-enterprise-slurm.yaml" network: "default" diff --git a/tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml b/tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml deleted file mode 100644 index 8daf8bace0..0000000000 --- 
a/tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: lustre-new-vpc -deployment_name: "lustre-new-vpc-{{ build }}" -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml" -network: "{{deployment_name}}-net" -max_nodes: 5 -login_node: "slurm-{{ deployment_name }}-login0" -controller_node: "slurm-{{ deployment_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - mounts: - - /home - - /scratch diff --git a/tools/cloud-build/daily-tests/tests/ml-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-slurm.yml index d003e45429..459ee4a565 100644 --- a/tools/cloud-build/daily-tests/tests/ml-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-slurm.yml @@ -18,3 +18,5 @@ test_name: ml-slurm deployment_name: ml-slurm-{{ build }} workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/ml-slurm.yaml" +packer_group_name: packer +packer_module_id: custom-image diff --git a/tools/cloud-build/daily-tests/tests/monitoring.yml b/tools/cloud-build/daily-tests/tests/monitoring.yml index b459f5f9b5..21866db4c0 100644 --- a/tools/cloud-build/daily-tests/tests/monitoring.yml +++ b/tools/cloud-build/daily-tests/tests/monitoring.yml @@ -20,6 +20,6 @@ zone: us-central1-c workspace: /workspace blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/monitoring.yaml" network: "{{ deployment_name }}-net" -remote_node: "{{ deployment_name }}-0" +remote_node: "{{ deployment_name }}-workstation-stackdriver-0" post_deploy_tests: - test-validation/test-monitoring.yml diff --git a/tools/cloud-build/daily-tests/tests/omnia.yml b/tools/cloud-build/daily-tests/tests/omnia.yml deleted file mode 100644 index dfb3d2039e..0000000000 --- a/tools/cloud-build/daily-tests/tests/omnia.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -test_name: omnia-cluster -deployment_name: "omnia-{{ build }}" -zone: us-west3-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/omnia-cluster.yaml" -network: "default" -remote_node: "*omnia-manager-0" -post_deploy_tests: [] -cli_deployment_vars: - machine_type: "c2-standard-4" diff --git a/tools/cloud-build/daily-tests/tests/packer.yml b/tools/cloud-build/daily-tests/tests/packer.yml index 54fb9ddd28..70dd6c9597 100644 --- a/tools/cloud-build/daily-tests/tests/packer.yml +++ b/tools/cloud-build/daily-tests/tests/packer.yml @@ -20,6 +20,8 @@ zone: us-central1-c workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/image-builder.yaml" network: "{{ deployment_name }}-net" +packer_group_name: packer +packer_module_id: custom-image cli_deployment_vars: network_name: "{{ network }}" subnetwork_name: "{{ network }}-sub" diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml index d356e5f380..c6e271cdc6 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml @@ -21,7 +21,7 @@ deployment_name: "debi-v5-{{ build }}" slurm_cluster_name: "debiv5{{ build[0:4] }}" cli_deployment_vars: - instance_image: "{family: slurm-gcp-5-9-debian-11, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-10-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml deleted file mode 100644 index fe646b74da..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm -deployment_name: "cent-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "centv5{{ build[0:4] }}" -zone: us-west4-c -cli_deployment_vars: - enable_cleanup_compute: true - region: us-west4 - zone: "{{ zone }}" - zones: "[us-west4-a,us-west4-b,us-west4-c]" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" -network: "{{ deployment_name }}-net" -max_nodes: 5 -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. 
-login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml index 8f593332bf..5d26b72fdb 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml @@ -21,7 +21,7 @@ deployment_name: "rock-8-{{ build }}" slurm_cluster_name: "rock8{{ build[0:5] }}" cli_deployment_vars: - instance_image: "{family: slurm-gcp-5-9-hpc-rocky-linux-8, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-10-hpc-rocky-linux-8, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml index b5f0a8655b..b77baf5382 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml @@ -26,13 +26,14 @@ cli_deployment_vars: zone: us-west4-c workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm6.yaml" +blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" network: "{{ deployment_name }}-net" max_nodes: 5 # Note: Pattern matching in gcloud only supports 1 wildcard, a*-login-* won't work. login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" post_deploy_tests: +- test-validation/test-mounts.yml - test-validation/test-partitions.yml custom_vars: partitions: diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml index 8b58588ede..15e0c4d61e 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml @@ -14,7 +14,7 @@ --- -test_name: hpc-slurm6-tpu +test_name: slurm6-tpu deployment_name: "v6-tpu-{{ build }}" # Manually adding the slurm_cluster_name for use in node names, which filters # non-alphanumeric chars and is capped at 10 chars. @@ -27,7 +27,7 @@ cli_deployment_vars: zone: us-central1-b workspace: /workspace blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm6-tpu.yaml" -network: "default" +network: "{{ deployment_name }}-net" max_nodes: 5 # Note: Pattern matching in gcloud only supports 1 wildcard, a*-login-* won't work. login_node: "{{ slurm_cluster_name }}-login-*" diff --git a/tools/cloud-build/daily-tests/tests/spack-gromacs.yml b/tools/cloud-build/daily-tests/tests/spack-gromacs.yml index 412ade50d4..0fd0a596d2 100644 --- a/tools/cloud-build/daily-tests/tests/spack-gromacs.yml +++ b/tools/cloud-build/daily-tests/tests/spack-gromacs.yml @@ -15,14 +15,17 @@ --- test_name: hpc-slurm-gromacs -deployment_name: "spack-gromacs-{{ build }}" +deployment_name: "groma-{{ build }}" +slurm_cluster_name: "groma{{ build[0:5] }}" zone: us-central1-c workspace: /workspace blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-gromacs.yaml" -network: "default" +network: "{{ deployment_name }}-net" max_nodes: 5 -login_node: slurm-{{ deployment_name }}-login0 -controller_node: slurm-{{ deployment_name }}-controller +login_node: "{{ slurm_cluster_name }}-login-*" +# Image name to be used to filter logs from /var/log/messages for startup script. 
+image_name: "slurm-gcp-dev-hpc-rocky-linux-8-*" +controller_node: "{{ slurm_cluster_name }}-controller" post_deploy_tests: - test-validation/test-spack.yml custom_vars: diff --git a/tools/cloud-build/hpc-toolkit-pr-validation.yaml b/tools/cloud-build/hpc-toolkit-pr-validation.yaml index 2b03d031b0..78ce66cc7f 100644 --- a/tools/cloud-build/hpc-toolkit-pr-validation.yaml +++ b/tools/cloud-build/hpc-toolkit-pr-validation.yaml @@ -18,28 +18,8 @@ steps: - id: git-fetch-unshallow name: gcr.io/cloud-builders/git args: ['fetch', '--unshallow'] -- id: pre-commits-setup - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - '-c' - - | - set -e - pre-commit install --install-hooks - time tflint --init -- id: pre-commit-run - waitFor: - - pre-commits-setup - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - '-c' - - SKIP=go-unit-tests pre-commit run --all-files - id: make-tests waitFor: - - pre-commits-setup - git-fetch-unshallow name: >- us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder @@ -50,20 +30,6 @@ steps: set -e export PROJECT=build-project time make tests -- id: ofe-virtual-env - waitFor: - - git-fetch-unshallow - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - '-c' - - | - set -e - python3 -m venv /opt/ofe - source /opt/ofe/bin/activate - pip install --upgrade pip - pip install --dry-run --no-cache-dir -r community/front-end/ofe/requirements.txt timeout: "1200s" options: machineType: N1_HIGHCPU_8 diff --git a/tools/cloud-build/daily-tests/tests/hpc-high-io.yml b/tools/cloud-build/pr-ofe.yaml similarity index 52% rename from tools/cloud-build/daily-tests/tests/hpc-high-io.yml rename to tools/cloud-build/pr-ofe.yaml index 5131d565b9..eea5d375a3 100644 --- a/tools/cloud-build/daily-tests/tests/hpc-high-io.yml +++ b/tools/cloud-build/pr-ofe.yaml @@ -14,23 +14,22 @@ --- -test_name: hpc-slurm-legacy -deployment_name: "hpc-high-io-{{ build }}" -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-legacy.yaml" -network: "default" -max_nodes: 5 -login_node: "slurm-{{ deployment_name }}-login0" -controller_node: "slurm-{{ deployment_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - low_cost - mounts: - - /home - - /scratch - - /projects +steps: +- id: git-fetch-unshallow + name: gcr.io/cloud-builders/git + args: ['fetch', '--unshallow'] +- id: ofe-virtual-env + waitFor: [git-fetch-unshallow] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - '-c' + - | + set -e + python3 -m venv /opt/ofe + source /opt/ofe/bin/activate + pip install --upgrade pip + pip install --dry-run --no-cache-dir -r community/front-end/ofe/requirements.txt +timeout: "1200s" +options: + machineType: N1_HIGHCPU_8 diff --git a/tools/cloud-build/provision/README.md b/tools/cloud-build/provision/README.md index 634c933f22..275d4939b4 100644 --- a/tools/cloud-build/provision/README.md +++ b/tools/cloud-build/provision/README.md @@ -48,6 +48,7 @@ When prompted for project, use integration test project. 
| [google_cloudbuild_trigger.daily_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_go_build_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_ofe_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | +| [google_cloudbuild_trigger.pr_ofe_venv](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_validation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.weekly_build_dependency_check](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml b/tools/cloud-build/provision/pr-ofe.tf similarity index 52% rename from community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml rename to tools/cloud-build/provision/pr-ofe.tf index 4c2f23a8d7..6663f265a6 100644 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml +++ b/tools/cloud-build/provision/pr-ofe.tf @@ -1,4 +1,4 @@ -# Copyright 2023 "Google LLC" +# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,9 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
---- -spec: - requirements: - services: - - compute.googleapis.com +resource "google_cloudbuild_trigger" "pr_ofe_venv" { + name = "PR-ofe-venv" + description = "Sanity test installing the OFE virtual environment" + + filename = "tools/cloud-build/pr-ofe.yaml" + + github { + owner = "GoogleCloudPlatform" + name = "hpc-toolkit" + pull_request { + branch = ".*" + comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" + } + } + include_build_logs = "INCLUDE_BUILD_LOGS_WITH_STATUS" +} diff --git a/tools/enforce_coverage.pl b/tools/enforce_coverage.pl index b4c5c99c21..77e59c2f5b 100755 --- a/tools/enforce_coverage.pl +++ b/tools/enforce_coverage.pl @@ -16,27 +16,29 @@ use strict; use warnings; -# TODO: raise ./cmd min coverage to 80% after tests are written -my $min = 80; -my $cmdmin = 40; -my $shellmin = 0; -my $validatorsmin = 25; -my $failed_coverage = 0; - +my @failed; while (<>){ print $_; - if ( $_ =~ /hpc-toolkit\/cmd.*coverage: (\d+\.\d)%/) { - $failed_coverage++ if ($1 < $cmdmin); - } elsif ( $_ =~ /hpc-toolkit\/pkg\/shell.*coverage: (\d+\.\d)%/) { - $failed_coverage++ if ($1 < $shellmin); - } elsif ( $_ =~ /hpc-toolkit\/pkg\/validators.*coverage: (\d+\.\d)%/) { - $failed_coverage++ if ($1 < $validatorsmin); - } elsif ( $_ =~ /coverage: (\d+\.\d)%/ ) { - $failed_coverage++ if ($1 < $min); + + my @thresholds = qw( + cmd 40 + pkg/shell 0 + pkg/logging 0 + pkg/validators 25 + pkg/inspect 60 + pkg 80 + ); + + while (@thresholds) { + my ($path, $threshold) = splice(@thresholds, 0, 2); + if ( $_ =~ /hpc-toolkit\/$path.*coverage: (\d+\.\d)%/) { + chomp, push @failed, "$_ <= $threshold%\n" if ($1 < $threshold); + last; + } } } -if ($failed_coverage > 0) { - print STDERR "Coverage must be above $cmdmin% for ./cmd and $min% for other packages, $failed_coverage packages were below that.\n"; +if (@failed) { + print STDERR "\nFAILED:\n@failed"; exit 1 } diff --git a/tools/maintenance/maintenance.py b/tools/maintenance/maintenance.py old mode 100644 new mode 100755 diff --git a/tools/validate_configs/golden_copies/configs/igc_tf.yaml b/tools/validate_configs/golden_copies/configs/igc_tf.yaml index ad50266198..fff810f9ba 100644 --- a/tools/validate_configs/golden_copies/configs/igc_tf.yaml +++ b/tools/validate_configs/golden_copies/configs/igc_tf.yaml @@ -19,7 +19,7 @@ vars: project_id: # deployment_name: igc-tf-test region: us-east4 - zone: us-east4-c + zone: $(vars.region)-c deployment_groups: - group: zero diff --git a/tools/validate_configs/golden_copies/configs/merge_flatten.yaml b/tools/validate_configs/golden_copies/configs/merge_flatten.yaml index 590510eb41..5c2a43cb6c 100644 --- a/tools/validate_configs/golden_copies/configs/merge_flatten.yaml +++ b/tools/validate_configs/golden_copies/configs/merge_flatten.yaml @@ -17,8 +17,9 @@ blueprint_name: merge_flatten vars: project_id: # deployment_name: merge_flatten - region: us-east4 - zone: us-east4-c + region_number: 4 + region: us-east$(vars.region_number) + zone: $(vars.region)-c deployment_groups: - group: zero diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 7936ce8934..66b52a3479 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -32,6 +32,7 @@ vars: ghpc_deployment: 
golden_copy_deployment project_id: invalid-project region: us-east4 + region_number: 4 zone: us-east4-c deployment_groups: - group: zero diff --git a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml index c56a226980..06eafcb4bd 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml b/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml index 593442b137..bab5ee5183 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml @@ -24,10 +24,10 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml b/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml index 7ab7513756..6c5164de61 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/test_configs/README.md b/tools/validate_configs/test_configs/README.md index 5849fe3c97..7b18fad427 100644 --- a/tools/validate_configs/test_configs/README.md +++ b/tools/validate_configs/test_configs/README.md @@ -21,9 +21,6 @@ supplied guest accelerators are adding to the VM instances. filestore as a /home directory and a network. 
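The golden-copy configs above swap literal regions and zones for $(vars.*) references, and the expanded-blueprint expectation shows them resolved back to literals at expansion time. A minimal sketch of the interpolation pattern, using the same illustrative values:

vars:
  region_number: 4
  region: us-east$(vars.region_number)   # expands to us-east4
  zone: $(vars.region)-c                 # expands to us-east4-c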
This has been used as a demo blueprint when presenting the toolkit. -**hpc-cluster-high-io-remote-state.yaml**: Creates a cluster with high -performance IO system with all Terraform state stored remotely. - **hpc-cluster-2filestore-4s_instance.yaml**: A slightly more complicated HPC cluster that includes 2 filestore (/home and /shared), two license servers, a head-node and 2 compute vms @@ -32,11 +29,6 @@ head-node and 2 compute vms SLURM partitions and primarily default settings. The blueprint also creates a new VPC network, a filestore instance mounted to `/home` and a workstation VM. -**omnia-cluster-simple.yaml**: Creates a SLURM cluster using -[DellHPC Omnia](https://github.com/dellhpc/omnia). The cluster is comprised of -one manager node and eight compute nodes that share a `/home` mounted filestore -instance. The pre-existing default VPC network is used. - **instance_with_startup.yaml**: Creates a simple cluster with one vm-instance and filestore using the startup-script module to setup and mount the filestore instance. diff --git a/tools/validate_configs/test_configs/apt-collision.yaml b/tools/validate_configs/test_configs/apt-collision.yaml index 0043ea941a..9ab7a7e8a3 100644 --- a/tools/validate_configs/test_configs/apt-collision.yaml +++ b/tools/validate_configs/test_configs/apt-collision.yaml @@ -42,7 +42,7 @@ deployment_groups: kind: terraform id: startup settings: - install_cloud_ops_agent: true + install_stackdriver_agent: true install_ansible: true - source: modules/compute/vm-instance diff --git a/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml b/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml deleted file mode 100644 index 1b28a603a1..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
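The OS-compatibility blueprints above move to the slurm-gcp 5.10 image families and to the image list now hosted under GoogleCloudPlatform/slurm-gcp. The pinning pattern they share, shown as a trimmed sketch (pick any supported family from the linked list):

vars:
  instance_image:
    # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
    family: slurm-gcp-5-10-hpc-rocky-linux-8
    project: schedmd-slurm-public
  instance_image_custom: true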
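Several test configs in this patch (apt-collision.yaml above, ubuntu-ss.yaml further down) also replace the startup-script setting install_cloud_ops_agent with install_stackdriver_agent. A minimal sketch of the updated module block, assuming the surrounding settings are otherwise unchanged:

- id: startup
  source: modules/scripts/startup-script
  settings:
    install_stackdriver_agent: true
    install_ansible: true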
- ---- - -blueprint_name: hpc-cluster-high-io-remote-state - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-io - region: us-central1 - zone: us-central1-a - -terraform_backend_defaults: - type: gcs - configuration: - bucket: a_bucket - impersonate_service_account: a_bucket_reader@project.iam.gserviceaccount.com - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: projectsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /projects - - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - scratchfs - - projectsfs - - network1 - settings: - max_node_count: 200 - partition_name: compute - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - homefs - - scratchfs - - projectsfs - - compute_partition - - network1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - homefs - - scratchfs - - projectsfs - - slurm_controller - - network1 diff --git a/tools/validate_configs/test_configs/hpc-cluster-project.yaml b/tools/validate_configs/test_configs/hpc-cluster-project.yaml deleted file mode 100644 index b900649f2a..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-project.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-project - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-project - region: europe-west4 - zone: europe-west4-a - -terraform_backend_defaults: - type: gcs - configuration: - bucket: a_bucket - impersonate_service_account: a_bucket_reader@project.iam.gserviceaccount.com - -deployment_groups: -- group: onboarding - modules: - - id: project - source: ./community/modules/project/new-project - settings: - project_id: $(vars.project_id) - folder_id: 334688113020 # random number - billing_account: "111110-M2N704-854685" # random billing number - org_id: 123456789 # random org id - - - id: enable-apis - source: ./community/modules/project/service-enablement - use: [project] - settings: - gcp_service_list: - - "file.googleapis.com" - - "compute.googleapis.com" - -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. 
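The hpc-cluster-high-io-remote-state.yaml config removed above exercised remote Terraform state through terraform_backend_defaults; that block remains the general pattern for a GCS backend. A sketch with placeholder values (the bucket and service account names are illustrative only):

terraform_backend_defaults:
  type: gcs
  configuration:
    bucket: my-tf-state-bucket                                                        # placeholder
    impersonate_service_account: tf-state-reader@my-project.iam.gserviceaccount.com  # placeholder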
To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - partition_name: compute - machine_type: n1-standard-2 - enable_placement: false - max_node_count: 20 - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - compute_partition - settings: - login_node_count: 1 - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller diff --git a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml b/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml deleted file mode 100644 index f849d01ef9..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-sa - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm - region: europe-west4 - zone: europe-west4-a - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: '$controller' - remote_mount: /home - local_mount: /home - fs_type: nfs - - - id: service_acct - source: ./community/modules/project/service-account - settings: - project_id: $(vars.project_id) - name: hpc-service-acct - project_roles: - - compute.instanceAdmin.v1 - - - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: [network1] - settings: - partition_name: compute - network_storage: - - $(homefs.network_storage) - - - id: slurm - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: [network1] - settings: - network_storage: - - $(homefs.network_storage) - partition: - - $(compute-partition.partition) - controller_service_account: $(service_acct.service_account_email) diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml index 9c3015c9a8..a6d7001fd5 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml @@ -29,45 +29,49 @@ deployment_groups: source: modules/network/vpc - id: homefs - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server use: [network1] settings: labels: ghpc_role: storage-home - id: slurm-sql - source: ./community/modules/database/slurm-cloudsql-federation + source: community/modules/database/slurm-cloudsql-federation use: 
[network1] settings: sql_instance_name: slurm-sql8 tier: "db-f1-micro" + - id: compute-nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + machine_type: c2-standard-4 + - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - homefs - - network1 + - compute-nodeset settings: partition_name: compute - max_node_count: 20 - machine_type: c2-standard-4 - id: slurm-controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - homefs - compute-partition + - slurm-login - slurm-sql - network1 settings: - login_node_count: 1 - disable_compute_public_ips: true disable_controller_public_ips: true - id: slurm-login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: - - slurm-controller - network1 settings: + name_prefix: login disable_login_public_ips: true diff --git a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml b/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml deleted file mode 100644 index 3da2f9fc07..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-small - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-small - region: europe-west4 - zone: europe-west4-a - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. 
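The hpc-cluster-simple-nfs-sql.yaml migration above shows the slurm-gcp v6 module layout: machine shape and node counts live in a nodeset, the partition wires nodesets together, and the controller takes partitions, the login module, and file systems through use. A trimmed sketch of that composition (the vpc, filestore, and CloudSQL modules from the config above are omitted here):

- id: compute-nodeset
  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
  use: [network1]
  settings:
    node_count_dynamic_max: 20
    machine_type: c2-standard-4

- id: compute-partition
  source: community/modules/compute/schedmd-slurm-gcp-v6-partition
  use: [compute-nodeset]
  settings:
    partition_name: compute

- id: slurm-login
  source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
  use: [network1]
  settings:
    name_prefix: login

- id: slurm-controller
  source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
  use: [network1, compute-partition, slurm-login]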
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: startup - source: modules/scripts/startup-script - settings: - install_ansible: true - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - enable_placement: false - machine_type: n2-standard-4 - cpu_platform: Intel Ice Lake - partition_name: compute - max_node_count: 20 - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - compute_partition - settings: - login_node_count: 1 - controller_startup_script: $(startup.startup_script) - compute_startup_script: $(startup.startup_script) - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller - settings: - login_startup_script: $(startup.startup_script) diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 9b611679e6..026457c949 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-5-9-debian-11 + family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -83,7 +83,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public instance_image_custom: true enable_smt: true @@ -139,6 +139,7 @@ deployment_groups: instance_template: null labels: $(vars.labels) machine_type: n2-standard-16 + maintenance_interval: "" metadata: {} min_cpu_platform: null on_host_maintenance: TERMINATE diff --git a/tools/validate_configs/test_configs/packer.yaml b/tools/validate_configs/test_configs/packer.yaml index 24af11c96d..02c9c125ba 100644 --- a/tools/validate_configs/test_configs/packer.yaml +++ b/tools/validate_configs/test_configs/packer.yaml @@ -20,7 +20,7 @@ vars: project_id: ## Set GCP Project ID Here ## deployment_name: hpc-slurm1 region: europe-west4 - zone: europe-west4-a + zone: $(vars.region)-a network_name: image-builder-net subnetwork_name: image-builder-primary-subnet diff --git a/tools/validate_configs/test_configs/pre-existing-fs.yaml b/tools/validate_configs/test_configs/pre-existing-fs.yaml deleted file mode 100644 index 252ac2a207..0000000000 --- a/tools/validate_configs/test_configs/pre-existing-fs.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
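The removed hpc-cluster-slurm-with-startup.yaml above relied on the toolkit's general output-wiring pattern: a startup-script module exposes startup_script, and other modules consume it through $(id.output) references. A minimal sketch of that wiring against a plain vm-instance (assuming vm-instance's startup_script input, which this patch does not touch):

- id: startup
  source: modules/scripts/startup-script
  settings:
    install_ansible: true

- id: workstation
  source: modules/compute/vm-instance
  use: [network1]
  settings:
    name_prefix: workstation
    machine_type: n2-standard-4
    startup_script: $(startup.startup_script)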
- ---- - -blueprint_name: pre-existing-fs - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: pre-fs-slurm - region: europe-west4 - zone: europe-west4-a - local_mount: /home - network_name: default - -deployment_groups: -- group: storage - modules: - - id: network0 - source: modules/network/pre-existing-vpc - - - id: homefs-filestore - source: modules/file-system/filestore - use: [network0] - -- group: compute - modules: - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: "" # for now, must be completed manually in compute/main.tf - remote_mount: nfsshare - local_mount: $(vars.local_mount) # automatic, added here for clarity - fs_type: nfs - - - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - network1 - settings: - partition_name: compute - - - id: slurm - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - homefs - - compute-partition - - network1 diff --git a/tools/validate_configs/test_configs/slurm-static-test.yaml b/tools/validate_configs/test_configs/slurm-static-test.yaml index 5d5ed3cf4c..7e3adcbb9a 100644 --- a/tools/validate_configs/test_configs/slurm-static-test.yaml +++ b/tools/validate_configs/test_configs/slurm-static-test.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true enable_reconfigure: true diff --git a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml b/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml deleted file mode 100644 index 0dbb3ad57c..0000000000 --- a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
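The removed pre-existing-fs.yaml above split provisioning into a storage group and a compute group, handing the filestore export across the boundary with pre-existing-network-storage. A sketch of that two-group skeleton (server_ip is left blank, as in the removed config, to be filled in once the storage group is deployed):

deployment_groups:
- group: storage
  modules:
  - id: network0
    source: modules/network/pre-existing-vpc
  - id: homefs-filestore
    source: modules/file-system/filestore
    use: [network0]

- group: compute
  modules:
  - id: network1
    source: modules/network/pre-existing-vpc
  - id: homefs
    source: modules/file-system/pre-existing-network-storage
    settings:
      server_ip: ""          # fill in with the filestore IP after deploying the storage group
      remote_mount: nfsshare
      local_mount: /home
      fs_type: nfs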
- ---- - -blueprint_name: hpc-slurm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm - region: europe-west4 - zone: europe-west4-a - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: workstation - source: modules/compute/vm-instance - use: - - network1 - - homefs - settings: - name_prefix: workstation - machine_type: e2-standard-8 - - - id: compute-partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - network1 - settings: - partition_name: compute - - - id: debug-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - network1 - settings: - partition_name: debug - - - id: slurm - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - homefs - - compute-partition - - debug-partition - - network1 diff --git a/tools/validate_configs/test_configs/test-gcs-fuse.yaml b/tools/validate_configs/test_configs/test-gcs-fuse.yaml deleted file mode 100644 index 40d1228d1e..0000000000 --- a/tools/validate_configs/test_configs/test-gcs-fuse.yaml +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
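The removed test-gcs-fuse.yaml, whose body follows, mounted GCS buckets through pre-existing-network-storage with fs_type: gcsfuse. That mount pattern is independent of the removed SchedMD-slurm-on-gcp modules; a sketch (the bucket name and module id are illustrative only):

- id: data-bucket
  source: modules/file-system/pre-existing-network-storage
  settings:
    remote_mount: my-example-bucket        # GCS bucket name, placeholder
    local_mount: /data
    fs_type: gcsfuse
    mount_options: implicit_dirs,defaults,allow_other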
- ---- - -blueprint_name: gcs-fuse - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: gcs-fuse - region: us-central1 - zone: us-central1-c - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: gcs - source: ./modules/file-system/pre-existing-network-storage - settings: - remote_mount: hpc-toolkit-service-catalog-solutions - local_mount: /catalog - fs_type: gcsfuse - - - id: gcs2 - source: ./modules/file-system/pre-existing-network-storage - settings: - server_ip: foobar - remote_mount: gs://hpc-toolkit-demo-tf-state - local_mount: /tfstate - fs_type: gcsfuse - mount_options: implicit_dirs,defaults,allow_other - -# find images with: gcloud compute images list - - id: compute-hpc-image - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: hpc-image - - - id: compute-ubuntu2204 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: ubuntu2204 - instance_image: - family: ubuntu-2204-lts - project: ubuntu-os-cloud - - - id: compute-ubuntu2004 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: ubuntu2004 - instance_image: - family: ubuntu-2004-lts - project: ubuntu-os-cloud - - - id: compute-debian - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: debian11 - instance_image: - family: debian-11 - project: debian-cloud - - - id: centos08 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: centos08 - instance_image: - family: centos-stream-9 - project: centos-cloud - - - id: centos07 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: centos07 - instance_image: - family: centos-7 - project: centos-cloud - - - id: rocky - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: rocky-linux-8 - instance_image: - family: rocky-linux-8 - project: rocky-linux-cloud - - - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - gcs - - gcs2 - - network1 - settings: - partition_name: compute - machine_type: n2-standard-4 - - - id: slurm-controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - gcs - - gcs2 - - compute-partition - - network1 - settings: - login_node_count: 1 - compute_node_scopes: - - https://www.googleapis.com/auth/cloud-platform - - https://www.googleapis.com/auth/devstorage.read_only - disable_compute_public_ips: false - - - id: slurm-login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - slurm-controller - - network1 diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 3e7832efa0..5de0b7bc21 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -130,14 +130,6 @@ deployment_groups: outputs: - startup_script - - id: partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: 
[vpc] - outputs: - - partition - settings: - partition_name: compute - - id: lustre source: ./community/modules/file-system/DDN-EXAScaler outputs: @@ -146,11 +138,3 @@ deployment_groups: - mount_command - http_console - network_storage - - - id: controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - partition - - vpc - outputs: - - controller_name diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 35657939d8..67a15a8437 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -63,7 +63,7 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script settings: - install_cloud_ops_agent: true + install_stackdriver_agent: true runners: - type: data source: /tmp/foo.tgz diff --git a/tools/validate_configs/test_configs/use-resources.yaml b/tools/validate_configs/test_configs/use-resources.yaml deleted file mode 100644 index d2e39eeeb2..0000000000 --- a/tools/validate_configs/test_configs/use-resources.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: use-modules - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-use-modules - region: us-central1 - zone: us-central1-a - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, - # / as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - - id: projectsfs - source: community/modules/file-system/nfs-server - use: [network1] - - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - settings: - local_mount: /scratch - network_self_link: $(network1.network_self_link) - subnetwork_self_link: $(network1.subnetwork_self_link) - subnetwork_address: $(network1.subnetwork_address) - - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - scratchfs - - network1 - settings: - max_node_count: 200 - partition_name: compute - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - projectsfs - - compute_partition - - network1 - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - homefs - - scratchfs - - projectsfs - - slurm_controller - - network1 - settings: - login_machine_type: n2-standard-4
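The test_outputs.yaml and use-resources.yaml changes above touch the two ways blueprints wire modules together: an outputs: list surfaces a module's values at the deployment level, while settings can reference another module's outputs explicitly instead of relying on use. A sketch of both, modeled on the DDN-EXAScaler wiring kept in test_outputs.yaml and the explicit references from the removed use-resources.yaml:

- id: network1
  source: modules/network/pre-existing-vpc

- id: scratchfs
  source: community/modules/file-system/DDN-EXAScaler
  settings:
    local_mount: /scratch
    # explicit references, roughly equivalent to use: [network1]
    network_self_link: $(network1.network_self_link)
    subnetwork_self_link: $(network1.subnetwork_self_link)
    subnetwork_address: $(network1.subnetwork_address)
  outputs:
  - mount_command
  - network_storage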