Merge pull request #3004 from GoogleCloudPlatform/release-candidate
Release v1.39.0
rohitramu authored Sep 12, 2024
2 parents 1e38ce0 + 3939d49 commit 7699f5d
Showing 203 changed files with 3,167 additions and 1,212 deletions.
20 changes: 19 additions & 1 deletion .github/dependabot.yml
@@ -23,11 +23,14 @@ updates:
- go
- release-chore
schedule:
interval: weekly
interval: monthly
day: monday
time: "03:00"
timezone: America/Los_Angeles
target-branch: develop
ignore:
- dependency-name: "google.golang.org/api"

- package-ecosystem: pip
directory: /community/front-end/ofe/
labels:
@@ -45,3 +48,18 @@ updates:
# Disable version updates, do security updates only
# See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file
open-pull-requests-limit: 0
- package-ecosystem: pip
directory: /community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/
labels:
- dependencies
- python
- release-chore
schedule:
interval: weekly
day: monday
time: "03:00"
timezone: America/Los_Angeles
target-branch: develop
# Disable version updates, do security updates only
# See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file
open-pull-requests-limit: 0
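
For readers applying a similar configuration, here is a hedged sketch (not part of this commit) of how the resulting `.github/dependabot.yml` could be sanity-checked from the repository root. It confirms the Go entry now runs monthly and ignores `google.golang.org/api`, matching the first hunk above; PyYAML is assumed to be installed.

```python
# Hedged sketch: verify the Go update entry matches the hunk above.
# Assumes PyYAML is installed and the script runs from the repository root.
import yaml

with open(".github/dependabot.yml") as fh:
    config = yaml.safe_load(fh)

# The Go entry is identified here by its "go" label, as shown in the diff.
go_entries = [u for u in config["updates"] if "go" in u.get("labels", [])]

for entry in go_entries:
    assert entry["schedule"]["interval"] == "monthly"
    ignored = [rule["dependency-name"] for rule in entry.get("ignore", [])]
    assert "google.golang.org/api" in ignored

print(f"Checked {len(go_entries)} Go update entries against the new schedule.")
```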
2 changes: 2 additions & 0 deletions .github/pull_request_template.md
@@ -1,5 +1,7 @@
### Submission Checklist

NOTE: Community submissions can take up to 2 weeks to be reviewed.

Please take the following actions before submitting this pull request.

* Fork your PR branch from the Toolkit "develop" branch (not main)
4 changes: 0 additions & 4 deletions .github/workflows/pr-precommit.yml
@@ -38,10 +38,6 @@ jobs:
python-version: '3.10'
check-latest: true
cache: 'pip'
- run: >
pip install
-r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt
-r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt
- uses: actions/setup-go@v5
with:
go-version: '1.22'
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -76,7 +76,7 @@ repos:
require_serial: true
- id: pytest-check
name: pytest-check
entry: pytest
entry: python -m pytest
language: system
types: [python]
pass_filenames: false
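
A possible reason for switching the hook entry from `pytest` to `python -m pytest` (an assumption, not stated in the commit) is that module invocation uses whichever `python` the hook's `system` language resolves to and also prepends the current directory to `sys.path`. A minimal sketch of the equivalent invocation from Python:

```python
# Minimal sketch: run pytest the way "entry: python -m pytest" does, via the
# active interpreter. Assumes pytest is installed in that environment.
import subprocess
import sys

cmd = [sys.executable, "-m", "pytest", "--collect-only", "-q"]
result = subprocess.run(cmd, capture_output=True, text=True, check=False)

print("exit code:", result.returncode)
print(result.stdout)  # collected test ids, or an error explaining what is missing
```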
6 changes: 5 additions & 1 deletion CONTRIBUTING.md
@@ -20,7 +20,11 @@ again.
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
information on pull requests.

### Standard PR Response Times

Community submissions can take up to 2 weeks to be reviewed.

## Community Guidelines

2 changes: 2 additions & 0 deletions Makefile
@@ -69,6 +69,8 @@ install-dev-deps: warn-terraform-version warn-packer-version check-pre-commit ch
go install mvdan.cc/sh/v3/cmd/shfmt@latest
go install golang.org/x/tools/cmd/goimports@latest
go install honnef.co/go/tools/cmd/staticcheck@latest
pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt
pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt

# RULES SUPPORTING THE ABOVE

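
The two new `pip install` lines pull in the Slurm controller scripts' Python requirements that the `pytest-check` hook above now relies on. As a hedged convenience, a sketch of checking that they are importable before running `pre-commit run --all-files`; the module names below are assumptions, not taken from the requirements files.

```python
# Hypothetical pre-flight check; the listed module names are assumptions.
import importlib.util

ASSUMED_MODULES = ["pytest", "yaml", "googleapiclient"]

missing = [m for m in ASSUMED_MODULES if importlib.util.find_spec(m) is None]
if missing:
    raise SystemExit(f"Missing dev dependencies: {', '.join(missing)}")
print("All checked dev dependencies are importable.")
```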
2 changes: 1 addition & 1 deletion cmd/root.go
@@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`,
logging.Fatal("cmd.Help function failed: %s", err)
}
},
Version: "v1.38.0",
Version: "v1.39.0",
Annotations: annotation,
}
)
4 changes: 2 additions & 2 deletions community/examples/AMD/README.md
@@ -53,10 +53,10 @@ using the `compute` partition, you may ignore its quota requirements.

### Deploying the Blueprint

Use `ghpc` to provision the blueprint, supplying your project ID:
Use `gcluster` to provision the blueprint, supplying your project ID:

```shell
ghpc create --vars project_id=<<PROJECT_ID>> hpc-amd-slurm.yaml
gcluster create --vars project_id=<<PROJECT_ID>> hpc-amd-slurm.yaml
```
It will create a directory containing a Terraform module. Follow the printed
6 changes: 3 additions & 3 deletions community/examples/flux-framework/README.md
@@ -26,15 +26,15 @@ Toolkit guidance to enable [APIs][apis] and establish minimum resource

### Deploy the flux-framework Cluster

Use `ghcp` to provision the blueprint
Use `gcluster` to provision the blueprint

```bash
ghpc create community/examples/flux-framework --vars project_id=<<PROJECT_ID>>
gcluster create community/examples/flux-framework --vars project_id=<<PROJECT_ID>>
```
This will create a directory containing Terraform modules.
Follow `ghpc` instructions to deploy the cluster
Follow `gcluster` instructions to deploy the cluster
```text
terraform -chdir=flux-fw-cluster/primary init
8 changes: 4 additions & 4 deletions community/examples/hpc-slurm6-tpu-maxtext.yaml
@@ -72,7 +72,7 @@ deployment_groups:
python3 MaxText/train.py MaxText/configs/base.yml run_name=<RUN_NAME> base_output_directory=${PWD}/output/ dataset_path=<STORAGE_BUCKET> async_checkpointing=False attention=<ATTENTION> steps=<STEPS>
- id: tpu_nodeset
source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
use: [network]
settings:
node_type: v4-8
@@ -88,7 +88,7 @@ deployment_groups:
node_count_dynamic_max: 1

- id: tpu_partition
source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [tpu_nodeset]
settings:
partition_name: tpu
@@ -110,14 +110,14 @@ deployment_groups:
is_default: true

- id: slurm_login
source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network]
settings:
enable_login_public_ips: true
machine_type: n2-standard-16

- id: slurm_controller
source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- tpu_partition
- compute_partition
8 changes: 4 additions & 4 deletions community/examples/hpc-slurm6-tpu.yaml
@@ -29,7 +29,7 @@ deployment_groups:
source: modules/network/vpc

- id: tpu_nodeset
source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
use: [network]
settings:
node_type: v3-8
@@ -45,20 +45,20 @@ deployment_groups:
node_count_dynamic_max: 1

- id: tpu_partition
source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [tpu_nodeset]
settings:
partition_name: tpu

- id: slurm_login
source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network]
settings:
machine_type: n2-standard-4
enable_login_public_ips: true

- id: slurm_controller
source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- tpu_partition
- slurm_login
20 changes: 10 additions & 10 deletions community/examples/intel/README.md
@@ -63,10 +63,10 @@ The Pre-deployment Guide provides instructions for:

### Deploy the DAOS Cluster

After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `ghpc` to provision the blueprint
After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `gcluster` to provision the blueprint

```text
ghpc create community/examples/intel/pfs-daos.yaml \
gcluster create community/examples/intel/pfs-daos.yaml \
--vars project_id=<<PROJECT_ID>> \
[--backend-config bucket=<GCS tf backend bucket>]
```
@@ -75,10 +75,10 @@ This will create the deployment directory containing Terraform modules and
Packer templates. The `--backend-config` option is not required but recommended.
It will save the terraform state in a pre-existing [Google Cloud Storage
bucket][bucket]. For more information see [Setting up a remote terraform
state][backend]. Use `ghpc deploy` to provision your DAOS storage cluster:
state][backend]. Use `gcluster deploy` to provision your DAOS storage cluster:

```text
ghpc deploy pfs-daos --auto-approve
gcluster deploy pfs-daos --auto-approve
```

[backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state
@@ -238,7 +238,7 @@ See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#d
Delete the remaining infrastructure

```bash
ghpc destroy pfs-daos --auto-approve
gcluster destroy pfs-daos --auto-approve
```

## DAOS Server with Slurm cluster
@@ -291,10 +291,10 @@ The following available quota is required in the region used by Slurm:

### Deploy the DAOS/Slurm Cluster

Use `ghpc` to provision the blueprint, supplying your project ID
Use `gcluster` to provision the blueprint, supplying your project ID

```text
ghpc create community/examples/intel/hpc-slurm-daos.yaml \
gcluster create community/examples/intel/hpc-slurm-daos.yaml \
--vars project_id=<<PROJECT_ID>> \
[--backend-config bucket=<GCS tf backend bucket>]
```
@@ -304,10 +304,10 @@ templates.

The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform state][backend].

Follow `ghpc` instructions to deploy the environment
Follow `gcluster` instructions to deploy the environment

```text
ghpc deploy hpc-slurm-daos --auto-approve
gcluster deploy hpc-slurm-daos --auto-approve
```

[backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state
@@ -450,5 +450,5 @@ have been shutdown and deleted by the Slurm autoscaler.
Delete the remaining infrastructure:

```bash
ghpc destroy hpc-slurm-daos --auto-approve
gcluster destroy hpc-slurm-daos --auto-approve
```
2 changes: 1 addition & 1 deletion community/front-end/ofe/README.md
@@ -15,7 +15,7 @@ steps:
* Prepare the client side environment and secure sufficient IAM permissions for
the system deployment.
* When ready, clone this repository and run the deployment script at
`hpc-toolkit/community/front-end/ofe/deploy.sh` from a client machine or a Cloud
`cluster-toolkit/community/front-end/ofe/deploy.sh` from a client machine or a Cloud
Shell. Follow instructions to complete the deployment. The whole process is
automated via Terraform and should complete within 15 minutes.
* Perform post-deployment configurations.
8 changes: 4 additions & 4 deletions community/front-end/ofe/deploy.sh
@@ -504,15 +504,15 @@ deploy() {
# -- Collect deployment files
#
# For a tarball deployment, it is important that the 'root' directory is
# named 'hpc-toolkit' as most of the install depends on it.
# named 'cluster-toolkit' as most of the install depends on it.
#
# Simplest way to ensure this is to build from a temporary copy that
# definitely is named correctly.
#
if [ "${deployment_mode}" == "tarball" ]; then

basedir=$(git rev-parse --show-toplevel)
tdir=/tmp/hpc-toolkit
tdir=/tmp/cluster-toolkit

cp -R "${basedir}" ${tdir}/
(
@@ -523,7 +523,7 @@ deploy() {
--exclude=.terraform.lock.hcl \
--exclude=tf \
--directory=/tmp \
./hpc-toolkit 2>/dev/null
./cluster-toolkit 2>/dev/null
)

rm -rf ${tdir}
@@ -562,7 +562,7 @@ TFVARS
fi

if [ "${deployment_mode}" == "git" ]; then
echo "Will clone hpc-toolkit from github.com/${repo_fork}/hpc-toolkit.git ${repo_branch} branch."
echo "Will clone cluster-toolkit from github.com/${repo_fork}/cluster-toolkit.git ${repo_branch} branch."

cat <<-END >>terraform.tfvars
repo_fork = "${repo_fork}"
10 changes: 5 additions & 5 deletions community/front-end/ofe/docs/developer_guide.md
@@ -148,7 +148,7 @@ The home directory of the *gcluster* account is at `/opt/gcluster`. For a new de
#### For cloud resources

Run-time data to support creating and managing cloud resources are generated
and stored in the following sub-directories within `hpc-toolkit/frontend` on
and stored in the following sub-directories within `cluster-toolkit/frontend` on
the service machine:

- `clusters/cluster_\<id>` - holding run-time data for a cluster. `\<id>` here
@@ -246,7 +246,7 @@ define the major components:

| dir | description |
|-----------------------------|-------------|
| `hpc-toolkit/frontend/` | Top level |
| `cluster-toolkit/frontend/` | Top level |
| `.../cli/` | client commandline interface |
| `.../docs/` | documentation |
| `.../infrastructure_files/` | Support files for deploying cloud infrastructure |
@@ -344,7 +344,7 @@ not currently support Vertex AI Workbenches.
### Infrastructure files

Workbenches are created using a template configuration in
`hpc-toolkit/frontend/infrastructure_files/workbench_tf`. The Terraform
`cluster-toolkit/frontend/infrastructure_files/workbench_tf`. The Terraform
template was originally based on the Terraform template provided by the
[Google Cloud Platform Rad-Lab git repo](https://github.com/GoogleCloudPlatform/rad-lab)
however the configuration diverged during early development. The main reason
@@ -353,11 +353,11 @@ specific OSLogin user rather than the generic Jupyter user which would make it
impossible to interact properly with any mounted shared storage.

The process of creating the workbench files is mostly contained within the file
`hpc-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`. The
`cluster-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`. The
`copy_terraform()` routine copies files from the `infrastructure_files`
directory while the `prepare_terraform_vars()` routine creates a
`terraform.tfvars` file within the
`hpc-toolkit/frontend/workbenches/workbench_##` directory to provide the
`cluster-toolkit/frontend/workbenches/workbench_##` directory to provide the
following info gathered by the FrontEnd during the workbench creation process:

- region
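
To make the `workbenchinfo.py` flow described above more concrete, an illustrative sketch follows. The function bodies, variable names, and tfvars keys are assumptions for illustration only; the real implementation lives in `cluster-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`.

```python
# Illustrative sketch only; names and tfvars keys are assumed, not copied
# from workbenchinfo.py.
import shutil
from pathlib import Path

TEMPLATE_DIR = Path("infrastructure_files/workbench_tf")  # assumed location


def copy_terraform(workbench_dir: Path) -> None:
    """Copy the workbench Terraform template into the per-workbench directory."""
    shutil.copytree(TEMPLATE_DIR, workbench_dir, dirs_exist_ok=True)


def prepare_terraform_vars(workbench_dir: Path, region: str, zone: str, owner: str) -> None:
    """Write terraform.tfvars from values gathered by the FrontEnd."""
    tfvars = workbench_dir / "terraform.tfvars"
    tfvars.write_text(
        f'region       = "{region}"\n'
        f'zone         = "{zone}"\n'
        f'trusted_user = "{owner}"\n'  # assumed variable name
    )
```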
Changes to an additional Python file (path not shown in this view):
@@ -243,12 +243,17 @@ def _slurm_get_job_info(jobid):


def _slurm_get_job_state(jobid):
"""Returns the job state, or None if job isn't in the queue"""
# N.B - eventually, pyslurm might work with our version of Slurm,
# and this can be changed to something more sane. For now, call squeue
state = _slurm_get_job_info(jobid)
return state.get("job_state", None) if state else None
"""Returns the job state, or None if the job isn't in the queue"""
state = _slurm_get_job_info(jobid) # Fetch job info using an external function
job_state = state.get("job_state", None) if state else None # Get the 'job_state' if available

if job_state and isinstance(job_state, list) and job_state:
logger.info("Slurm returned job %s with state %s", jobid, job_state[0]) # Log the first state if available
return job_state[0] # Return the first element of the state list
else:
logger.info("No valid job state available for job %s", jobid) # Log when no valid state is found

return None # Return None if there is no job state or it's not a list

def _spack_submit_build(app_id, partition, app_name, spec, extra_sbatch=None):
build_dir = Path("/opt/cluster/installs") / str(app_id)
@@ -925,12 +930,14 @@ def cb_run_job(message, **kwargs):
try:
slurm_job_info = _slurm_get_job_info(slurm_jobid)
response["job_runtime"] = (
slurm_job_info["end_time"] - slurm_job_info["start_time"]
slurm_job_info["end_time"]["number"] - slurm_job_info["start_time"]["number"]
)
except KeyError:
logger.warning(
"Job data from SLURM did not include start time and end time"
)
except Exception as E:
logger.error("Unexpected error: %s", E)

kpi = job_dir / "kpi.json"
if kpi.is_file():
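
The two hunks above appear to adapt the daemon to newer Slurm JSON output, where `job_state` arrives as a list and timestamps arrive as objects with a `number` field. A hedged sketch (not part of the commit) of normalizing both the old and new shapes:

```python
# Hedged sketch: accept both the older scalar forms and the newer list/dict
# forms of Slurm's JSON job output.
def normalize_job_state(job_info):
    """Return the job state as a string, or None if it is unavailable."""
    state = job_info.get("job_state") if job_info else None
    if isinstance(state, list):
        return state[0] if state else None
    return state


def normalize_timestamp(value):
    """Return a numeric timestamp from either an int or a {"number": ...} dict."""
    if isinstance(value, dict):
        return value.get("number")
    return value


# Example with a new-style payload:
job = {"job_state": ["RUNNING"], "start_time": {"set": True, "number": 1726000000}}
print(normalize_job_state(job), normalize_timestamp(job["start_time"]))
```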