Merge pull request #2924 from GoogleCloudPlatform/release-candidate
Release v1.38.0
nick-stroud authored Aug 15, 2024
2 parents 229803f + 0ef79d3 commit 1e38ce0
Showing 324 changed files with 10,602 additions and 1,061 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dependency-review.yml
@@ -26,7 +26,7 @@ permissions:

jobs:
dependency-review:
if: github.repository == 'GoogleCloudPlatform/hpc-toolkit'
if: github.repository == 'GoogleCloudPlatform/cluster-toolkit'
runs-on: ubuntu-latest
steps:
- name: 'Checkout Repository'
2 changes: 1 addition & 1 deletion .github/workflows/pr-label-validation.yml
@@ -32,7 +32,7 @@ on:

jobs:
pr-label-validation:
if: github.repository == 'GoogleCloudPlatform/hpc-toolkit'
if: github.repository == 'GoogleCloudPlatform/cluster-toolkit'
runs-on: ubuntu-latest
permissions:
pull-requests: read
4 changes: 4 additions & 0 deletions .github/workflows/pr-precommit.yml
@@ -38,6 +38,10 @@ jobs:
python-version: '3.10'
check-latest: true
cache: 'pip'
- run: >
pip install
-r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt
-r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt
- uses: actions/setup-go@v5
with:
go-version: '1.22'
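The new CI step above can be reproduced before pushing a pull request. A minimal sketch, assuming a Python 3.10 environment and that the commands are run from the repository root:

```shell
# Install the Slurm controller script dependencies used by the unit tests,
# mirroring the pip step added to the pr-precommit workflow.
# Assumes the current directory is the cluster-toolkit repository root.
pip install \
  -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt \
  -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt
```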
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -74,6 +74,12 @@ repos:
files: 'tools/cloud-build/daily-tests/builds/.*\.yaml'
pass_filenames: false
require_serial: true
- id: pytest-check
name: pytest-check
entry: pytest
language: system
types: [python]
pass_filenames: false

- repo: https://github.com/dnephin/pre-commit-golang
rev: v0.5.1
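The new pytest-check hook uses `language: system`, so pytest itself is not managed by pre-commit and must already be installed in the developer's environment (for example via the requirements step above). A minimal sketch of exercising just this hook, assuming pre-commit is on PATH:

```shell
# Run only the new hook; because it sets pass_filenames: false, pytest does
# its own test discovery rather than receiving the changed files.
pre-commit run pytest-check --all-files
```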
17 changes: 17 additions & 0 deletions .pytest.ini
@@ -0,0 +1,17 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[pytest]
filterwarnings = ignore::DeprecationWarning
testpaths = tests community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests
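With this configuration in place, a bare pytest invocation from the repository root covers both test trees listed in `testpaths`. A sketch, assuming pytest and the test requirements above are installed:

```shell
# Runs tests/ plus the slurm_files scripts tests; DeprecationWarning output
# is suppressed by the filterwarnings setting.
pytest
```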
4 changes: 2 additions & 2 deletions README.md
@@ -33,8 +33,8 @@ If a self directed path is preferred, you can use the following commands to
build the `gcluster` binary:

```shell
git clone https://github.com/GoogleCloudPlatform/hpc-toolkit
cd hpc-toolkit
git clone https://github.com/GoogleCloudPlatform/cluster-toolkit
cd cluster-toolkit
make
./gcluster --version
./gcluster --help
15 changes: 12 additions & 3 deletions cmd/root.go
@@ -22,6 +22,7 @@ import (
"fmt"
"hpc-toolkit/pkg/config"
"hpc-toolkit/pkg/logging"
"hpc-toolkit/pkg/shell"
"os"
"os/exec"
"path/filepath"
@@ -52,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`,
logging.Fatal("cmd.Help function failed: %s", err)
}
},
Version: "v1.37.2",
Version: "v1.38.0",
Annotations: annotation,
}
)
@@ -79,13 +80,21 @@ func Execute() error {
if len(GitBranch) == 0 {
GitBranch = "detached HEAD"
}

annotation["version"] = GitTagVersion
annotation["branch"] = GitBranch
annotation["commitInfo"] = GitCommitInfo
rootCmd.SetVersionTemplate(`gcluster version {{index .Annotations "version"}}
tmpl := `gcluster version {{index .Annotations "version"}}
Built from '{{index .Annotations "branch"}}' branch.
Commit info: {{index .Annotations "commitInfo"}}
`)
`
tfVersion, _ := shell.TfVersion()
if tfVersion != "" {
annotation["tfVersion"] = tfVersion
tmpl += `Terraform version: {{index .Annotations "tfVersion"}}
`
}
rootCmd.SetVersionTemplate(tmpl)
}

return rootCmd.Execute()
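With the reworked template, `gcluster --version` gains a Terraform line whenever `shell.TfVersion()` returns a non-empty result; otherwise that line is omitted. A sketch of the expected output, using illustrative placeholder values throughout:

```shell
# Example output only; actual values depend on the build and the locally
# installed Terraform.
$ ./gcluster --version
gcluster version v1.38.0
Built from 'main' branch.
Commit info: v1.38.0-0-g1e38ce0
Terraform version: 1.5.7
```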
2 changes: 1 addition & 1 deletion community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml
@@ -171,7 +171,7 @@ deployment_groups:
# these images must match the images used by Slurm modules below because
# we are building OpenMPI with PMI support in libraries contained in
# Slurm installation
family: slurm-gcp-5-11-hpc-centos-7
family: slurm-gcp-5-12-hpc-centos-7
project: schedmd-slurm-public

- id: low_cost_node_group
4 changes: 3 additions & 1 deletion community/examples/AMD/hpc-amd-slurm.yaml
@@ -168,7 +168,7 @@ deployment_groups:
# these images must match the images used by Slurm modules below because
# we are building OpenMPI with PMI support in libraries contained in
# Slurm installation
family: slurm-gcp-6-5-hpc-rocky-linux-8
family: slurm-gcp-6-6-hpc-rocky-linux-8
project: schedmd-slurm-public

- id: low_cost_nodeset
@@ -179,6 +179,7 @@
node_count_dynamic_max: 10
bandwidth_tier: gvnic_enabled
enable_placement: false
allow_automatic_updates: false

- id: low_cost_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -194,6 +195,7 @@
node_count_dynamic_max: 50
bandwidth_tier: gvnic_enabled
enable_placement: true
allow_automatic_updates: false

# Because is_default is set to true, jobs will run on this partition unless an
# alternative partition is specified using, for example, "srun -p lowcost"
1 change: 1 addition & 0 deletions community/examples/client-google-cloud-storage.yaml
@@ -57,6 +57,7 @@ deployment_groups:
settings:
name_prefix: workstation
machine_type: e2-standard-2
allow_automatic_updates: false

- id: wait
source: community/modules/scripts/wait-for-startup
1 change: 1 addition & 0 deletions community/examples/hpc-build-slurm-image.yaml
@@ -104,6 +104,7 @@ deployment_groups:
settings:
machine_type: n2d-standard-2
instance_image: $(vars.built_instance_image)
allow_automatic_updates: false

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
4 changes: 2 additions & 2 deletions community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml
@@ -22,10 +22,10 @@ vars:
region: us-central1
zone: us-central1-c
instance_image_crd:
family: slurm-gcp-5-11-debian-11
family: slurm-gcp-5-12-debian-11
project: schedmd-slurm-public
instance_image:
family: slurm-gcp-5-11-hpc-centos-7
family: slurm-gcp-5-12-hpc-centos-7
project: schedmd-slurm-public

# Documentation for each of the modules used below can be found at
1 change: 1 addition & 0 deletions community/examples/hpc-slurm-gromacs.yaml
@@ -97,6 +97,7 @@ deployment_groups:
settings:
node_count_dynamic_max: 20
bandwidth_tier: gvnic_enabled
allow_automatic_updates: false

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
43 changes: 14 additions & 29 deletions community/examples/hpc-slurm-local-ssd-v5-legacy.yaml
@@ -37,6 +37,19 @@ deployment_groups:
settings:
local_mount: /home

- id: startup
source: modules/scripts/startup-script
settings:
# When shutting down a VM with local SSD disks, we strongly recommend the
# automatic migration of data following these instructions:
# https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance
# Failure to do so will result in VMs that lose data and do not automatically
# mount local SSD filesystems
local_ssd_filesystem:
fs_type: ext4
mountpoint: /mnt/localssd
permissions: "1777" # must quote numeric filesystem permissions!

- id: compute_node_group
source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
settings:
@@ -66,39 +79,11 @@
- network1
- homefs
- compute_node_group
- startup
settings:
is_default: true
partition_name: ssdcomp
region: us-central1
startup_script: |
#!/bin/bash
set -e -o pipefail
# this script assumes it is running on a RedHat-derivative OS
yum install -y mdadm
RAID_DEVICE=/dev/md0
DST_MNT=/mnt/localssd
DISK_LABEL=LOCALSSD
OPTIONS=discard,defaults
# if mount is successful, do nothing
if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then
exit 0
fi
# Create new RAID, format ext4 and mount
# TODO: handle case of zero or 1 local SSD disk
# TODO: handle case when /dev/md0 exists but was not mountable
DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '`
NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l`
mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES
mkfs.ext4 -F "$RAID_DEVICE"
tune2fs "$RAID_DEVICE" -r 131072
e2label "$RAID_DEVICE" "$DISK_LABEL"
mkdir -p "$DST_MNT"
mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"
chmod 1777 "$DST_MNT"

- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
45 changes: 15 additions & 30 deletions community/examples/hpc-slurm-local-ssd.yaml
@@ -37,9 +37,22 @@ deployment_groups:
settings:
local_mount: /home

- id: startup
source: modules/scripts/startup-script
settings:
# When shutting down a VM with local SSD disks, we strongly recommend the
# automatic migration of data following these instructions:
# https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance
# Failure to do so will result in VMs that lose data and do not automatically
# mount local SSD filesystems
local_ssd_filesystem:
fs_type: ext4
mountpoint: /mnt/localssd
permissions: "1777" # must quote numeric filesystem permissions!

- id: nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
use: [network, startup]
settings:
additional_disks:
- device_name: test-disk-1
@@ -60,35 +73,7 @@
machine_type: c2-standard-4
node_count_dynamic_max: 5
node_count_static: 0
startup_script: |
#!/bin/bash
set -e -o pipefail
# this script assumes it is running on a RedHat-derivative OS
yum install -y mdadm
RAID_DEVICE=/dev/md0
DST_MNT=/mnt/localssd
DISK_LABEL=LOCALSSD
OPTIONS=discard,defaults
# if mount is successful, do nothing
if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then
exit 0
fi
# Create new RAID, format ext4 and mount
# TODO: handle case of zero or 1 local SSD disk
# TODO: handle case when /dev/md0 exists but was not mountable
DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '`
NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l`
mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES
mkfs.ext4 -F "$RAID_DEVICE"
tune2fs "$RAID_DEVICE" -r 131072
e2label "$RAID_DEVICE" "$DISK_LABEL"
mkdir -p "$DST_MNT"
mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"
chmod 1777 "$DST_MNT"
allow_automatic_updates: false

- id: partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
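Both local-SSD examples above now delegate RAID assembly, formatting, and mounting to the startup-script module's `local_ssd_filesystem` setting instead of an inline mdadm script. A quick way to confirm the result on a running compute node, assuming the mountpoint used in these examples:

```shell
# Verify the local SSD filesystem was assembled and mounted at boot.
# Exact device names and sizes vary by machine type and SSD count.
df -h /mnt/localssd
mount | grep /mnt/localssd
ls -ld /mnt/localssd   # should show mode 1777 (drwxrwxrwt)
```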
1 change: 1 addition & 0 deletions community/examples/hpc-slurm-ramble-gromacs.yaml
@@ -118,6 +118,7 @@ deployment_groups:
settings:
node_count_dynamic_max: 20
bandwidth_tier: gvnic_enabled
allow_automatic_updates: false

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
2 changes: 2 additions & 0 deletions community/examples/hpc-slurm-sharedvpc.yaml
@@ -62,6 +62,7 @@ deployment_groups:
node_count_dynamic_max: 4
machine_type: n2-standard-2
enable_placement: false # the default is: true
allow_automatic_updates: false

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -77,6 +78,7 @@
settings:
node_count_dynamic_max: 20
bandwidth_tier: gvnic_enabled
allow_automatic_updates: false

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
2 changes: 1 addition & 1 deletion community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml
@@ -24,7 +24,7 @@ vars:
instance_image:
# Please refer to the following link for the latest images:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
family: slurm-gcp-5-11-ubuntu-2004-lts
family: slurm-gcp-5-12-ubuntu-2004-lts
project: schedmd-slurm-public
instance_image_custom: true

2 changes: 1 addition & 1 deletion community/examples/hpc-slurm-ubuntu2004.yaml
@@ -24,7 +24,7 @@ vars:
slurm_image:
# Please refer to the following link for the latest images:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
family: slurm-gcp-6-5-ubuntu-2004-lts
family: slurm-gcp-6-6-ubuntu-2004-lts
project: schedmd-slurm-public
instance_image_custom: true

3 changes: 2 additions & 1 deletion community/examples/hpc-slurm6-apptainer.yaml
@@ -60,7 +60,7 @@ deployment_groups:
settings:
source_image_project_id: [schedmd-slurm-public]
# see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
source_image_family: slurm-gcp-6-4-hpc-rocky-linux-8
source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8
# You can find size of source image by using following command
# gcloud compute images describe-from-family <source_image_family> --project schedmd-slurm-public
disk_size: $(vars.disk_size)
@@ -78,6 +78,7 @@
instance_image: $(vars.custom_image)
instance_image_custom: true
bandwidth_tier: gvnic_enabled
allow_automatic_updates: false

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
1 change: 1 addition & 0 deletions community/examples/hpc-slurm6-tpu-maxtext.yaml
@@ -100,6 +100,7 @@ deployment_groups:
name: ns2
node_count_dynamic_max: 20
bandwidth_tier: gvnic_enabled
allow_automatic_updates: false

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
2 changes: 2 additions & 0 deletions community/examples/htc-htcondor.yaml
@@ -104,6 +104,7 @@ deployment_groups:
name_prefix: grp1
instance_image: $(vars.new_image)
min_idle: 2
allow_automatic_updates: false

- id: htcondor_execute_point_spot
source: community/modules/compute/htcondor-execute-point
@@ -117,6 +118,7 @@
name_prefix: spot
instance_image: $(vars.new_image)
spot: true
allow_automatic_updates: false

- id: htcondor_access
source: community/modules/scheduler/htcondor-access-point
(The remaining changed files in this commit are not shown.)
