From b4289f8ba240f474e415e7252fbd90f709c63b09 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Mon, 17 Jun 2024 14:25:56 +0100 Subject: [PATCH 001/180] OFE: adding missing APIs for image creation functionality --- community/front-end/ofe/deploy.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/community/front-end/ofe/deploy.sh b/community/front-end/ofe/deploy.sh index 0360591d9c..c1414753d0 100755 --- a/community/front-end/ofe/deploy.sh +++ b/community/front-end/ofe/deploy.sh @@ -57,6 +57,8 @@ PRJ_API['bigqueryconnection.googleapis.com']='BigQuery Connection API' PRJ_API['sqladmin.googleapis.com']='Cloud SQL Admin API' PRJ_API['servicenetworking.googleapis.com']='Service Networking API' PRJ_API['secretmanager.googleapis.com']='Secret Manager API' +PRJ_API['serviceusage.googleapis.com']='Service Usage API' +PRJ_API['storage.googleapis.com']='Cloud Storage API' # Location for output credential file = pwd/credential.json # From 7d85a8adb63f84d90f653a1da24f194b571bf1dc Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:27:21 +0100 Subject: [PATCH 002/180] OFE: Update update_form.html Typo in cluster additional disk javascript. --- .../ofe/website/ghpcfe/templates/cluster/update_form.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 8423fbc3ee..54d9846f40 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -443,7 +443,7 @@

region = cloudRegionInput.options[cloudRegionInput.selectedIndex].text; } else { zone = cloudZoneInput.value; - region = cloudZoneInput.value; + region = cloudRegionInput.value; } $.ajax({ From dc81f680997db07f6a65100390841595aabb9fca Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:39:04 +0100 Subject: [PATCH 003/180] OFE: Update clusterinfo.py Fixing the logic for selecting custom images for controller and login nodes. --- .../website/ghpcfe/cluster_manager/clusterinfo.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index 516d791701..fbae2dc8f3 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -279,6 +279,19 @@ def _prepare_ghpc_yaml(self): "controller_sa": "sa", "startup_bucket": self.config["server"]["gcs_bucket"] } + + if self.cluster.controller_node_image is not None: + context["controller_image_yaml"] = f"""instance_image: + family: image-{self.cluster.controller_node_image.family} + project: {self.cluster.project_id} + """ + + if self.cluster.login_node_image is not None: + context["login_image_yaml"] = f"""instance_image: + family: image-{self.cluster.login_node_image.family} + project: {self.cluster.project_id} + """ + rendered_yaml = template.render(context) with yaml_file.open("w") as f: From dace639ab2631f49d328050206169972a4e44bc6 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:32:20 +0100 Subject: [PATCH 004/180] OFE Update update_form.html Fixing disk selection for login and controller nodes logic --- .../ofe/website/ghpcfe/templates/cluster/update_form.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 54d9846f40..663b0bdf85 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -100,7 +100,7 @@

(Controller section table markup around the Image field; HTML tags lost in extraction)
@@ -132,7 +132,7 @@
(Login Nodes section table markup; HTML tags lost in extraction)
@@ -452,7 +452,7 @@
dataType: "json", headers: { 'X-CSRFToken': $.cookie("csrftoken") } }).done(function (data) { - $(".part_formset_row").each(function () { + $(".part_formset_row, .login_row, .controller_row").each(function () { var formRow = $(this); var machineTypeSelect = formRow.find('.machine_type_select'); var machineType = machineTypeSelect.val(); From 226dc672c49aebad551050ebdb6ce677dc63e5fa Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Mon, 1 Jul 2024 17:32:50 +0100 Subject: [PATCH 005/180] OFE: Update cluster_config.yaml.j2 Migrating to V6 --- .../blueprint/cluster_config.yaml.j2 | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 index a22569d024..50835e8c98 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 @@ -5,9 +5,7 @@ vars: deployment_name: {{ cluster.cloud_id }} region: {{ cluster.cloud_region }} zone: {{ cluster.cloud_zone }} - enable_reconfigure: True enable_cleanup_compute: False - enable_cleanup_subscriptions: True enable_bigquery_load: {{ cluster.use_bigquery }} instance_image_custom: True labels: @@ -47,7 +45,7 @@ deployment_groups: {{ cloudsql_yaml | safe }} - - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller kind: terraform id: slurm_controller settings: @@ -61,9 +59,8 @@ deployment_groups: disk_type: {{ cluster.controller_disk_type }} disk_size_gb: {{ cluster.controller_disk_size }} {{ controller_image_yaml | safe }} - service_account: - email: $(hpc_service_account.service_account_email) - scopes: + service_account_email: $(hpc_service_account.service_account_email) + service_account_scopes: - https://www.googleapis.com/auth/cloud-platform - https://www.googleapis.com/auth/monitoring.write - https://www.googleapis.com/auth/logging.write @@ -76,29 +73,27 @@ deployment_groups: compute_startup_script: | #!/bin/bash gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_compute.sh - | bash + login_startup_script: | + #!/bin/bash + echo "******************************************** CALLING LOGIN STARTUP" + gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_login.sh - | bash use: + - slurm_login {{ controller_uses | safe }} - - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login + - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login kind: terraform id: slurm_login settings: num_instances: {{ cluster.num_login_nodes }} - subnetwork_self_link: {{ cluster.subnet.cloud_id }} + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.region }}/subnetworks/{{ cluster.subnet.cloud_id }}" machine_type: {{ cluster.login_node_instance_type }} disk_type: {{ cluster.login_node_disk_type }} disk_size_gb: {{ cluster.login_node_disk_size }} {{ login_image_yaml | safe }} - service_account: - email: $(hpc_service_account.service_account_email) - scopes: + service_account_email: $(hpc_service_account.service_account_email) + service_account_scopes: - https://www.googleapis.com/auth/cloud-platform - https://www.googleapis.com/auth/monitoring.write - https://www.googleapis.com/auth/logging.write - https://www.googleapis.com/auth/devstorage.read_write - 
startup_script: | - #!/bin/bash - echo "******************************************** CALLING LOGIN STARTUP" - gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_login.sh - | bash - use: - - slurm_controller From aca482efbe9f23114ac14dcd3ec0dd2da902a67f Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Mon, 1 Jul 2024 17:33:17 +0100 Subject: [PATCH 006/180] OFE: Update partition_config.yaml.j2 Migrating to V6 --- .../templates/blueprint/partition_config.yaml.j2 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index 86ade8151c..c66c48e31a 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -1,21 +1,21 @@ -- source: community/modules/compute/schedmd-slurm-gcp-v5-partition +- source: community/modules/compute/schedmd-slurm-gcp-v6-partition kind: terraform id: {{ part_id }} use: - - {{ part_id }}-group -{{ uses_str }} + - {{ part_id }}-nodeset settings: partition_name: {{ part.name }} - subnetwork_self_link: {{ cluster.subnet.cloud_id }} - enable_placement: {{ part.enable_placement }} exclusive: {{ exclusive }} -- source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - id: {{ part_id }}-group +- source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + id: {{ part_id }}-nodeset use: +{{ uses_str }} settings: bandwidth_tier: {% if part.enable_tier1_networking %}tier_1_enabled{% else %}platform_default{% endif %} + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.region }}/subnetworks/{{ cluster.subnet.cloud_id }}" enable_smt: {{ part.enable_hyperthreads }} + enable_placement: {{ part.enable_placement }} machine_type: {{ part.machine_type }} {% if part.reservation_name %} reservation_name: {{ part.reservation_name }} From 74c9d42fa5d939af042131d455b02ef8cb45a169 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Wed, 3 Jul 2024 12:02:09 +0100 Subject: [PATCH 007/180] OFE: Update clusterinfo.py Migrating to V6 --- .../ghpcfe/cluster_manager/clusterinfo.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index fbae2dc8f3..017dee646d 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -164,7 +164,7 @@ def _set_credentials(self, creds=None): def _create_ssh_key(self, target_dir): # ssh-keygen -t rsa -f /.ssh/id_rsa -N "" sshdir = target_dir / ".ssh" - + if not sshdir.exists(): sshdir.mkdir(mode=0o711) @@ -196,7 +196,7 @@ def _prepare_ghpc_filesystems(self): filesystems_yaml = [] refs = [] template = self.env.get_template('blueprint/filesystem_config.yaml.j2') - + for (count, mp) in enumerate(self.cluster.mount_points.order_by("mount_order")): storage_id = f"mount_num_{mp.id}" server_ip = "'$controller'" if mp.export in self.cluster.shared_fs.exports.all() else mp.export.server_name @@ -212,7 +212,7 @@ def _prepare_ghpc_filesystems(self): indented_yaml = self.indent_text(rendered_yaml, 1) # Indent as necessary... 
filesystems_yaml.append(indented_yaml) refs.append(context['storage_id']) - + return ("\n\n".join(filesystems_yaml), refs) def _prepare_ghpc_partitions(self, part_uses): @@ -254,7 +254,7 @@ def _prepare_cloudsql_yaml(self): def _yaml_refs_to_uses(self, use_list, indent_level=0): indent = ' ' * indent_level use_lines = [f"{indent}- {item}" for item in use_list] - return "\n".join(use_lines) + return "\n".join(use_lines) def _prepare_ghpc_yaml(self): try: @@ -279,7 +279,7 @@ def _prepare_ghpc_yaml(self): "controller_sa": "sa", "startup_bucket": self.config["server"]["gcs_bucket"] } - + if self.cluster.controller_node_image is not None: context["controller_image_yaml"] = f"""instance_image: family: image-{self.cluster.controller_node_image.family} @@ -291,7 +291,7 @@ def _prepare_ghpc_yaml(self): family: image-{self.cluster.login_node_image.family} project: {self.cluster.project_id} """ - + rendered_yaml = template.render(context) with yaml_file.open("w") as f: @@ -381,6 +381,9 @@ def _get_tf_state_resource(self, state, filters): Returns each match """ + print(state["resources"]) + print(filters) + def matches(x): try: @@ -394,6 +397,7 @@ def matches(x): return list(filter(matches, state["resources"])) def _create_model_instances_from_tf_state(self, state, filters): + print(self._get_tf_state_resource(state, filters)) tf_nodes = self._get_tf_state_resource(state, filters)[0]["instances"] def model_from_tf(tf): @@ -436,7 +440,7 @@ def model_from_tf(tf): return existing_instance # Return the existing instance except ComputeInstance.DoesNotExist: # If the instance doesn't exist, create a new one - return ComputeInstance(**ci_kwargs) + return ComputeInstance(**ci_kwargs) return [model_from_tf(instance) for instance in tf_nodes] @@ -447,14 +451,14 @@ def _get_service_accounts(self, tf_state): # controller & login until we start setting them. 
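        # With schedmd-slurm-gcp-v6 the login instances are created by the
        # controller module, so the Terraform state filters below use the v6
        # module addresses rather than the old module.slurm_login.* paths.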
filters = { - "module": "module.slurm_controller.module.slurm_controller_instance.module.slurm_controller_instance", #pylint:disable=line-too-long + "module": "module.slurm_controller.module.slurm_controller_instance", #pylint:disable=line-too-long "name": "slurm_instance", } tf_node = self._get_tf_state_resource(tf_state, filters)[0]["instances"][0] #pylint:disable=line-too-long ctrl_sa = tf_node["attributes"]["service_account"][0]["email"] filters = { - "module": "module.slurm_login.module.slurm_login_instance.module.slurm_login_instance", #pylint:disable=line-too-long + "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]', #pylint:disable=line-too-long "name": "slurm_instance", } tf_node = self._get_tf_state_resource(tf_state, filters)[0]["instances"][0] #pylint:disable=line-too-long @@ -531,7 +535,7 @@ def _apply_terraform(self): mgmt_nodes = self._create_model_instances_from_tf_state( state, { - "module": "module.slurm_controller.module.slurm_controller_instance.module.slurm_controller_instance", # pylint: disable=line-too-long + "module": "module.slurm_controller.module.slurm_controller_instance", # pylint: disable=line-too-long "name": "slurm_instance", }, ) @@ -552,7 +556,7 @@ def _apply_terraform(self): login_nodes = self._create_model_instances_from_tf_state( state, { - "module": "module.slurm_login.module.slurm_login_instance.module.slurm_login_instance", # pylint: disable=line-too-long + "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]', # pylint: disable=line-too-long "name": "slurm_instance", }, ) From 7ead9ad5a612d321fd2794bb2f5c66a908ff24b2 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Wed, 3 Jul 2024 12:24:39 +0100 Subject: [PATCH 008/180] OFE: Update models.py Migrating to V6 --- community/front-end/ofe/website/ghpcfe/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 12ea18ff62..3deefbcc05 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -604,7 +604,7 @@ class Image(CloudResource): max_length=60, help_text="Enter a source image family", blank=False, - default="schedmd-v5-slurm-22-05-8-rocky-linux-8", + default="slurm-gcp-6-5-hpc-rocky-linux-8", ) startup_script = models.ManyToManyField( From 555292dc89cbdc32d61d32905561d4abfd0589a8 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Wed, 3 Jul 2024 12:56:04 +0100 Subject: [PATCH 009/180] OFE: Update bootstrap_compute.sh Migrating to v6 --- .../cluster_startup/templates/bootstrap_compute.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh index 3c2ef0e46e..5429a358f6 100644 --- a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh +++ b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+#### Workaround for https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2733 +sed -i 's,latest-release,lustre-2.15.4,' /etc/yum.repos.d/lustre-client.repo +#### + # shellcheck disable=SC1083 BUCKET={{ server_bucket }} CLUSTER_ID={{ cluster.id }} From b00317bca733311857131b86ac3b719071acf319 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Wed, 3 Jul 2024 12:56:24 +0100 Subject: [PATCH 010/180] OFE: Update bootstrap_controller.sh Migrating to v6 --- .../cluster_startup/templates/bootstrap_controller.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh index 6fe853e9ba..f334964c05 100644 --- a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh +++ b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +#### Workaround for https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2733 +sed -i 's,latest-release,lustre-2.15.4,' /etc/yum.repos.d/lustre-client.repo +#### + # shellcheck disable=SC1083 BUCKET={{ server_bucket }} CLUSTER_ID={{ cluster.id }} From 00f8c9db201b32a3a646c67654800acaa97006ab Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Wed, 3 Jul 2024 12:56:42 +0100 Subject: [PATCH 011/180] OFE: Update bootstrap_login.sh Migrating to v6 --- .../cluster_startup/templates/bootstrap_login.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh index db4fcc92db..c1bfd3b91a 100644 --- a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh +++ b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +#### Workaround for https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2733 +sed -i 's,latest-release,lustre-2.15.4,' /etc/yum.repos.d/lustre-client.repo +#### + # shellcheck disable=SC1083 BUCKET={{ server_bucket }} CLUSTER_ID={{ cluster.id }} From 455b7ad255274e9ca873e7fc53fbf9734c388386 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:18:52 +0100 Subject: [PATCH 012/180] OFE: Update main.yaml Migrating to V6 and Rocky8 --- .../roles/c2_daemon/tasks/main.yaml | 55 ++++++++----------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml index 6ffe2cf2ae..3129584ba7 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml @@ -13,46 +13,39 @@ # limitations under the License. 
--- -- name: Set most recent Python version as default - ansible.builtin.shell: - cmd: | - latest_version=$(ls -1 /usr/bin/python3* | awk -F/ '{print $NF}' | grep -E 'python[0-9]+\.[0-9]+$' | sort -V | tail -1) - alternatives --set python3 /usr/bin/$latest_version - when: ansible_distribution == 'Rocky' +- name: Get default Python interpreter from update-alternatives + ansible.builtin.shell: > + update-alternatives --display python3 | + grep 'link currently points to' | + awk '{print $NF}' + register: default_python + changed_when: false -- name: Install pip3 - ansible.builtin.package: - name: python3-pip - state: present - become: true - when: ansible_distribution == 'Rocky' +- name: Set default Python interpreter for Ansible + ansible.builtin.set_fact: + ansible_python_interpreter: "{{ default_python.stdout }}" -- name: Install setuptools for Python 3.11 - ansible.builtin.command: - cmd: /usr/bin/python3.11 -m ensurepip --upgrade - become: true - when: ansible_distribution == 'Rocky' +- name: Verify Python interpreter + ansible.builtin.command: "{{ ansible_python_interpreter }} --version" + register: python_version -- name: Upgrade PIP3 - ansible.builtin.pip: - executable: pip3 - name: pip - state: forcereinstall +- name: Display Python version + ansible.builtin.debug: + msg: "The Python interpreter version is: {{ python_version.stdout }}" # Can't use the pip action here because we need to explicitly enable # a modern gcc from the dev_env role - name: Install FE C&C Dependencies ansible.builtin.pip: - executable: pip3 name: - - requests - - pexpect - - google-cloud-storage - - google-cloud-pubsub - - addict - - google-api-python-client - - google-cloud-secret-manager - - prometheus_client + - requests + - pexpect + - google-cloud-storage + - google-cloud-pubsub + - addict + - google-api-python-client + - google-cloud-secret-manager + - prometheus_client state: present - name: Install FE C&C Daemon From 87335141367ab2e77bae845e4db0a857b04223b1 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 12:00:24 +0100 Subject: [PATCH 013/180] OFE: Update cluster_config.yaml.j2 Migrating to V6 and Rocky8 --- .../website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 index 50835e8c98..5646c2390a 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 @@ -86,7 +86,7 @@ deployment_groups: id: slurm_login settings: num_instances: {{ cluster.num_login_nodes }} - subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.region }}/subnetworks/{{ cluster.subnet.cloud_id }}" + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.cloud_region }}/subnetworks/{{ cluster.subnet.cloud_id }}" machine_type: {{ cluster.login_node_instance_type }} disk_type: {{ cluster.login_node_disk_type }} disk_size_gb: {{ cluster.login_node_disk_size }} From ff5e0a66ee277b8e437f0a03699b94704efd2f38 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 12:00:53 +0100 Subject: [PATCH 014/180] OFE: Update partition_config.yaml.j2 Migrating to V6 and Rocky8 --- 
.../website/ghpcfe/templates/blueprint/partition_config.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index c66c48e31a..865ceca3e6 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -13,7 +13,7 @@ {{ uses_str }} settings: bandwidth_tier: {% if part.enable_tier1_networking %}tier_1_enabled{% else %}platform_default{% endif %} - subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.region }}/subnetworks/{{ cluster.subnet.cloud_id }}" + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.cloud_region }}/subnetworks/{{ cluster.subnet.cloud_id }}" enable_smt: {{ part.enable_hyperthreads }} enable_placement: {{ part.enable_placement }} machine_type: {{ part.machine_type }} From 488de8d2aa6aa6abdb1319512d394893cdb60677 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 14:29:31 +0100 Subject: [PATCH 015/180] OFE: Update partition_config.yaml.j2 Migrating to v6 and Rocky8 --- .../website/ghpcfe/templates/blueprint/partition_config.yaml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index 865ceca3e6..b0c4b9baeb 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -6,6 +6,7 @@ settings: partition_name: {{ part.name }} exclusive: {{ exclusive }} + resume_timeout: 500 - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset id: {{ part_id }}-nodeset From 73b4d7a4879f4836e948a0e6a87c652f42468d95 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:25:20 +0100 Subject: [PATCH 016/180] OFE: Update ghpcfe_c2daemon.py Migrating to v6 and Rocky8 --- .../ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py index f01dc8a0ca..46889b58a3 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py @@ -430,7 +430,7 @@ def cb_spack_install(message): logger.info( "Job for %s:%s completed with result %s", appid, app_name, state ) - status = "r" if state in ["COMPLETED", "COMPLETING"] else "e" + status = "r" if any(s in ['COMPLETED', 'COMPLETING'] for s in state) else "e" final_update = {"ackid": ackid, "app_id": appid, "status": status} if status == "r": final_update.update( From 6ad3f612410f46cce9e5cae7d78d5444951257b3 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:30:03 +0100 Subject: [PATCH 017/180] OFE: Update 
ghpcfe_c2daemon.py Migrating to v6 and Rocky8 --- .../ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py index 46889b58a3..6b33a188bb 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py @@ -551,7 +551,7 @@ def cb_install_app(message): app_name, state, ) - status = "r" if state in ["COMPLETED", "COMPLETING"] else "e" + status = "r" if any(s in ['COMPLETED', 'COMPLETING'] for s in state) else "e" response["status"] = status if status == "r": # Application installed. Install Module file if appropriate From 262c815ff0c1901f886e30881bf9ce4879af404f Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:32:53 +0100 Subject: [PATCH 018/180] OFE: Update ghpcfe_c2daemon.py Migrating to v6 and Rocky8 --- .../ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py index 6b33a188bb..b27f930968 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py @@ -918,7 +918,7 @@ def cb_run_job(message, **kwargs): logger.info( "Job %s (slurm %s) completed with result %s", jobid, slurm_jobid, state ) - status = "c" if state in ["COMPLETED", "COMPLETING"] else "e" + status = "r" if any(s in ['COMPLETED', 'COMPLETING'] for s in state) else "e" response["status"] = "u" send_message("UPDATE", response) From b1193d0c2bb803d87ba06b6699a8657f486db433 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Fri, 5 Jul 2024 10:59:32 +0100 Subject: [PATCH 019/180] OFE: Update ghpcfe_c2daemon.py Migrating to v6 and Rocky8 --- .../roles/c2_daemon/files/ghpcfe_c2daemon.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py index b27f930968..5ede21bfd3 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py @@ -243,12 +243,17 @@ def _slurm_get_job_info(jobid): def _slurm_get_job_state(jobid): - """Returns the job state, or None if job isn't in the queue""" - # N.B - eventually, pyslurm might work with our version of Slurm, - # and this can be changed to something more sane. 
For now, call squeue - state = _slurm_get_job_info(jobid) - return state.get("job_state", None) if state else None + """Returns the job state, or None if the job isn't in the queue""" + state = _slurm_get_job_info(jobid) # Fetch job info using an external function + job_state = state.get("job_state", None) if state else None # Get the 'job_state' if available + + if job_state and isinstance(job_state, list) and job_state: + logger.info("Slurm returned job %s with state %s", jobid, job_state[0]) # Log the first state if available + return job_state[0] # Return the first element of the state list + else: + logger.info("No valid job state available for job %s", jobid) # Log when no valid state is found + return None # Return None if there is no job state or it's not a list def _spack_submit_build(app_id, partition, app_name, spec, extra_sbatch=None): build_dir = Path("/opt/cluster/installs") / str(app_id) @@ -430,7 +435,7 @@ def cb_spack_install(message): logger.info( "Job for %s:%s completed with result %s", appid, app_name, state ) - status = "r" if any(s in ['COMPLETED', 'COMPLETING'] for s in state) else "e" + status = "r" if state in ["COMPLETED", "COMPLETING"] else "e" final_update = {"ackid": ackid, "app_id": appid, "status": status} if status == "r": final_update.update( @@ -551,7 +556,7 @@ def cb_install_app(message): app_name, state, ) - status = "r" if any(s in ['COMPLETED', 'COMPLETING'] for s in state) else "e" + status = "r" if state in ["COMPLETED", "COMPLETING"] else "e" response["status"] = status if status == "r": # Application installed. Install Module file if appropriate @@ -918,19 +923,21 @@ def cb_run_job(message, **kwargs): logger.info( "Job %s (slurm %s) completed with result %s", jobid, slurm_jobid, state ) - status = "r" if any(s in ['COMPLETED', 'COMPLETING'] for s in state) else "e" + status = "c" if state in ["COMPLETED", "COMPLETING"] else "e" response["status"] = "u" send_message("UPDATE", response) try: slurm_job_info = _slurm_get_job_info(slurm_jobid) response["job_runtime"] = ( - slurm_job_info["end_time"] - slurm_job_info["start_time"] + slurm_job_info["end_time"]["number"] - slurm_job_info["start_time"]["number"] ) except KeyError: logger.warning( "Job data from SLURM did not include start time and end time" ) + except Exception as E: + logger.error("Unexpected error: %s", E) kpi = job_dir / "kpi.json" if kpi.is_file(): From 4d051d6bf3fe2dd716c790ea75888bcf22a0c749 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Fri, 5 Jul 2024 16:48:41 +0100 Subject: [PATCH 020/180] OFE: v6 form validation --- .../front-end/ofe/website/ghpcfe/forms.py | 26 +++++++++++++++++++ .../front-end/ofe/website/ghpcfe/models.py | 3 +++ .../blueprint/partition_config.yaml.j2 | 2 +- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index d6db9c4618..a876b54c85 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -248,6 +248,7 @@ class Meta: "dynamic_node_count", "static_node_count", "reservation_name", + "exclusive", "enable_placement", "enable_hyperthreads", "enable_tier1_networking", @@ -316,6 +317,31 @@ def clean(self): raise ValidationError( "SlurmGCP does not support Placement Groups for selected instance type" # pylint: disable=line-too-long ) + + # schedmd-slurm-gcp-v6-partition/outputs.tf + if cleaned_data["dynamic_node_count"] > 0 and not cleaned_data[ + "exclusive" + ]: + raise ValidationError("If any 
non-static nodesets have enable placement set to true, exclusive must be true.") + + if cleaned_data["static_node_count"] > 0 and cleaned_data[ + "exclusive" + ]: + raise ValidationError("Can't use static nodes within partition with exclusive set to true.") + + # schedmd-slurm-gcp-v6-nodeset/outputs.tf + if cleaned_data["reservation_name"] and cleaned_data[ + "enable_placement" + ]: + raise ValidationError("If a reservation is specified, placement must be false.") + + if cleaned_data["enable_placement"] and cleaned_data[ + "static_node_count" + ] > 0 and cleaned_data[ + "dynamic_node_count" + ] > 0: + raise ValidationError("Cannot use placement with static and auto-scaling nodes in the same node set.") + return cleaned_data diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 3deefbcc05..075f4587f5 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -919,6 +919,9 @@ class ClusterPartition(models.Model): enable_hyperthreads = models.BooleanField( default=False, help_text="Enable Hyperthreads (SMT)" ) + exclusive = models.BooleanField( + default=True, help_text="Exclusive job access to nodes." + ) enable_tier1_networking = models.BooleanField( default=False, help_text=( diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index b0c4b9baeb..6d49084e69 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -5,7 +5,7 @@ - {{ part_id }}-nodeset settings: partition_name: {{ part.name }} - exclusive: {{ exclusive }} + exclusive: {{ part.exclusive }} resume_timeout: 500 - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset From df944f4dea38a908d1ee729a9f376d3d17d39f85 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:50:10 +0100 Subject: [PATCH 021/180] OFE: Update cluster_config.yaml.j2 Migrating to V6 and Rocky8 --- .../website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 index 5646c2390a..b0afe49911 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 @@ -5,7 +5,7 @@ vars: deployment_name: {{ cluster.cloud_id }} region: {{ cluster.cloud_region }} zone: {{ cluster.cloud_zone }} - enable_cleanup_compute: False + enable_cleanup_compute: True enable_bigquery_load: {{ cluster.use_bigquery }} instance_image_custom: True labels: From c7d7d19bb28c1ea92348bf693dda624b90ee532b Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Thu, 11 Jul 2024 10:02:46 +0100 Subject: [PATCH 022/180] OFE: fix for reservation handling --- .../ghpcfe/cluster_manager/cloud_info.py | 50 ++++++++++++++ .../front-end/ofe/website/ghpcfe/forms.py | 69 ++++++++++++++++++- .../blueprint/partition_config.yaml.j2 | 2 +- .../ofe/website/ghpcfe/views/clusters.py | 23 +++++++ 4 files changed, 140 insertions(+), 4 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py 
b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index b5708ef1f4..f88808f81e 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -83,6 +83,56 @@ def _get_gcp_client(credentials, service="compute", api_version="v1"): ) +def _get_vm_reservations(credentials, zone, ttl_hash=None): + try: + # logger.info(f"Fetching VM reservations for credentials: {credentials}, zone: {zone}") + project, client = _get_gcp_client(credentials) + + req = client.reservations().list(project=project, zone=zone) + resp = req.execute() + + if "items" not in resp: + # logger.info("No reservations found") + return {} + + data = { + reservation["name"]: { + "name": reservation["name"], + "specificReservationRequired": reservation.get("specificReservationRequired", False), + "status": reservation["status"], + "instanceProperties": { + "machineType": reservation + .get("specificReservation", {}) + .get("instanceProperties", {}) + .get("machineType", ""), + "minCpuPlatform": reservation + .get("specificReservation", {}) + .get("instanceProperties", {}) + .get("minCpuPlatform", ""), + "availableCount": int( + reservation + .get("specificReservation", {}) + .get("count", 0) + ) + }, + "shareSettings": reservation.get("shareSettings", {}), + } + for reservation in resp["items"] + } + + # logger.info(f"Reservations data: {data}") + return data + except Exception as e: + logger.error(f"Error fetching VM reservations: {e}") + return {} + +def get_vm_reservations(cloud_provider, credentials, unused_region, zone): + if cloud_provider == "GCP": + return _get_vm_reservations(credentials, zone, ttl_hash=_get_ttl_hash()) + else: + raise Exception(f'Unsupported Cloud Provider "{cloud_provider}"') + + @lru_cache def _get_gcp_disk_types( credentials, zone, ttl_hash=None diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index a876b54c85..dc4d7046d0 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -322,12 +322,16 @@ def clean(self): if cleaned_data["dynamic_node_count"] > 0 and not cleaned_data[ "exclusive" ]: - raise ValidationError("If any non-static nodesets have enable placement set to true, exclusive must be true.") + raise ValidationError( + "If any non-static nodesets have enable placement set to true, exclusive must be true." + ) if cleaned_data["static_node_count"] > 0 and cleaned_data[ "exclusive" ]: - raise ValidationError("Can't use static nodes within partition with exclusive set to true.") + raise ValidationError( + "Can't use static nodes within partition with exclusive set to true." + ) # schedmd-slurm-gcp-v6-nodeset/outputs.tf if cleaned_data["reservation_name"] and cleaned_data[ @@ -340,7 +344,66 @@ def clean(self): ] > 0 and cleaned_data[ "dynamic_node_count" ] > 0: - raise ValidationError("Cannot use placement with static and auto-scaling nodes in the same node set.") + raise ValidationError( + "Cannot use placement with static and auto-scaling nodes in the same node set." 
+ ) + + # Reservation validation logic + reservation_name = cleaned_data.get("reservation_name") + if reservation_name: + try: + cluster = cleaned_data.get('cluster') + cloud_credential = cluster.cloud_credential.detail + cloud_zone = cluster.cloud_zone + + # logger.info(f"Cluster: {cluster}") + # logger.info(f"Cloud Credential: {cloud_credential}") + # logger.info(f"Cloud Zone: {cloud_zone}") + + reservations = cloud_info.get_vm_reservations("GCP", cloud_credential, None, cloud_zone) + + if not reservations: + raise ValidationError("No reservations found for the specified zone.") + + matching_reservation = reservations.get(reservation_name) + + if not matching_reservation: + raise ValidationError( + f"Reservation {reservation_name} does not exist in the specified zone." + ) + + if matching_reservation[ + "instanceProperties" + ][ + "machineType" + ] != cleaned_data["machine_type"]: + raise ValidationError( + f"Reservation {reservation_name} does not support the specified machine type. " + f"Machine type: {cleaned_data['machine_type']}." + ) + + total_requested_nodes = cleaned_data["dynamic_node_count"] + cleaned_data["static_node_count"] + available_nodes = matching_reservation.get("instanceProperties", {}).get("availableCount", 0) + + if total_requested_nodes > available_nodes: + raise ValidationError( + f"Reservation {reservation_name} does not have enough available nodes." + f"Requested: {total_requested_nodes}, Available: {available_nodes}" + ) + + specific_reservation = matching_reservation.get("specificReservationRequired") + if specific_reservation == False: + raise ValidationError( + f"You must use a 'specific' reservation type." + f"Please read the following URL for more information about setting up reservations:" + f"https://cloud.google.com/compute/docs/instances/reservations-overview#how-reservations-work" + ) + + except Exception as e: + logger.error(f"Error validating reservation: {reservation_name}. Exception: {e}") + raise ValidationError( + f"Error validating reservation: {reservation_name}. 
Exception: {str(e)}" + ) return cleaned_data diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index 6d49084e69..9951079cf2 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -19,7 +19,7 @@ enable_placement: {{ part.enable_placement }} machine_type: {{ part.machine_type }} {% if part.reservation_name %} - reservation_name: {{ part.reservation_name }} + reservation_name: "projects/{{ cluster.project_id }}/reservations/{{ part.reservation_name }}" {% endif %} node_count_dynamic_max: {{ part.dynamic_node_count }} node_count_static: {{ part.static_node_count }} diff --git a/community/front-end/ofe/website/ghpcfe/views/clusters.py b/community/front-end/ofe/website/ghpcfe/views/clusters.py index 733fade339..eac57a8139 100644 --- a/community/front-end/ofe/website/ghpcfe/views/clusters.py +++ b/community/front-end/ofe/website/ghpcfe/views/clusters.py @@ -468,6 +468,7 @@ def form_valid(self, form): parts = partitions.save() try: + total_nodes_requested = {} for part in parts: part.vCPU_per_node = machine_info[part.machine_type]["vCPU"] // (1 if part.enable_hyperthreads else 2) cpu_count = machine_info[part.machine_type]["vCPU"] @@ -507,6 +508,28 @@ def form_valid(self, form): raise ValidationError( f"Invalid combination: machine_type {part.machine_type} cannot be used with disk_type {disk_type}." ) + + # Sum the total nodes for each reservation + if part.reservation_name: + if part.reservation_name not in total_nodes_requested: + total_nodes_requested[part.reservation_name] = 0 + total_nodes_requested[part.reservation_name] += part.dynamic_node_count + part.static_node_count + + # Validate total requested nodes against available nodes + for reservation_name, requested_nodes in total_nodes_requested.items(): + reservation = cloud_info.get_vm_reservations( + "GCP", + self.object.cloud_credential.detail, + None, + self.object.cloud_zone + ) + matching_reservation = reservation.get(reservation_name) + available_nodes = int(matching_reservation["instanceProperties"].get("availableCount", 0)) + if requested_nodes > available_nodes: + raise ValidationError(f"Reservation {reservation_name} does not have enough available nodes." + f"Requested: {requested_nodes}, Available: {available_nodes}" + ) + except KeyError as err: raise ValidationError("Error in Partition - invalid machine type: " f"{part.machine_type}") from err From bd8fcc735cf9b63acce7684bc787c79d27514025 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:11:00 +0100 Subject: [PATCH 023/180] OFE: Update bootstrap_compute.sh Lustre client install fix no longer needed --- .../cluster_startup/templates/bootstrap_compute.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh index 5429a358f6..3c2ef0e46e 100644 --- a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh +++ b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_compute.sh @@ -13,10 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#### Workaround for https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2733 -sed -i 's,latest-release,lustre-2.15.4,' /etc/yum.repos.d/lustre-client.repo -#### - # shellcheck disable=SC1083 BUCKET={{ server_bucket }} CLUSTER_ID={{ cluster.id }} From 1065819250fe7ca0815273ee444c28f31341c600 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:11:19 +0100 Subject: [PATCH 024/180] OFE: Update bootstrap_controller.sh Lustre client install fix no longer needed --- .../cluster_startup/templates/bootstrap_controller.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh index f334964c05..6fe853e9ba 100644 --- a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh +++ b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_controller.sh @@ -13,10 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -#### Workaround for https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2733 -sed -i 's,latest-release,lustre-2.15.4,' /etc/yum.repos.d/lustre-client.repo -#### - # shellcheck disable=SC1083 BUCKET={{ server_bucket }} CLUSTER_ID={{ cluster.id }} From 5917b9e887c8dae15cfcdeb3a39d7eed48c5beba Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius <117852832+ek-nag@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:11:35 +0100 Subject: [PATCH 025/180] OFE: Update bootstrap_login.sh Lustre client install fix no longer needed --- .../cluster_startup/templates/bootstrap_login.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh index c1bfd3b91a..db4fcc92db 100644 --- a/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh +++ b/community/front-end/ofe/infrastructure_files/cluster_startup/templates/bootstrap_login.sh @@ -13,10 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#### Workaround for https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2733 -sed -i 's,latest-release,lustre-2.15.4,' /etc/yum.repos.d/lustre-client.repo -#### - # shellcheck disable=SC1083 BUCKET={{ server_bucket }} CLUSTER_ID={{ cluster.id }} From e7be86867298905024c9d35ffda5db3f063385c4 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Tue, 16 Jul 2024 11:47:08 +0100 Subject: [PATCH 026/180] OFE: use TF 1.4 in startup script --- .../ofe/infrastructure_files/gcs_bucket/webserver/startup.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index ecce1e3a32..6343453e2f 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -48,9 +48,10 @@ printf "####################\n#### Installing required packages\n############### dnf install -y epel-release dnf update -y --security dnf config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo +dnf install -y terraform-1.4.6 dnf install --best -y google-cloud-sdk nano make gcc python38-devel unzip git \ rsync wget nginx bind-utils policycoreutils-python-utils \ - terraform packer supervisor python3-certbot-nginx jq + packer supervisor python3-certbot-nginx jq curl --silent --show-error --location https://github.com/mikefarah/yq/releases/download/v4.13.4/yq_linux_amd64 --output /usr/local/bin/yq chmod +x /usr/local/bin/yq curl --silent --show-error --location https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz --output /tmp/shellcheck.tar.xz From cc5eb260a56479d750019096abe7a2c188a673e0 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Tue, 16 Jul 2024 11:48:09 +0100 Subject: [PATCH 027/180] OFE: update to show valid disk options for instance types --- .../ghpcfe/cluster_manager/cloud_info.py | 100 ++++++++++++++++++ .../ghpcfe/templates/cluster/update_form.html | 84 ++++++++++----- 2 files changed, 156 insertions(+), 28 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index f88808f81e..91d3d9a840 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -166,6 +166,10 @@ def _get_gcp_machine_types( ): # pylint: disable=unused-argument (project, client) = _get_gcp_client(credentials) + # Fetch disk types dynamically + disk_types = _get_gcp_disk_types(credentials, zone, ttl_hash=ttl_hash) + disk_type_names = [disk_type["name"] for disk_type in disk_types] + req = client.machineTypes().list( project=project, zone=zone, filter="isSharedCpu=False" ) @@ -174,6 +178,99 @@ def _get_gcp_machine_types( if "items" not in resp: return [] + invalid_disk_types = { + "c4-": [ + "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", + "pd-extreme", "hyperdisk-ml", "hyperdisk-balanced" + ], + "c3-": [ + "pd-extreme", "pd-standard", "hyperdisk-ml" + ], + "c3d-": [ + "pd-standard", "pd-extreme", "hyperdisk-extreme" + ], + "n4-": [ + "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", + "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", + "hyperdisk-throughput" + ], + "n2-": [ + "hyperdisk-balanced", "hyperdisk-ml" + ], + "n2d-": [ + "pd-extreme", "hyperdisk-ml", "hyperdisk-balanced", + 
"hyperdisk-extreme" + ], + "n1-": [ + "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", + "hyperdisk-throughput", "hyperdisk-balanced" + ], + "td2-": [ + "pd-extreme", "local-ssd", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme" + ], + "t2a-": [ + "local-ssd", "pd-extreme", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-throughput" + ], + "e2-": [ + "local-ssd", "pd-extreme", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-throughput" + ], + "z3-": [ + "pd-extreme", "pd-standard", "hyperdisk-throughput", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-balanced" + ], + "h3-": [ + "local-ssd", "pd-standard", "pd-ssd", "pd-extreme", + "hyperdisk-ml", "hyperdisk-extreme" + ], + "c2-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-extreme", + "hyperdisk-ml", "hyperdisk-throughput" + ], + "c2d-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-extreme", + "hyperdisk-ml", "hyperdisk-throughput" + ], + "x4-": [ + "local-ssd", "pd-ssd", "pd-standard", "pd-balanced", + "pd-extreme", "hyperdisk-ml", "hyperdisk-throughput" + ], + "m3-": [ + "hyperdisk-throughput", "hyperdisk-ml", "pd-standard" + ], + "m2-": [ + "local-ssd", "hyperdisk-ml", "hyperdisk-throughput" + ], + "m1-": [ + "hyperdisk-ml", "hyperdisk-throughput" + ], + "n1-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-ml", + "hyperdisk-extreme", "hyperdisk-throughput" + ], + "a3-": [ + "local-ssd", "pd-extreme", "pd-standard", + "hyperdisk-balanced" + ], + "a2-": [ + "pd-extreme", "hyperdisk-throughput", "hyperdisk-ml", + "hyperdisk-balanced" + ], + "g2-": [ + "pd-extreme", "pd-ssd", "hyperdisk-balanced", + "hyperdisk-extreme" + ] + } + + def get_invalid_disk_types(machine_type_name): + family = machine_type_name.split("-")[0] + "-" + return invalid_disk_types.get(family, []) + data = { mt["name"]: { "name": mt["name"], @@ -188,6 +285,7 @@ def _get_gcp_machine_types( } for acc in mt.get("accelerators", []) }, + "invalid_disk_types": get_invalid_disk_types(mt["name"]) } for mt in resp["items"] } @@ -224,6 +322,8 @@ def _get_gcp_machine_types( items[0]["description"] ) + # logger.info(data) + return data diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 663b0bdf85..7cbf1abaa1 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -456,36 +456,64 @@

var formRow = $(this); var machineTypeSelect = formRow.find('.machine_type_select'); var machineType = machineTypeSelect.val(); + + var invalidDiskTypes = []; + + // Fetch the invalid disk types for the selected machine type + $.ajax({ + url: "{% url 'api-instancetype-list' %}" + machineType + "/?cluster={{ object.id }}®ion=" + region + "&zone=" + zone, + type: "GET", + dataType: "json", + async: false, // To ensure we get the data before proceeding + headers: { 'X-CSRFToken': $.cookie("csrftoken") } + }).done(function(machineData) { + invalidDiskTypes = machineData.invalid_disk_types || []; + }); formRow.find(".disk_type_select").each(function (pos, selObj) { - var curVal = selObj.value; - $(selObj).empty(); - - if (machineType && machineType.startsWith('c4-')) { - var option = document.createElement("option"); - option.text = "Hyperdisk Balanced Persistent Disk"; - option.setAttribute("value", "hyperdisk-balanced"); - selObj.appendChild(option); - } else { - var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); - if (additionalDisk.endsWith("additional")) { - $.each(data.disks, function (i, disk_info) { - var option = document.createElement("option"); - option.text = disk_info.description; - option.setAttribute("value", disk_info.name); - selObj.appendChild(option); - }); - } else { - $.each(data.disks, function (i, disk_info) { - if (disk_info.name === 'local-ssd' || disk_info.name.startsWith("pd-")) { - var option = document.createElement("option"); - option.text = disk_info.description; - option.setAttribute("value", disk_info.name); - selObj.appendChild(option); - } - }); - } - } + var curVal = selObj.value; + $(selObj).empty(); + + if (machineType && + (machineType.startsWith('c4-') || + machineType.startsWith('n4-') || + machineType.startsWith('x4-'))) { + var option = document.createElement("option"); + option.text = "Hyperdisk Balanced Persistent Disk"; + option.setAttribute("value", "hyperdisk-balanced"); + selObj.appendChild(option); + + if (machineType.startsWith('c4-') || + machineType.startsWith('x4-')) { + var option = document.createElement("option"); + option.text = "Hyperdisk Extreme Persistent Disk"; + option.setAttribute("value", "hyperdisk-extreme"); + selObj.appendChild(option); + } + } else { + var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); + if (additionalDisk.endsWith("additional")) { + $.each(data.disks, function (i, disk_info) { + if (invalidDiskTypes.indexOf(disk_info.name) === -1) { + var option = document.createElement("option"); + option.text = disk_info.description; + option.setAttribute("value", disk_info.name); + selObj.appendChild(option); + } + }); + } else { + $.each(data.disks, function (i, disk_info) { + if ((disk_info.name === 'local-ssd' || + disk_info.name.startsWith("pd-")) && + invalidDiskTypes.indexOf(disk_info.name) === -1) { + var option = document.createElement("option"); + option.text = disk_info.description; + option.setAttribute("value", disk_info.name); + selObj.appendChild(option); + } + }); + } + } var id_prefix = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); var disk_size_sel = $(selObj).parentsUntil("tbody").find("#" + id_prefix + "_disk_size")[0]; From b8931926f30baee7159d669e88ee453154277f54 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Wed, 17 Jul 2024 14:27:44 +0100 Subject: [PATCH 028/180] OFE: indentation --- .../ofe/infrastructure_files/gcs_bucket/webserver/startup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
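Reviewer note on the disk-type filtering introduced above: the mapping added to cloud_info.py keys unsupported disk types off the machine-family prefix of the machine type name. The standalone Python sketch below illustrates that validation pattern only; the family table is a trimmed, illustrative subset of the mapping in the patch, and validate_disk_choice is a hypothetical helper, not part of the OFE codebase.

# Minimal sketch of family-based disk-type validation (illustrative data only).
INVALID_DISK_TYPES = {
    "c4-": ["local-ssd", "pd-standard", "pd-balanced", "pd-ssd"],
    "n4-": ["local-ssd", "pd-standard", "pd-balanced", "pd-ssd"],
    "n2-": ["hyperdisk-balanced", "hyperdisk-ml"],
}


def get_invalid_disk_types(machine_type):
    """Return disk types that cannot be attached to this machine family."""
    family = machine_type.split("-")[0] + "-"
    return INVALID_DISK_TYPES.get(family, [])


def validate_disk_choice(machine_type, disk_type):
    """Raise ValueError when the requested pairing is unsupported."""
    if disk_type in get_invalid_disk_types(machine_type):
        raise ValueError(f"{disk_type} cannot be used with {machine_type}")


if __name__ == "__main__":
    validate_disk_choice("n2-standard-4", "pd-ssd")      # supported pairing
    try:
        validate_disk_choice("c4-standard-8", "pd-ssd")  # filtered out for C4
    except ValueError as err:
        print(err)

Keying the check on the name prefix keeps it independent of per-zone API responses, at the cost of manual updates when new machine families or disk types appear.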
a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index 6343453e2f..e4d9b942cc 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -50,7 +50,7 @@ dnf update -y --security dnf config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo dnf install -y terraform-1.4.6 dnf install --best -y google-cloud-sdk nano make gcc python38-devel unzip git \ - rsync wget nginx bind-utils policycoreutils-python-utils \ + rsync wget nginx bind-utils policycoreutils-python-utils \ packer supervisor python3-certbot-nginx jq curl --silent --show-error --location https://github.com/mikefarah/yq/releases/download/v4.13.4/yq_linux_amd64 --output /usr/local/bin/yq chmod +x /usr/local/bin/yq From 0420fd9cc23788ec735d4eb89019067b5217b970 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Wed, 17 Jul 2024 15:00:50 +0100 Subject: [PATCH 029/180] OFE: indentation --- .../ofe/infrastructure_files/gcs_bucket/webserver/startup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index e4d9b942cc..cac335eb96 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -50,8 +50,8 @@ dnf update -y --security dnf config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo dnf install -y terraform-1.4.6 dnf install --best -y google-cloud-sdk nano make gcc python38-devel unzip git \ - rsync wget nginx bind-utils policycoreutils-python-utils \ - packer supervisor python3-certbot-nginx jq + rsync wget nginx bind-utils policycoreutils-python-utils \ + packer supervisor python3-certbot-nginx jq curl --silent --show-error --location https://github.com/mikefarah/yq/releases/download/v4.13.4/yq_linux_amd64 --output /usr/local/bin/yq chmod +x /usr/local/bin/yq curl --silent --show-error --location https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz --output /tmp/shellcheck.tar.xz From 4482135efdfe27891cd27873d60c4d16aec7afb0 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Wed, 17 Jul 2024 16:14:32 +0100 Subject: [PATCH 030/180] OFE: indentation --- .../roles/c2_daemon/tasks/main.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml index 3129584ba7..8a5d8d4724 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml @@ -38,14 +38,14 @@ - name: Install FE C&C Dependencies ansible.builtin.pip: name: - - requests - - pexpect - - google-cloud-storage - - google-cloud-pubsub - - addict - - google-api-python-client - - google-cloud-secret-manager - - prometheus_client + - requests + - pexpect + - google-cloud-storage + - google-cloud-pubsub + - addict + - google-api-python-client + - 
google-cloud-secret-manager + - prometheus_client state: present - name: Install FE C&C Daemon From c15c6fb2396982e9140b9f59445cc59df8d0926b Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Thu, 18 Jul 2024 12:40:30 +0100 Subject: [PATCH 031/180] OFE: tweak to valid drive types --- .../ghpcfe/cluster_manager/cloud_info.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index 91d3d9a840..c6b8427394 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -181,10 +181,10 @@ def _get_gcp_machine_types( invalid_disk_types = { "c4-": [ "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", - "pd-extreme", "hyperdisk-ml", "hyperdisk-balanced" + "pd-extreme", "hyperdisk-ml", "hyperdisk-throughput" ], "c3-": [ - "pd-extreme", "pd-standard", "hyperdisk-ml" + "pd-extreme", "pd-standard" ], "c3d-": [ "pd-standard", "pd-extreme", "hyperdisk-extreme" @@ -205,7 +205,7 @@ def _get_gcp_machine_types( "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", "hyperdisk-throughput", "hyperdisk-balanced" ], - "td2-": [ + "t2d-": [ "pd-extreme", "local-ssd", "hyperdisk-balanced", "hyperdisk-ml", "hyperdisk-extreme" ], @@ -220,9 +220,8 @@ def _get_gcp_machine_types( "hyperdisk-throughput" ], "z3-": [ - "pd-extreme", "pd-standard", "hyperdisk-throughput", - "hyperdisk-ml", "hyperdisk-extreme", - "hyperdisk-balanced" + "pd-extreme", "pd-standard", "hyperdisk-balanced", + "hyperdisk-ml" ], "h3-": [ "local-ssd", "pd-standard", "pd-ssd", "pd-extreme", @@ -254,15 +253,15 @@ def _get_gcp_machine_types( "hyperdisk-extreme", "hyperdisk-throughput" ], "a3-": [ - "local-ssd", "pd-extreme", "pd-standard", + "pd-extreme", "pd-standard", "hyperdisk-balanced" ], "a2-": [ - "pd-extreme", "hyperdisk-throughput", "hyperdisk-ml", - "hyperdisk-balanced" + "pd-extreme", "hyperdisk-throughput", + "hyperdisk-balanced", "hyperdisk-extreme" ], "g2-": [ - "pd-extreme", "pd-ssd", "hyperdisk-balanced", + "pd-extreme", "pd-standard", "hyperdisk-balanced", "hyperdisk-extreme" ] } From f8bf9052ac2e29f0a9b27f9f9acfc8107b9a3ff9 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Fri, 26 Jul 2024 11:26:09 +0100 Subject: [PATCH 032/180] OFE: tweak to valid drive types --- .../ofe/website/ghpcfe/templates/cluster/update_form.html | 8 -------- 1 file changed, 8 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 7cbf1abaa1..32e822ade2 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -482,14 +482,6 @@

{{ title }}

option.text = "Hyperdisk Balanced Persistent Disk"; option.setAttribute("value", "hyperdisk-balanced"); selObj.appendChild(option); - - if (machineType.startsWith('c4-') || - machineType.startsWith('x4-')) { - var option = document.createElement("option"); - option.text = "Hyperdisk Extreme Persistent Disk"; - option.setAttribute("value", "hyperdisk-extreme"); - selObj.appendChild(option); - } } else { var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); if (additionalDisk.endsWith("additional")) { From 3e44effdc3bf60676690d95de9ebd96610a8266f Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 6 Aug 2024 17:35:05 -0700 Subject: [PATCH 033/180] Override machine type to reduce chance of stockout --- examples/ps-slurm.yaml | 3 ++- tools/cloud-build/daily-tests/tests/ps-slurm.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml index 4a28802924..d97b142298 100644 --- a/examples/ps-slurm.yaml +++ b/examples/ps-slurm.yaml @@ -24,6 +24,7 @@ vars: deployment_name: parallelstore-slurm region: us-east4 zone: us-east4-b + compute_node_machine_type: c2-standard-60 deployment_groups: - group: primary @@ -46,7 +47,7 @@ deployment_groups: use: [network] settings: node_count_dynamic_max: 4 - machine_type: c2-standard-60 + machine_type: $(vars.compute_node_machine_type) enable_placement: false # the default is: true - id: debug_partition diff --git a/tools/cloud-build/daily-tests/tests/ps-slurm.yml b/tools/cloud-build/daily-tests/tests/ps-slurm.yml index f487f4bde5..eea04f27c8 100644 --- a/tools/cloud-build/daily-tests/tests/ps-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ps-slurm.yml @@ -24,6 +24,7 @@ slurm_cluster_name: "psslurm{{ build[0:3] }}" cli_deployment_vars: region: "{{ region }}" zone: "{{ zone }}" + compute_node_machine_type: c2-standard-4 # Note: Pattern matching in gcloud only supports 1 wildcard. login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" From e753c788cbf97f3fed71159cfa77500bb3525b95 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 1 Aug 2024 23:29:50 +0000 Subject: [PATCH 034/180] Add `sort_nodes.py` --- .../modules/slurm_files/scripts/sort_nodes.py | 186 ++++++++++++++++++ .../scripts/tests/test_topology.py | 14 ++ 2 files changed, 200 insertions(+) create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py new file mode 100755 index 0000000000..d302453254 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This script sorts nodes based on their `physicalHost`. + +See https://cloud.google.com/compute/docs/instances/use-compact-placement-policies + +You can reduce latency in tightly coupled HPC workloads (including distributed ML training) +by deploying them to machines that are located close together. +For example, if you deploy your workload on a single physical rack, you can expect lower latency +than if your workload is spread across multiple racks. +Sending data across multiple rack requires sending data through additional network switches. + +Example usage: +``` my_sbatch.sh +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes=64 + +export SLURM_HOSTFILE=$(/slurm/scripts/sort_nodes.py) + +srun -l hostname | sort +``` +""" +import os +import subprocess +import uuid +from typing import List, Optional, Dict, Callable + +def order(paths: List[List[str]]) -> List[str]: + """ + Orders the leafs of the tree in a way that minimizes the sum of distance in between + each pair of neighboring nodes in the resulting order. + The resulting order will always start from the first node in the input list. + The ordering is "stable" with respect to the input order of the leafs i.e. + given a choice between two nodes (identical in other ways) it will select "nodelist-smallest" one. + """ + if not paths: return [] + # lookup table for node names order + name_idx = {p[-1]: i for i, p in enumerate(paths)} + class Vert: + "Represents a vertex in a *network* tree." + def __init__(self, name: str, parent: "Vert"): + self.name = name + self.parent = parent + self.children = {} + self.lex_key: int = -1 + + # build a tree + root = Vert("", None) + for path in paths: + n = root + for v in path: + if v not in n.children: + n.children[v] = Vert(v, n) + n = n.children[v] + + def post_order_traversal(v: Vert, f: Callable[[Vert], None]) -> None: + # freeze order, since lex_key can be mutated + children = list(sorted(v.children.values(), key=lambda u: u.lex_key)) + for u in children: + post_order_traversal(u, f) + f(v) + + # propagate "lexicographical key" from the leaves to the root + def lex_key_prop(v: Vert) -> None: + if not v.children: # this is a node + v.lex_key = name_idx[v.name] + else: + v.lex_key = min(u.lex_key for u in v.children.values()) + post_order_traversal(root, lex_key_prop) + + # gather leafs + result = [] + def collect_nodes(v: Vert) -> None: + if not v.children: + result.append(v.name) + post_order_traversal(root, collect_nodes) + + return result + + +class Instance: + def __init__(self, name: str, zone: str, physical_host: Optional[str]): + self.name = name + self.zone = zone + self.physical_host = physical_host + + +def make_path(node_name: str, inst: Optional[Instance]) -> List[str]: + if not inst: # node with unknown instance (e.g. hybrid cluster) + return ["unknown", node_name] + zone = f"zone_{inst.zone}" + if not inst.physical_host: # node without physical host info (e.g. 
no placement policy) + return [zone, "unknown", node_name] + + assert inst.physical_host.startswith("/"), f"Unexpected physicalHost: {inst.physical_host}" + parts = inst.physical_host[1:].split("/") + if len(parts) >= 4: + return [*parts, node_name] + elif len(parts) == 3: + return [zone, *parts, node_name] # add zone + + raise ValueError(f"Unexpected physicalHost: {inst.physical_host}") + + +def to_hostnames(nodelist: str) -> List[str]: + cmd = ["scontrol", "show", "hostnames", nodelist] + out = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout + return [n.decode("utf-8") for n in out.splitlines()] + + +def get_instances(node_names: List[str]) -> Dict[str, object]: + fmt = ( + "--format=csv[no-heading,separator=','](zone,resourceStatus.physicalHost,name)" + ) + cmd = ["gcloud", "compute", "instances", "list", fmt] + + scp = os.path.commonprefix(node_names) + if scp: + cmd.append(f"--filter=name~'{scp}.*'") + out = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout + d = {} + for line in out.splitlines(): + zone, physical_host, name = line.decode("utf-8").split(",") + d[name] = Instance(name, zone, physical_host) + return {n: d.get(n) for n in node_names} + + +def main(args) -> None: + nodelist = args.nodelist or os.getenv("SLURM_NODELIST") + if not nodelist: + raise ValueError("nodelist is not provided and SLURM_NODELIST is not set") + + if args.ntasks_per_node is None: + args.ntasks_per_node = int(os.getenv("SLURM_NTASKS_PER_NODE", "") or 1) + assert args.ntasks_per_node > 0 + + output = args.output or f"hosts.{uuid.uuid4()}" + + node_names = to_hostnames(nodelist) + instannces = get_instances(node_names) + paths = [make_path(n, instannces[n]) for n in node_names] + ordered = order(paths) + + with open(output, "w") as f: + for node in ordered: + for _ in range(args.ntasks_per_node): + f.write(node) + f.write("\n") + print(output) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "--nodelist", + type=str, + help="Slurm 'hostlist expression' of nodes to sort, if not set the value of SLURM_NODELIST environment variable will be used", + ) + parser.add_argument( + "--ntasks-per-node", + type=int, + help="""Number of times to repeat each node in resulting sorted list. +If not set, the value of SLURM_NTASKS_PER_NODE environment variable will be used, +if neither is set, defaults to 1""", + ) + parser.add_argument( + "--output", type=str, help="Output file to write, defaults to 'hosts.'" + ) + args = parser.parse_args() + main(args) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index 3dc86dcd21..62c33e52cc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
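    A minimal sketch of the ordering behaviour may help when reading this patch: it mirrors the insertion-ordered tree walk in `order()` on made-up `physicalHost`-style paths, and the sample input and expected result deliberately match the unit test added below. The helper name `order_sketch` is only for illustration and is not part of the module.

    ```python
    # Minimal sketch mirroring the insertion-ordered tree walk in sort_nodes.order().
    # The paths below are made-up examples, not real physicalHost values.
    from collections import OrderedDict

    def order_sketch(paths):
        # Build a tree whose children preserve insertion (nodelist) order.
        root = OrderedDict()
        for *prefix, node in paths:
            level = root
            for part in prefix:
                level = level.setdefault(part, OrderedDict())
            level[node] = None  # leaf = Slurm node name

        def walk(level):
            out = []
            for name, children in level.items():
                if children:                # internal vertex: recurse in insertion order
                    out.extend(walk(children))
                else:                       # leaf
                    out.append(name)
            return out

        return walk(root)

    paths = [p.split("/") for p in ["y/n-0", "z/n-1", "x/n-2", "x/n-3", "y/n-4", "g/n-10"]]
    print(order_sketch(paths))  # ['n-0', 'n-4', 'n-1', 'n-2', 'n-3', 'n-10']
    ```

    Nodes that share a path prefix (here `y/n-0` and `y/n-4`) end up adjacent in the output, which is the property the hostfile relies on.
    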
+import pytest import mock from common import TstCfg, TstNodeset, TstTPU, make_to_hostnames_mock +import sort_nodes import util import conf @@ -89,3 +91,15 @@ def tpu_se(ns: TstNodeset) -> TstTPU: conf.gen_topology_conf(util.Lookup(cfg)) want_written = PRELUDE + "\n".join(want_compressed) + "\n\n" assert open(cfg.output_dir + "/cloud_topology.conf").read() == want_written + + + +@pytest.mark.parametrize( + "paths,expected", + [ + (["z/n-0", "z/n-1", "z/n-2", "z/n-3", "z/n-4", "z/n-10"], ['n-0', 'n-1', 'n-2', 'n-3', 'n-4', 'n-10']), + (["y/n-0", "z/n-1", "x/n-2", "x/n-3", "y/n-4", "g/n-10"], ['n-0', 'n-4', 'n-1', 'n-2', 'n-3', 'n-10']), + ]) +def test_sort_nodes_order(paths: list[list[str]], expected: list[str]) -> None: + paths = [l.split("/") for l in paths] + assert sort_nodes.order(paths) == expected From bcbb5e5856c15f08c48f7e98d15ae76331caf070 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 6 Aug 2024 02:02:22 +0000 Subject: [PATCH 035/180] Simplify traversals --- .../modules/slurm_files/scripts/sort_nodes.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py index d302453254..3b17a196ba 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py @@ -67,28 +67,25 @@ def __init__(self, name: str, parent: "Vert"): n.children[v] = Vert(v, n) n = n.children[v] - def post_order_traversal(v: Vert, f: Callable[[Vert], None]) -> None: - # freeze order, since lex_key can be mutated - children = list(sorted(v.children.values(), key=lambda u: u.lex_key)) - for u in children: - post_order_traversal(u, f) - f(v) - # propagate "lexicographical key" from the leaves to the root - def lex_key_prop(v: Vert) -> None: + def set_lex_key(v: Vert) -> None: if not v.children: # this is a node v.lex_key = name_idx[v.name] else: + for u in v.children.values(): + set_lex_key(u) v.lex_key = min(u.lex_key for u in v.children.values()) - post_order_traversal(root, lex_key_prop) + set_lex_key(root) # gather leafs result = [] - def collect_nodes(v: Vert) -> None: - if not v.children: + def gather_nodes(v: Vert) -> None: + if not v.children: # this is a node result.append(v.name) - post_order_traversal(root, collect_nodes) + for u in v.children.values(): + gather_nodes(u) + gather_nodes(root) return result From c71683d2c817446a4d57c77259877f801768142c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Wed, 17 Jul 2024 14:52:37 +0000 Subject: [PATCH 036/180] Add documentation for exclusive mode --- .../compute/schedmd-slurm-gcp-v6-partition/README.md | 4 ++-- .../compute/schedmd-slurm-gcp-v6-partition/variables.tf | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 33beb3418f..bdc41245bc 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -82,7 +82,7 @@ No resources. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | +| [exclusive](#input\_exclusive) | Exclusive job access to nodes. When set to true, only one job can be scheduled on
compute node and nodes are put into power save mode after job finishes. If set to false,
multiple jobs can be scheduled on one node and node is shutdown after `var.suspend_time`
in idle state. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | DEPRECATED |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [nodeset](#input\_nodeset) | A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset | `list(any)` | `[]` | no | @@ -91,7 +91,7 @@ No resources. | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | | [resume\_timeout](#input\_resume\_timeout) | Maximum time permitted (in seconds) between when a node resume request is issued and when the node is actually available for use.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'ResumeTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout_1 for details. | `number` | `300` | no | -| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend. | `number` | `300` | no | +| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend.
NOTE 2: if `var.exclusive` is set to true (default), nodes are placed into power save mode directly after job finishes. | `number` | `300` | no | | [suspend\_timeout](#input\_suspend\_timeout) | Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'SuspendTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout_1 for details. | `number` | `null` | no | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index 807ad3847e..7fae6226cb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -41,7 +41,12 @@ variable "is_default" { } variable "exclusive" { - description = "Exclusive job access to nodes." + description = <<-EOD + Exclusive job access to nodes. When set to true, only one job can be scheduled on + compute node and nodes are put into power save mode after job finishes. If set to false, + multiple jobs can be scheduled on one node and node is shutdown after `var.suspend_time` + in idle state. + EOD type = bool default = true } @@ -140,6 +145,7 @@ variable "suspend_time" { This sets 'SuspendTime' in partition_conf. See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details. NOTE: use value -1 to exclude partition from suspend. + NOTE 2: if `var.exclusive` is set to true (default), nodes are placed into power save mode directly after job finishes. EOD type = number default = 300 From f961d64d86845babe253d8cbfed71794f9d91e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Wed, 7 Aug 2024 09:12:06 +0000 Subject: [PATCH 037/180] Review fixes --- .../compute/schedmd-slurm-gcp-v6-partition/README.md | 4 ++-- .../compute/schedmd-slurm-gcp-v6-partition/variables.tf | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index bdc41245bc..e68899def7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -82,7 +82,7 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [exclusive](#input\_exclusive) | Exclusive job access to nodes. When set to true, only one job can be scheduled on
compute node and nodes are put into power save mode after job finishes. If set to false,
multiple jobs can be scheduled on one node and node is shutdown after `var.suspend_time`
in idle state. | `bool` | `true` | no | +| [exclusive](#input\_exclusive) | Exclusive job access to nodes. When set to true nodes execute single job and are deleted
after job exits. If set to false, multiple jobs can be scheduled on one node. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | DEPRECATED |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [nodeset](#input\_nodeset) | A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset | `list(any)` | `[]` | no | @@ -91,7 +91,7 @@ No resources. | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | | [resume\_timeout](#input\_resume\_timeout) | Maximum time permitted (in seconds) between when a node resume request is issued and when the node is actually available for use.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'ResumeTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout_1 for details. | `number` | `300` | no | -| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend.
NOTE 2: if `var.exclusive` is set to true (default), nodes are placed into power save mode directly after job finishes. | `number` | `300` | no | +| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend.
NOTE 2: if `var.exclusive` is set to true (default), nodes are deleted immediately after job finishes. | `number` | `300` | no | | [suspend\_timeout](#input\_suspend\_timeout) | Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'SuspendTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout_1 for details. | `number` | `null` | no | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index 7fae6226cb..e14e44b02a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -42,10 +42,8 @@ variable "is_default" { variable "exclusive" { description = <<-EOD - Exclusive job access to nodes. When set to true, only one job can be scheduled on - compute node and nodes are put into power save mode after job finishes. If set to false, - multiple jobs can be scheduled on one node and node is shutdown after `var.suspend_time` - in idle state. + Exclusive job access to nodes. When set to true nodes execute single job and are deleted + after job exits. If set to false, multiple jobs can be scheduled on one node. EOD type = bool default = true @@ -145,7 +143,7 @@ variable "suspend_time" { This sets 'SuspendTime' in partition_conf. See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details. NOTE: use value -1 to exclude partition from suspend. - NOTE 2: if `var.exclusive` is set to true (default), nodes are placed into power save mode directly after job finishes. + NOTE 2: if `var.exclusive` is set to true (default), nodes are deleted immediately after job finishes. EOD type = number default = 300 From 29e4ef75d8e1e4e368f8f621db02742821902ced Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 7 Aug 2024 20:44:00 +0000 Subject: [PATCH 038/180] Address comments --- .../modules/slurm_files/scripts/sort_nodes.py | 38 +++++++------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py index 3b17a196ba..bfac018830 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py @@ -37,28 +37,30 @@ import os import subprocess import uuid -from typing import List, Optional, Dict, Callable +from typing import List, Optional, Dict +from collections import OrderedDict def order(paths: List[List[str]]) -> List[str]: """ - Orders the leafs of the tree in a way that minimizes the sum of distance in between + Orders the leaves of the tree in a way that minimizes the sum of distance in between each pair of neighboring nodes in the resulting order. The resulting order will always start from the first node in the input list. - The ordering is "stable" with respect to the input order of the leafs i.e. + The ordering is "stable" with respect to the input order of the leaves i.e. given a choice between two nodes (identical in other ways) it will select "nodelist-smallest" one. + + Returns a list of nodenames, ordered as described above. """ if not paths: return [] - # lookup table for node names order - name_idx = {p[-1]: i for i, p in enumerate(paths)} class Vert: "Represents a vertex in a *network* tree." 
def __init__(self, name: str, parent: "Vert"): self.name = name self.parent = parent - self.children = {} - self.lex_key: int = -1 + # Use `OrderedDict` to preserve insertion order + # TODO: once we move to Python 3.7+ use regular `dict` since it has the same guarantee + self.children = OrderedDict() - # build a tree + # build a tree, children are ordered by insertion order root = Vert("", None) for path in paths: n = root @@ -67,24 +69,13 @@ def __init__(self, name: str, parent: "Vert"): n.children[v] = Vert(v, n) n = n.children[v] - # propagate "lexicographical key" from the leaves to the root - def set_lex_key(v: Vert) -> None: - if not v.children: # this is a node - v.lex_key = name_idx[v.name] - else: - for u in v.children.values(): - set_lex_key(u) - v.lex_key = min(u.lex_key for u in v.children.values()) - set_lex_key(root) - - # gather leafs + # walk the tree in insertion order, gather leaves result = [] def gather_nodes(v: Vert) -> None: - if not v.children: # this is a node + if not v.children: # this is a Slurm node result.append(v.name) for u in v.children.values(): gather_nodes(u) - gather_nodes(root) return result @@ -107,10 +98,7 @@ def make_path(node_name: str, inst: Optional[Instance]) -> List[str]: parts = inst.physical_host[1:].split("/") if len(parts) >= 4: return [*parts, node_name] - elif len(parts) == 3: - return [zone, *parts, node_name] # add zone - - raise ValueError(f"Unexpected physicalHost: {inst.physical_host}") + return [zone, *parts, node_name] def to_hostnames(nodelist: str) -> List[str]: From 97d3bb356e6e299f8b0520bf0a48f464ab8c2428 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 7 Aug 2024 14:38:59 -0700 Subject: [PATCH 039/180] Add section "Standard PR Response Times" to CONTRIBUTING.md --- .github/pull_request_template.md | 2 ++ CONTRIBUTING.md | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f3c224983f..19483cac61 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,5 +1,7 @@ ### Submission Checklist +NOTE: Pull requests can take up to 2 weeks to be reviewed. + Please take the following actions before submitting this pull request. * Fork your PR branch from the Toolkit "develop" branch (not main) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6272489dae..c05ae18e32 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,11 @@ again. All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more -information on using pull requests. +information on pull requests. + +### Standard PR Response Times + +Pull requests can take up to 2 weeks to be reviewed. ## Community Guidelines From 06d6220f48417e71859c745cee3d0e8926ab4dd9 Mon Sep 17 00:00:00 2001 From: annuay Date: Thu, 8 Aug 2024 06:50:08 +0000 Subject: [PATCH 040/180] Run pytest module as an executable. 
There is no command line setup (for example, run as an alias) for pytest on the workstation machines --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7cecfd5809..42b6a6f041 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,7 +76,7 @@ repos: require_serial: true - id: pytest-check name: pytest-check - entry: pytest + entry: python -m pytest language: system types: [python] pass_filenames: false From c363099b4d5b50945a0ca206e89075e9df755eb0 Mon Sep 17 00:00:00 2001 From: annuay Date: Thu, 8 Aug 2024 09:53:06 +0000 Subject: [PATCH 041/180] Update makefile to install pytest and dependencies required by tests --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 1222be60d1..b9e4ef089a 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,9 @@ install-dev-deps: warn-terraform-version warn-packer-version check-pre-commit ch go install mvdan.cc/sh/v3/cmd/shfmt@latest go install golang.org/x/tools/cmd/goimports@latest go install honnef.co/go/tools/cmd/staticcheck@latest + pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt + pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt + pip install pytest # RULES SUPPORTING THE ABOVE From e8005f1e14e6710616095af0ea7d1fb90775d0dd Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 8 Aug 2024 15:31:53 +0000 Subject: [PATCH 042/180] pre-commit fixes --- modules/compute/gke-node-pool/README.md | 2 ++ modules/compute/gke-node-pool/main.tf | 6 +++++ modules/compute/gke-node-pool/variables.tf | 28 ++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 2daf69a794..e21a5f141a 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -239,9 +239,11 @@ No modules. | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | +| [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `null` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [specific\_reservation](#input\_specific\_reservation) | Reservation resources to consume when targeting SPECIFIC\_RESERVATION. Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value. |
object({
key = string
values = list(string)
})
|
{
"key": null,
"values": null
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | | [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 551ba1f5a5..f22dc7beb6 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -150,6 +150,12 @@ resource "google_container_node_pool" "node_pool" { "net.ipv4.tcp_wmem" = "4096 16384 16777216" } } + + reservation_affinity { + consume_reservation_type = var.reservation_type + key = var.specific_reservation.key + values = var.specific_reservation.values + } } timeouts { diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 4cb4bf0af1..4bec7e3c35 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -265,3 +265,31 @@ variable "service_account" { error_message = "service_account is deprecated and replaced with service_account_email and scopes." } } + +variable "reservation_type" { + description = "Type of reservation to consume" + type = string + default = null + + validation { + condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_type) + error_message = "Accepted values are: {NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION}" + } +} + +variable "specific_reservation" { + description = "Reservation resources to consume when targeting SPECIFIC_RESERVATION. Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value." + type = object({ + key = string + values = list(string) + }) + default = { + key = null + values = null + } + + validation { + condition = (var.specific_reservation.key == "compute.googleapis.com/reservation-name" && length(var.specific_reservation.values) > 0) || (var.specific_reservation.key == null && var.specific_reservation.values == null) + error_message = "Value must be equal to `compute.googleapis.com/reservation-name` when targeting a SPECIFIC_RESERVATION. 
Otherwise, do not specify the value" + } +} From afaf74fbf9f965dd2b057ac659dac348db05a76b Mon Sep 17 00:00:00 2001 From: annuay Date: Thu, 8 Aug 2024 15:43:47 +0000 Subject: [PATCH 043/180] Remove redundant pip install from Github workflow since this is now park of make recipe --- .github/workflows/pr-precommit.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index e1f56d0007..37234d2a0e 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -38,10 +38,6 @@ jobs: python-version: '3.10' check-latest: true cache: 'pip' - - run: > - pip install - -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt - -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt - uses: actions/setup-go@v5 with: go-version: '1.22' From 686d7378e7e6e5d1cebe1dd9696bce706f9b51b9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 8 Aug 2024 17:33:10 +0000 Subject: [PATCH 044/180] Add `slurmgcp-managed` infix to resource policy name --- .../modules/slurm_files/scripts/resume.py | 2 +- .../schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 6289268cdd..8301ec84b4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -521,7 +521,7 @@ def create_nodeset_placement_groups(node_list: list, job_id=0): region = lkp.node_region(model) groups = { - f"{cfg.slurm_cluster_name}-{nodeset.nodeset_name}-{job_id}-{i}": nodes + f"{cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}-{job_id}-{i}": nodes for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT)) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh index a165479ad4..c76c9735cb 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh @@ -52,7 +52,7 @@ while true; do done echo "Deleting resource policies" -policies_filter="name:${cluster_name}-*" +policies_filter="name:${cluster_name}-slurmgcp-managed-*" while true; do policies=$(bash -c "$API_ENDPOINT gcloud compute resource-policies list --project \"${project}\" --format=\"value(selfLink)\" --filter=\"${policies_filter}\" --limit=10 | paste -sd \" \" -") if [[ -z "${policies}" ]]; then From aaec41cf5a433864de23676460128745f3bc5c24 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 8 Aug 2024 19:22:41 +0000 Subject: [PATCH 045/180] Fix bug of suppliying different `instance_properties` --- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/main.tf | 8 ++++-- .../partition.tf | 28 +++++++++---------- .../variables.tf | 14 +++++----- 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf 
b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 099920990a..491ea64419 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -97,7 +97,7 @@ locals { termination_action = try(var.spot_instance_config.termination_action, null) reservation_name = local.reservation_name maintenance_interval = var.maintenance_interval - instance_properties = var.instance_properties + instance_properties_json = jsonencode(var.instance_properties) zone_target_shape = var.zone_target_shape zone_policy_allow = local.zones diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 96c9f41272..dc16e79894 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -273,7 +273,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties = optional(any, null)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 7e7f39fa0f..c25748dc48 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -67,8 +67,12 @@ locals { epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] cloud_parameters = var.cloud_parameters - partitions = { for p in var.partitions : p.partition_name => p } - nodeset = { for n in var.nodeset : n.nodeset_name => n } + partitions = { for p in var.partitions : p.partition_name => p } + nodeset = { + for n in var.nodeset : n.nodeset_name => merge(n, { + instance_properties = jsondecode(n.instance_properties_json) + }) + } nodeset_dyn = { for n in var.nodeset_dyn : n.nodeset_name => n } nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index d2d85db1fd..d1f783f64a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -68,20 +68,20 @@ module "slurm_nodeset_template" { locals { nodesets = [for name, ns in local.nodeset_map : { - nodeset_name = ns.nodeset_name - node_conf = ns.node_conf - instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link - node_count_dynamic_max = ns.node_count_dynamic_max - node_count_static = ns.node_count_static - subnetwork = ns.subnetwork_self_link - reservation_name = ns.reservation_name - maintenance_interval = ns.maintenance_interval - instance_properties = ns.instance_properties - enable_placement = ns.enable_placement - network_storage = ns.network_storage - zone_target_shape = ns.zone_target_shape - zone_policy_allow = ns.zone_policy_allow - zone_policy_deny = ns.zone_policy_deny + nodeset_name = ns.nodeset_name + node_conf = ns.node_conf + instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link + node_count_dynamic_max = ns.node_count_dynamic_max + node_count_static = ns.node_count_static + subnetwork = ns.subnetwork_self_link + reservation_name = ns.reservation_name + maintenance_interval = ns.maintenance_interval + instance_properties_json = ns.instance_properties_json + enable_placement = ns.enable_placement + network_storage = ns.network_storage + zone_target_shape = ns.zone_target_shape + zone_policy_allow = ns.zone_policy_allow + zone_policy_deny = ns.zone_policy_deny }] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 44ed33f994..ecdb4b22c3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -211,13 +211,13 @@ variable "nodeset" { count = number type = string })) - labels = optional(map(string), {}) - machine_type = optional(string) - maintenance_interval = optional(string) - instance_properties = optional(any, null) - metadata = optional(map(string), {}) - min_cpu_platform = optional(string) 
- network_tier = optional(string, "STANDARD") + labels = optional(map(string), {}) + machine_type = optional(string) + maintenance_interval = optional(string) + instance_properties_json = string + metadata = optional(map(string), {}) + min_cpu_platform = optional(string) + network_tier = optional(string, "STANDARD") network_storage = optional(list(object({ server_ip = string remote_mount = string From 7fd72b18d47f21020fbed083562d7ac3d562a24f Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Thu, 8 Aug 2024 21:33:01 +0000 Subject: [PATCH 046/180] renaming ghpc to gcluster --- community/examples/AMD/README.md | 4 +-- community/examples/flux-framework/README.md | 6 ++-- community/examples/intel/README.md | 20 ++++++------- .../website/ghpcfe/cluster_manager/image.py | 8 ++--- .../schedmd-slurm-gcp-v5-controller/README.md | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- docs/blueprint-validation.md | 6 ++-- docs/hpc-slurm6-tpu-maxtext.md | 6 ++-- ...demo-with-cloud-controller-instructions.md | 14 ++++----- .../deploy-instructions.md | 6 ++-- .../on-prem-instructions.md | 8 ++--- docs/tutorials/gromacs/spack-gromacs.md | 12 ++++---- docs/tutorials/htcondor.md | 14 ++++----- docs/tutorials/openfoam/spack-openfoam.md | 14 ++++----- docs/tutorials/wrfv3/spack-wrfv3.md | 12 ++++---- examples/README.md | 30 +++++++++---------- modules/README.md | 22 +++++++------- modules/compute/vm-instance/README.md | 2 +- tools/cloud-build/README.md | 2 +- .../tasks/create_deployment_directory.yml | 2 +- .../builds/ml-a3-highgpu-slurm.yaml | 4 +-- .../daily-tests/builds/monitoring.yaml | 2 +- .../cloud-build/images/ghpc-docker/Dockerfile | 2 +- .../cloud-build/images/test-runner/Dockerfile | 2 +- .../golden_copies/validate.sh | 2 +- tools/validate_configs/test_configs/README.md | 4 +-- tools/validate_configs/validate_configs.sh | 4 +-- 27 files changed, 106 insertions(+), 106 deletions(-) diff --git a/community/examples/AMD/README.md b/community/examples/AMD/README.md index f0f67d367d..ffc25e2598 100644 --- a/community/examples/AMD/README.md +++ b/community/examples/AMD/README.md @@ -53,10 +53,10 @@ using the `compute` partition, you may ignore its quota requirements. ### Deploying the Blueprint -Use `ghpc` to provision the blueprint, supplying your project ID: +Use `gcluster` to provision the blueprint, supplying your project ID: ```shell -ghpc create --vars project_id=<> hpc-amd-slurm.yaml +gcluster create --vars project_id=<> hpc-amd-slurm.yaml ``` It will create a directory containing a Terraform module. Follow the printed diff --git a/community/examples/flux-framework/README.md b/community/examples/flux-framework/README.md index c1e2d8271d..aa8b580c24 100644 --- a/community/examples/flux-framework/README.md +++ b/community/examples/flux-framework/README.md @@ -26,15 +26,15 @@ Toolkit guidance to enable [APIs][apis] and establish minimum resource ### Deploy the flux-framework Cluster -Use `ghcp` to provision the blueprint +Use `gcluster` to provision the blueprint ```bash -ghpc create community/examples/flux-framework --vars project_id=<> +gcluster create community/examples/flux-framework --vars project_id=<> ``` This will create a directory containing Terraform modules. 
-Follow `ghpc` instructions to deploy the cluster +Follow `gcluster` instructions to deploy the cluster ```text terraform -chdir=flux-fw-cluster/primary init diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 34305c9215..e83bd27391 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -63,10 +63,10 @@ The Pre-deployment Guide provides instructions for: ### Deploy the DAOS Cluster -After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `ghpc` to provision the blueprint +After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `gcluster` to provision the blueprint ```text -ghpc create community/examples/intel/pfs-daos.yaml \ +gcluster create community/examples/intel/pfs-daos.yaml \ --vars project_id=<> \ [--backend-config bucket=] ``` @@ -75,10 +75,10 @@ This will create the deployment directory containing Terraform modules and Packer templates. The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform -state][backend]. Use `ghpc deploy` to provision your DAOS storage cluster: +state][backend]. Use `gcluster deploy` to provision your DAOS storage cluster: ```text -ghpc deploy pfs-daos --auto-approve +gcluster deploy pfs-daos --auto-approve ``` [backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state @@ -238,7 +238,7 @@ See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#d Delete the remaining infrastructure ```bash -ghpc destroy pfs-daos --auto-approve +gcluster destroy pfs-daos --auto-approve ``` ## DAOS Server with Slurm cluster @@ -291,10 +291,10 @@ The following available quota is required in the region used by Slurm: ### Deploy the DAOS/Slurm Cluster -Use `ghpc` to provision the blueprint, supplying your project ID +Use `gcluster` to provision the blueprint, supplying your project ID ```text -ghpc create community/examples/intel/hpc-slurm-daos.yaml \ +gcluster create community/examples/intel/hpc-slurm-daos.yaml \ --vars project_id=<> \ [--backend-config bucket=] ``` @@ -304,10 +304,10 @@ templates. The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform state][backend]. -Follow `ghpc` instructions to deploy the environment +Follow `gcluster` instructions to deploy the environment ```text -ghpc deploy hpc-slurm-daos --auto-approve +gcluster deploy hpc-slurm-daos --auto-approve ``` [backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state @@ -450,5 +450,5 @@ have been shutdown and deleted by the Slurm autoscaler. Delete the remaining infrastructure: ```bash -ghpc destroy hpc-slurm-daos --auto-approve +gcluster destroy hpc-slurm-daos --auto-approve ``` diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py index 363029db9b..b119ba5790 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -51,7 +51,7 @@ def prepare(self): 1. Create the necessary directory structure for the image. 2. Generate a Cluster Toolkit blueprint to build the image. - 3. 
Run the Cluster Toolkit (`ghpc`) to create the image based on the blueprint. + 3. Run the Cluster Toolkit (`gcluster`) to create the image based on the blueprint. 4. Set up the builder environment on Google Cloud Platform (GCP) using Terraform. 5. Create the image on GCP using Packer. 6. Destroy the builder environment after the image creation is complete. @@ -71,7 +71,7 @@ def prepare(self): OSError: If there is an error while creating the image directory or writing to the credentials file. IOError: If there is an error while writing to the credentials file. - subprocess.CalledProcessError: If any of the subprocess calls (ghpc, Terraform, or Packer) + subprocess.CalledProcessError: If any of the subprocess calls (gcluster, Terraform, or Packer) encounter an error during execution. """ self._create_image_dir() @@ -172,7 +172,7 @@ def _create_blueprint(self): def _run_ghpc(self): target_dir = self.image_dir try: - logger.info(f"Invoking ghpc create for the image {self.image.id}") + logger.info(f"Invoking gcluster create for the image {self.image.id}") log_out_fn = target_dir / "ghpc_create_log.stdout" log_err_fn = target_dir / "ghpc_create_log.stderr" @@ -191,7 +191,7 @@ def _run_ghpc(self): ) except subprocess.CalledProcessError as cpe: self.update_image_status("e") - logger.error(f"ghpc exec failed for image {self.image.id}", exc_info=cpe) + logger.error(f"gcluster exec failed for image {self.image.id}", exc_info=cpe) # No logs from stdout/err - get dumped to files raise diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 50f719575a..6fef0fff97 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -70,7 +70,7 @@ activated through the `enable_reconfigure` setting: To reconfigure a running cluster: 1. Edit the blueprint with the desired configuration changes -1. Call `ghpc create -w` to overwrite the deployment directory +1. Call `gcluster create -w` to overwrite the deployment directory 1. Follow instructions in terminal to deploy The following are examples of updates that can be made to a running cluster: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 96c9f41272..a8a3df9dd5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -49,7 +49,7 @@ partitions and slurm configuration in a running, active cluster. To reconfigure a running cluster: 1. Edit the blueprint with the desired configuration changes -2. Call `ghpc create -w` to overwrite the deployment directory +2. Call `gcluster create -w` to overwrite the deployment directory 3. Follow instructions in terminal to deploy The following are examples of updates that can be made to a running cluster: diff --git a/docs/blueprint-validation.md b/docs/blueprint-validation.md index c6f02032ac..20c4fef115 100644 --- a/docs/blueprint-validation.md +++ b/docs/blueprint-validation.md @@ -112,7 +112,7 @@ validators: * Use `skip-validators` CLI flag: ```shell -./ghpc create ... --skip-validators="test_project_exists,test_apis_enabled" +./gcluster create ... --skip-validators="test_project_exists,test_apis_enabled" ``` * To disable all validators, set the [validation level to IGNORE](#validation-levels). 
@@ -134,12 +134,12 @@ They can also be set to 3 differing levels of behavior using the command-line For example, this command will set all validators to `WARNING` behavior: ```shell -./ghpc create --validation-level WARNING examples/hpc-slurm.yaml +./gcluster create --validation-level WARNING examples/hpc-slurm.yaml ``` The flag can be shortened to `-l` as shown below using `IGNORE` to disable all validators. ```shell -./ghpc create -l IGNORE examples/hpc-slurm.yaml +./gcluster create -l IGNORE examples/hpc-slurm.yaml ``` diff --git a/docs/hpc-slurm6-tpu-maxtext.md b/docs/hpc-slurm6-tpu-maxtext.md index c13ca2eab6..4126ac4b19 100644 --- a/docs/hpc-slurm6-tpu-maxtext.md +++ b/docs/hpc-slurm6-tpu-maxtext.md @@ -15,8 +15,8 @@ the dataset in your GCS bucket. After that you can update the blueprint to use t dataset from GCS bucket in training script. ```bash -./ghpc create community/examples/hpc-slurm6-tpu-maxtext.yaml --vars project_id=; -./ghpc deploy slurm6-tpu-v4 --auto-approve +./gcluster create community/examples/hpc-slurm6-tpu-maxtext.yaml --vars project_id=; +./gcluster deploy slurm6-tpu-v4 --auto-approve ``` This would deploy slurm cluster with TPU partition, dynamic compute partition. Maxtext benchmark test script @@ -79,7 +79,7 @@ For this we need to return to our cloud shell terminal. Run exit in the terminal Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy slurm6-tpu-v4 --auto-approve +./gcluster destroy slurm6-tpu-v4 --auto-approve ``` When complete you should see something like: diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 338ff814f9..fbc851e1db 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -140,14 +140,14 @@ command to install the pip packages outlined in pip install -r docs/hybrid-slurm-cluster/requirements.txt ``` -#### Build ghpc +#### Build gcluster -Before you begin, ensure that you have built the `ghpc` tool in the Cluster Toolkit. +Before you begin, ensure that you have built the `gcluster` tool in the Cluster Toolkit. For more information see the [README.md](../../README.md#quickstart) Quickstart. -The commands in these instructions assume the ghpc binary is installed in a +The commands in these instructions assume the gcluster binary is installed in a directory represented in the PATH environment variable. 
To ensure this is the -case, run `make install` after building `ghpc`: +case, run `make install` after building `gcluster`: ```shell make @@ -166,10 +166,10 @@ blueprint will do the following: * Create a subnetwork of `compute-vpc-network` named `primary-subnet` with an internal IP range of 10.1.0.0/16 -Create a deployment directory for the networks using `ghpc`: +Create a deployment directory for the networks using `gcluster`: ```shell -ghpc create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" +gcluster create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" ``` If successful, this command will provide 3 terraform operations that can be @@ -299,7 +299,7 @@ First, use the Cluster Toolkit to create the deployment directory, replacing "<>" with the ID of your project A: ```shell -ghpc create docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml --vars project_id="<>" +gcluster create docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml --vars project_id="<>" ``` If successful, this command will provide 3 terraform operations that can be diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 6d29b916a7..b03f5403a1 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -77,7 +77,7 @@ command line, run the following command with the updated values for `<>` and `<>`: ```shell -./ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml \ +./gcluster create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml \ --vars project_id=<> \ --vars static_controller_hostname=<> \ --vars static_controller_addr=<> @@ -87,10 +87,10 @@ If successful, this command will create a deployment folder. Use the following command to deploy the hybrid configuration: ```sh -./ghpc deploy hybrid-config +./gcluster deploy hybrid-config ``` -`ghpc` reports the changes that Terraform is proposing to make for your +`gcluster` reports the changes that Terraform is proposing to make for your cluster. Optionally, you may review them by typing `d` and pressing `enter`. To deploy the cluster, accept the proposed changes by typing `a` and pressing `enter`. diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 3a4c3370b3..30e721dad0 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -142,14 +142,14 @@ a service account, see the [Setup Authentication](#setup-authentication) section and the Google Cloud documentation on [Service Accounts](https://cloud.google.com/iam/docs/service-accounts). -#### Build ghpc +#### Build gcluster -Before you begin, ensure that you have built the `ghpc` tool in the Cluster Toolkit. +Before you begin, ensure that you have built the `gcluster` tool in the Cluster Toolkit. For more information see the [README.md](../../README.md#quickstart) Quickstart. -The commands in these instructions assume the ghpc binary is installed in a +The commands in these instructions assume the gcluster binary is installed in a directory represented in the PATH environment variable. 
To ensure this is the -case, run `make install` after building `ghpc`: +case, run `make install` after building `gcluster`: ```shell make diff --git a/docs/tutorials/gromacs/spack-gromacs.md b/docs/tutorials/gromacs/spack-gromacs.md index eae4e7b5d8..fc4797db7a 100644 --- a/docs/tutorials/gromacs/spack-gromacs.md +++ b/docs/tutorials/gromacs/spack-gromacs.md @@ -59,11 +59,11 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. @@ -91,11 +91,11 @@ This file describes the cluster you will deploy. It defines: [This diagram](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/docs/tutorials#blueprint-diagram-for-application-tutorials) shows how the different modules relate to each other. -After you have inspected the file, use the ghpc binary to create a deployment +After you have inspected the file, use the gcluster binary to create a deployment folder by running: ```bash -./ghpc create docs/tutorials/gromacs/spack-gromacs.yaml --vars project_id= +./gcluster create docs/tutorials/gromacs/spack-gromacs.yaml --vars project_id= ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -109,7 +109,7 @@ contains the terraform needed to deploy your cluster. Use below command to deploy your cluster. ```bash -./ghpc deploy spack-gromacs +./gcluster deploy spack-gromacs ``` After the deployment is finished, you should see below message. @@ -271,7 +271,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy spack-gromacs +./gcluster destroy spack-gromacs ``` When complete you should see something like: diff --git a/docs/tutorials/htcondor.md b/docs/tutorials/htcondor.md index 6399d25766..6c9657b69a 100644 --- a/docs/tutorials/htcondor.md +++ b/docs/tutorials/htcondor.md @@ -34,16 +34,16 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. -(Optional) To install the `ghpc` binary in your home directory under bin, +(Optional) To install the `gcluster` binary in your home directory under bin, run the following command: ```bash @@ -70,10 +70,10 @@ The blueprint `community/examples/htc-htcondor.yaml` should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. After you have inspected the -file, use the ghpc binary to create a deployment directory by running: +file, use the gcluster binary to create a deployment directory by running: ```bash -./ghpc create community/examples/htc-htcondor.yaml --vars "project_id=" +./gcluster create community/examples/htc-htcondor.yaml --vars "project_id=" ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -89,7 +89,7 @@ contains the terraform needed to deploy your cluster. Use the following commands to run terraform and deploy your cluster. 
```bash -./ghpc deploy htcondor-pool --auto-approve +./gcluster deploy htcondor-pool --auto-approve ``` The Toolkit will automatically approve provisioning a network, building a VM @@ -222,7 +222,7 @@ You should be returned to the Cloud Shell console. You may then destroy your HTCondor pool: ```bash -./ghpc destroy htcondor-pool --auto-approve +./gcluster destroy htcondor-pool --auto-approve ``` When complete you should see output similar to: diff --git a/docs/tutorials/openfoam/spack-openfoam.md b/docs/tutorials/openfoam/spack-openfoam.md index d872e1e436..d9bf3ea6d5 100644 --- a/docs/tutorials/openfoam/spack-openfoam.md +++ b/docs/tutorials/openfoam/spack-openfoam.md @@ -31,7 +31,7 @@ Once you have selected a project, click START. ## Enable APIs & Permissions In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `./ghpc create` +deploy your HPC cluster. These will be caught when you perform `./gcluster create` but you can save time by enabling them now by running: @@ -59,11 +59,11 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. @@ -88,11 +88,11 @@ This file describes the cluster you will deploy. It defines: * a Slurm controller * An auto-scaling Slurm partition -After you have inspected the file, use the ghpc binary to create a deployment +After you have inspected the file, use the gcluster binary to create a deployment folder by running: ```bash -./ghpc create docs/tutorials/openfoam/spack-openfoam.yaml --vars project_id= +./gcluster create docs/tutorials/openfoam/spack-openfoam.yaml --vars project_id= ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -106,7 +106,7 @@ contains the terraform needed to deploy your cluster. Use below command to deploy your cluster. ```bash -./ghpc deploy spack-openfoam +./gcluster deploy spack-openfoam ``` You can also use below command to generate a _plan_ that describes the Google @@ -272,7 +272,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy spack-openfoam +./gcluster destroy spack-openfoam ``` When complete you should see something like: diff --git a/docs/tutorials/wrfv3/spack-wrfv3.md b/docs/tutorials/wrfv3/spack-wrfv3.md index cf8dcdb334..2b03ffe66b 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.md +++ b/docs/tutorials/wrfv3/spack-wrfv3.md @@ -59,11 +59,11 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. @@ -91,11 +91,11 @@ This file describes the cluster you will deploy. It defines: [This diagram](../README.md#blueprint-diagram-for-application-tutorials) shows how the different modules relate to each other. 
-After you have inspected the file, use the ghpc binary to create a deployment +After you have inspected the file, use the gcluster binary to create a deployment folder by running: ```bash -./ghpc create docs/tutorials/wrfv3/spack-wrfv3.yaml --vars project_id= +./gcluster create docs/tutorials/wrfv3/spack-wrfv3.yaml --vars project_id= ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -109,7 +109,7 @@ contains the terraform needed to deploy your cluster. Use below command to deploy your cluster. ```bash -./ghpc deploy spack-wrfv3 +./gcluster deploy spack-wrfv3 ``` You can also use below command to generate a plan that describes the Google Cloud resources that will be deployed. @@ -274,7 +274,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy spack-wrfv3 +./gcluster destroy spack-wrfv3 ``` When complete you should see something like: diff --git a/examples/README.md b/examples/README.md index b10c2c7b74..fb47a19686 100644 --- a/examples/README.md +++ b/examples/README.md @@ -124,7 +124,7 @@ You can set the configuration using the CLI in the `create` and `expand` subcommands as well: ```shell -./ghpc create examples/hpc-slurm.yaml \ +./gcluster create examples/hpc-slurm.yaml \ --vars "project_id=${GOOGLE_CLOUD_PROJECT}" \ --backend-config "bucket=${GCS_BUCKET}" ``` @@ -166,7 +166,7 @@ as follows: * Robust reconfiguration - Reconfiguration is now managed by a service that runs on each instance. This has removed the dependency on the Pub/Sub Google cloud service, and provides a more consistent reconfiguration experience (when calling `ghpc deploy blueprint.yaml -w`). Reconfiguration has also been enabled by default. + Reconfiguration is now managed by a service that runs on each instance. This has removed the dependency on the Pub/Sub Google cloud service, and provides a more consistent reconfiguration experience (when calling `gcluster deploy blueprint.yaml -w`). Reconfiguration has also been enabled by default. * Faster deployments @@ -178,7 +178,7 @@ as follows: * Fewer dependencies in the deployment environment - Reconfiguration and compute node cleanup no longer require users to install local python dependencies in the deployment environment (where ghpc is called). This has allowed for these features to be enabled by default. + Reconfiguration and compute node cleanup no longer require users to install local python dependencies in the deployment environment (where gcluster is called). This has allowed for these features to be enabled by default. * Flexible node to partition relation @@ -565,8 +565,8 @@ VM. The cluster has 2 partitions: To provision the cluster, please run: ```text -./ghpc create examples/ml-slurm-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy ml-example +./gcluster create examples/ml-slurm-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy ml-example ``` After accessing the login node, you can activate the conda environment for each @@ -590,7 +590,7 @@ sbatch -N 1 torch_test.sh When you are done, clean up the resources in reverse order of creation: ```text -./ghpc destroy ml-example +./gcluster destroy ml-example ``` Finally, browse to the [Cloud Console][console-images] to delete your custom @@ -614,8 +614,8 @@ VM. 
The cluster has 2 partitions: To provision the cluster, please run: ```text -./ghpc create examples/ml-slurm.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy ml-example-v6 +./gcluster create examples/ml-slurm.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy ml-example-v6 ``` After accessing the login node, you can activate the conda environment for each @@ -639,7 +639,7 @@ sbatch -N 1 torch_test.sh When you are done, clean up the resources in reverse order of creation: ```text -./ghpc destroy ml-example-v6 +./gcluster destroy ml-example-v6 ``` Finally, browse to the [Cloud Console][console-images] to delete your custom @@ -670,8 +670,8 @@ example takes the following steps: Create the deployment folder from the blueprint: ```text -./ghpc create examples/image-builder-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy image-builder-001" +./gcluster create examples/image-builder-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy image-builder-001" ``` Follow the on-screen prompts to approve the creation of each deployment group. @@ -795,8 +795,8 @@ example takes the following steps: Create the deployment folder from the blueprint: ```text -./ghpc create examples/image-builder.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy image-builder-v6-001" +./gcluster create examples/image-builder.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy image-builder-v6-001" ``` Follow the on-screen prompts to approve the creation of each deployment group. @@ -1277,7 +1277,7 @@ To use the blueprint you must supply the project id and the name of an existing bucket: ```shell -./ghpc create community/examples/client-google-cloud-storage.yaml \ +./gcluster create community/examples/client-google-cloud-storage.yaml \ --vars project_id= \ --vars existing_bucket_name= ``` @@ -1943,7 +1943,7 @@ To avoid these issues, the `ghpc_stage` function can be used to copy a file (or ``` The `ghpc_stage` function will always look first in the path specified in the blueprint. If the file is not found at this path then `ghpc_stage` will look for the staged file in the deployment folder, if a deployment folder exists. -This means that you can redeploy a blueprint (`ghpc deploy -w`) so long as you have the deployment folder from the original deployment, even if locally referenced files are not available. +This means that you can redeploy a blueprint (`gcluster deploy -w`) so long as you have the deployment folder from the original deployment, even if locally referenced files are not available. ## Requirements diff --git a/modules/README.md b/modules/README.md index 3796a66b67..8df7ac6d5e 100644 --- a/modules/README.md +++ b/modules/README.md @@ -307,7 +307,7 @@ Terraform modules. A source can either be a filesystem path or a URL to a git repository: * Filesystem paths - * modules embedded in the `ghpc` executable + * modules embedded in the `gcluster` executable * modules in the local filesystem * Remote modules using [Terraform URL syntax](https://developer.hashicorp.com/terraform/language/modules/sources) * Hosted on [GitHub](https://developer.hashicorp.com/terraform/language/modules/sources#github) @@ -324,13 +324,13 @@ deployment folder on your behalf. #### Embedded Modules -Embedded modules are added to the ghpc binary during compilation and cannot +Embedded modules are added to the gcluster binary during compilation and cannot be edited. 
To refer to embedded modules, set the source path to `modules/<>` or `community/modules/<>`. The paths match the modules in the repository structure for [core modules](./) and [community modules](../community/modules/). Because the modules are embedded -during compilation, your local copies may differ unless you recompile ghpc. +during compilation, your local copies may differ unless you recompile gcluster. For example, this example snippet uses the embedded pre-existing-vpc module: @@ -352,7 +352,7 @@ following module definition refers the local pre-existing-vpc modules. ``` > **_NOTE:_** Relative paths (beginning with `.` or `..` must be relative to the -> working directory from which `ghpc` is executed. This example would have to be +> working directory from which `gcluster` is executed. This example would have to be > run from a local copy of the Cluster Toolkit repository. An alternative is to use > absolute paths to modules. @@ -392,7 +392,7 @@ release of the filestore module: source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore?ref=v1.22.1&depth=1 ``` -Because Terraform modules natively support this syntax, ghpc will not copy +Because Terraform modules natively support this syntax, gcluster will not copy GitHub-hosted modules into your deployment folder. Terraform will download them into a hidden folder when you run `terraform init`. @@ -403,12 +403,12 @@ into a hidden folder when you run `terraform init`. ##### GitHub-hosted Packer modules -Packer does not natively support GitHub-hosted modules so `ghpc create` will +Packer does not natively support GitHub-hosted modules so `gcluster create` will copy modules into your deployment folder. -If the module uses `//` package notation, `ghpc create` will copy the entire +If the module uses `//` package notation, `gcluster create` will copy the entire repository to the module path: `deployment_name/group_name/module_id`. However, -when `ghpc deploy` is invoked, it will run Packer from the subdirectory +when `gcluster deploy` is invoked, it will run Packer from the subdirectory `deployment_name/group_name/module_id/subdirectory/after/double_slash`. Referring back to the [Intel DAOS blueprint][pfs-daos.yaml], we see that it will @@ -417,10 +417,10 @@ create 2 deployment groups at `pfs-daos/daos-client-image` and a subdirectories ending in `daos-client-image/images` and `daos-server-image/images`. -If the module does not use `//` package notation, `ghpc create` will copy +If the module does not use `//` package notation, `gcluster create` will copy only the final directory in the path to `deployment_name/group_name/module_id`. -In all cases, `ghpc create` will remove the `.git` directory from the packer +In all cases, `gcluster create` will remove the `.git` directory from the packer module to ensure that you can manage the entire deployment directory with its own git versioning. @@ -504,7 +504,7 @@ to `$(network1.network_self_link)` and `$(network1.subnetwork_self_link)` which refer to the [network1 outputs](network/vpc/README#Outputs) of the same names. -The order of precedence that `ghpc` uses in determining when to infer a setting +The order of precedence that `gcluster` uses in determining when to infer a setting value is in the following priority order: 1. 
Explicitly set in the blueprint using the `settings` field diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index fb3fde84e6..39513edce9 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -133,7 +133,7 @@ can be found at [docs/gpu-support.md](../../../docs/gpu-support.md) The `vm-instance` module will be replaced when the `instance_image` variable is changed and `terraform apply` is run on the deployment group folder or -`ghpc deploy` is run. However, it will not be automatically replaced if a new +`gcluster deploy` is run. However, it will not be automatically replaced if a new image is created in a family. To selectively replace the vm-instance(s), consider running terraform diff --git a/tools/cloud-build/README.md b/tools/cloud-build/README.md index 3bf30cacdf..175d2f8e65 100644 --- a/tools/cloud-build/README.md +++ b/tools/cloud-build/README.md @@ -4,7 +4,7 @@ * `daily-tests`: The daily-tests directory contains cloud build configs and support files for running the daily test suite -* `dependency-checks`: Verifies the `ghpc` build in limited dependency +* `dependency-checks`: Verifies the `gcluster` build in limited dependency environments. * `ansible.cfg`: Ansible config used to set common ansible setting for running the test suite. diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml index f515e26aac..ecb9513291 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml @@ -32,7 +32,7 @@ - name: Create Blueprint ansible.builtin.command: | - ./ghpc create -l ERROR "{{ blueprint_yaml }}" \ + ./gcluster create -l ERROR "{{ blueprint_yaml }}" \ --backend-config bucket={{ state_bucket }} \ --skip-validators=test_tf_version_for_slurm \ --vars project_id={{ project }} \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index 7144fc4c23..4f7a08c4c0 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -62,7 +62,7 @@ steps: NFS_DEPLOYMENT_NAME="a3hnfs$${BUILD_ID_SHORT}" destroy_on_exit() { - ./ghpc destroy "$${NFS_DEPLOYMENT_NAME}" --auto-approve + ./gcluster destroy "$${NFS_DEPLOYMENT_NAME}" --auto-approve cat /persistent_volume/image_name | xargs -L1 gcloud compute images delete --project "${PROJECT_ID}" --quiet } @@ -70,7 +70,7 @@ steps: ZONE=us-east4-a trap 'destroy_on_exit' EXIT - ./ghpc deploy \ + ./gcluster deploy \ --vars region="$${REGION}" \ --vars zone="$${ZONE}" \ --vars project_id="${PROJECT_ID}" \ diff --git a/tools/cloud-build/daily-tests/builds/monitoring.yaml b/tools/cloud-build/daily-tests/builds/monitoring.yaml index 52e97c23cc..810fa650ff 100644 --- a/tools/cloud-build/daily-tests/builds/monitoring.yaml +++ b/tools/cloud-build/daily-tests/builds/monitoring.yaml @@ -36,7 +36,7 @@ steps: - -c - | set -x -e - cd /workspace && make ghpc + cd /workspace && make gcluster BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} diff --git a/tools/cloud-build/images/ghpc-docker/Dockerfile b/tools/cloud-build/images/ghpc-docker/Dockerfile index f4b2e29b4d..746452b921 100644 --- a/tools/cloud-build/images/ghpc-docker/Dockerfile +++ 
b/tools/cloud-build/images/ghpc-docker/Dockerfile @@ -18,7 +18,7 @@ ARG ref RUN git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git &&\ cd hpc-toolkit &&\ git checkout ${ref} &&\ - make ghpc &&\ + make gcluster &&\ go install github.com/google/go-licenses@latest &&\ /go/bin/go-licenses check "./..." &&\ /go/bin/go-licenses save "./..." --save_path="THIRD_PARTY_NOTICES" diff --git a/tools/cloud-build/images/test-runner/Dockerfile b/tools/cloud-build/images/test-runner/Dockerfile index 5538328dfe..f7f8ca2708 100644 --- a/tools/cloud-build/images/test-runner/Dockerfile +++ b/tools/cloud-build/images/test-runner/Dockerfile @@ -46,6 +46,6 @@ RUN curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - && \ pip install --no-cache-dir ansible && \ rm -rf ~/.cache/pip/* && \ # compile the binary to warm up `/ghpc_go_cache` - cd /workspace && make ghpc && \ + cd /workspace && make gcluster && \ # remove /workspace to reduce image size rm -rf /workspace diff --git a/tools/validate_configs/golden_copies/validate.sh b/tools/validate_configs/golden_copies/validate.sh index bc03493aaf..4e0b4f196a 100755 --- a/tools/validate_configs/golden_copies/validate.sh +++ b/tools/validate_configs/golden_copies/validate.sh @@ -42,7 +42,7 @@ run_test() { cp -r tools/validate_configs/golden_copies/configs/files "${tmpdir}/" # Only run from the repo directory if there are local modules, otherwise - # run the test from the test directory using the installed ghpc binary. + # run the test from the test directory using the installed gcluster binary. if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${bp}"; then cd "${cwd}" else diff --git a/tools/validate_configs/test_configs/README.md b/tools/validate_configs/test_configs/README.md index 812f8449d5..00b1f4fa58 100644 --- a/tools/validate_configs/test_configs/README.md +++ b/tools/validate_configs/test_configs/README.md @@ -3,8 +3,8 @@ This directory contains a set of test blueprint files that can be fed into gHPC to create a deployment. These blueprints are used to run integration tests -against `ghpc`. These blueprints can also be used independently and locally to -verify a local `ghpc` build. +against `gcluster`. These blueprints can also be used independently and locally to +verify a local `gcluster` build. ## Blueprint Descriptions diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index fb92005498..511028beaa 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -35,7 +35,7 @@ run_test() { echo "testing ${example} in ${tmpdir}" # Only run from the repo directory if there are local modules, otherwise - # run the test from the test directory using the installed ghpc binary. + # run the test from the test directory using the installed gcluster binary. 
if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${example}"; then cd "${cwd}" else @@ -45,7 +45,7 @@ run_test() { --skip-validators="${VALIDATORS_TO_SKIP}" "${deployment_args[@]}" \ --vars="project_id=${PROJECT},deployment_name=${DEPLOYMENT}" >/dev/null || { - echo "*** ERROR: error creating deployment with ghpc for ${exampleFile}" + echo "*** ERROR: error creating deployment with gcluster for ${exampleFile}" exit 1 } if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${example}"; then From dae19a5e933a8b4b04cbc06c63893bb2c90a2dd5 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 8 Aug 2024 17:53:52 -0700 Subject: [PATCH 047/180] Align variable name around gcluster to avoid mismatch --- .../ansible_playbooks/tasks/gather_startup_script_logs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml index 6b5808ba8b..a499e43cad 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml @@ -16,14 +16,14 @@ - name: Assert variables are defined ansible.builtin.assert: that: - - ghpc_stderr is defined + - gcluster_stderr is defined # Searches the ghpc stderr for a command that gathers the serial logs from the # deployed VM, defaults to an empty string if the command is not found - name: Get serial port command failed_when: false ansible.builtin.set_fact: - serial_port_cmd: '{{ ghpc_stderr | regex_findall("please run:\s+(.+?\s+--project\s+\S+)", "\\1") | first | default("") }}' + serial_port_cmd: '{{ gcluster_stderr | regex_findall("please run:\s+(.+?\s+--project\s+\S+)", "\\1") | first | default("") }}' - name: Print serial port command failed_when: false From ca1a4d71a074e3f36fbdbff12951910b2440b96d Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 8 Aug 2024 18:09:50 -0700 Subject: [PATCH 048/180] Build is taking longer, extend timeout so test will pass while investigating --- tools/cloud-build/daily-tests/builds/batch-mpi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index fd29f37721..7595534a18 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -56,7 +56,7 @@ steps: echo ' source: community/modules/scripts/wait-for-startup' >> $${SG_EXAMPLE} echo ' settings:' >> $${SG_EXAMPLE} echo ' instance_name: $(spack-builder.name[0])' >> $${SG_EXAMPLE} - echo ' timeout: 2400' >> $${SG_EXAMPLE} + echo ' timeout: 10800' >> $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ From fc4baa4f225e02490a031f29bac543c873e5146c Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Thu, 8 Aug 2024 20:10:41 -0700 Subject: [PATCH 049/180] Address comments --- .github/pull_request_template.md | 2 +- CONTRIBUTING.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 19483cac61..c51f91e39d 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,6 @@ ### Submission Checklist -NOTE: Pull requests can take up to 2 weeks to be reviewed. 
+NOTE: Community submissions can take up to 2 weeks to be reviewed. Please take the following actions before submitting this pull request. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c05ae18e32..03bfefa2d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,7 +24,7 @@ information on pull requests. ### Standard PR Response Times -Pull requests can take up to 2 weeks to be reviewed. +Community submissions can take up to 2 weeks to be reviewed. ## Community Guidelines From d8e25770ef98641b1bf1e309f040e5f06ac429a3 Mon Sep 17 00:00:00 2001 From: annuay Date: Fri, 9 Aug 2024 06:22:53 +0000 Subject: [PATCH 050/180] Trigger Build From f443feb971095a2edccf7deb1b97d3d1cf72ddda Mon Sep 17 00:00:00 2001 From: annuay Date: Fri, 9 Aug 2024 06:35:17 +0000 Subject: [PATCH 051/180] Trigger Build From f83162df569efed15bd840882aafa03a481c9e70 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 9 Aug 2024 11:47:53 +0000 Subject: [PATCH 052/180] add TODOs --- modules/compute/gke-node-pool/main.tf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f22dc7beb6..2cf366a7f3 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -151,10 +151,14 @@ resource "google_container_node_pool" "node_pool" { } } + # TODO(arajmane): Default values for params in this block considering that + # this block need not be passed at all if reservation_affinity is not required + # Or, values of the params key and values are not required when any_reservation is to be used reservation_affinity { consume_reservation_type = var.reservation_type key = var.specific_reservation.key - values = var.specific_reservation.values + # TODO(arajmane): ensure the reservation exists through dependencies? + values = var.specific_reservation.values } } From 99e0e0202999b6660329986775c3bc8b3b1e645c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 9 Aug 2024 10:45:18 -0500 Subject: [PATCH 053/180] Fix link in Packer README --- modules/packer/custom-image/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 190a2429d0..d4f63133bd 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -40,7 +40,7 @@ This can be achieved by one of the following 2 approaches: 1. Configuring a VPC with a Cloud NAT in the region of the VM -- Use the \[vpc\] module which automates NAT creation +- Use the [vpc] module which automates NAT creation ### Inbound internet access @@ -143,10 +143,9 @@ environment. SSH access can be enabled one of 2 ways: - Add firewall rules that open SSH to the VM The Packer template defaults to using to the 1st IAP-based solution because it -is more secure (no exposure to public internet) and because the -[Toolkit VPC module](../../network/vpc/README.md) automatically sets up all -necessary firewall rules for SSH tunneling and outbound-only access to the -internet through [Cloud NAT][cloudnat]. +is more secure (no exposure to public internet) and because the [vpc] module +automatically sets up all necessary firewall rules for SSH tunneling and +outbound-only access to the internet through [Cloud NAT][cloudnat]. In either SSH solution, customization scripts should be supplied as files in the [shell_scripts][shell] and [ansible_playbooks][ansible] settings. @@ -327,3 +326,4 @@ No outputs. 
[sss]: #input_startup_script [startup-metadata]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux [startup-script]: ../../../modules/scripts/startup-script +[vpc]: ../../network/vpc/README.md From 4e1cebce1f150624cc9a0af398c6bbd2baf1a878 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 9 Aug 2024 18:00:37 +0000 Subject: [PATCH 054/180] replacing centos7 with rocky8 in packer modules --- modules/packer/custom-image/README.md | 2 +- modules/packer/custom-image/variables.pkr.hcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 190a2429d0..b464902b1f 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -293,7 +293,7 @@ No resources. | [shell\_scripts](#input\_shell\_scripts) | A list of paths to local shell scripts which will be uploaded to customize the VM image | `list(string)` | `[]` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
<pre>object({<br>  enable_secure_boot          = bool<br>  enable_vtpm                 = bool<br>  enable_integrity_monitoring = bool<br>})</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre>
| no | | [source\_image](#input\_source\_image) | Source OS image to build from | `string` | `null` | no | -| [source\_image\_family](#input\_source\_image\_family) | Alternative to source\_image. Specify image family to build from latest image in family | `string` | `"hpc-centos-7"` | no | +| [source\_image\_family](#input\_source\_image\_family) | Alternative to source\_image. Specify image family to build from latest image in family | `string` | `"hpc-rocky-linux-8"` | no | | [source\_image\_project\_id](#input\_source\_image\_project\_id) | A list of project IDs to search for the source image. Packer will search the
first project ID in the list first, and fall back to the next in the list,
until it finds the source image. | `list(string)` | `null` | no | | [ssh\_username](#input\_ssh\_username) | Username to use for SSH access to VM | `string` | `"hpc-toolkit-packer"` | no | | [startup\_script](#input\_startup\_script) | Startup script (as raw string) used to build the custom Linux VM image (overridden by var.startup\_script\_file if both are set) | `string` | `null` | no | diff --git a/modules/packer/custom-image/variables.pkr.hcl b/modules/packer/custom-image/variables.pkr.hcl index 0fd12991d4..3cede102ce 100644 --- a/modules/packer/custom-image/variables.pkr.hcl +++ b/modules/packer/custom-image/variables.pkr.hcl @@ -99,7 +99,7 @@ variable "source_image" { variable "source_image_family" { description = "Alternative to source_image. Specify image family to build from latest image in family" type = string - default = "hpc-centos-7" + default = "hpc-rocky-linux-8" } variable "service_account_email" { From 2580598fa287fcf7ca863e3470e8c20d5ff495fb Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 9 Aug 2024 18:33:04 +0000 Subject: [PATCH 055/180] replacing centos7 with rocky8 in vm-instance modules --- modules/compute/vm-instance/README.md | 2 +- modules/compute/vm-instance/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index fb3fde84e6..62b2c28b9a 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -216,7 +216,7 @@ limitations under the License. | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
<pre>list(object({<br>  type = string,<br>  count = number<br>}))</pre>
| `[]` | no | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
<pre>{<br>  "family": "hpc-centos-7",<br>  "project": "cloud-hpc-image-public"<br>}</pre>
| no | +| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
<pre>{<br>  "family": "hpc-rocky-linux-8",<br>  "project": "cloud-hpc-image-public"<br>}</pre>
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count](#input\_local\_ssd\_count) | The number of local SSDs to attach to each VM. See https://cloud.google.com/compute/docs/disks/local-ssd. | `number` | `0` | no | | [local\_ssd\_interface](#input\_local\_ssd\_interface) | Interface to be used with local SSDs. Can be either 'NVME' or 'SCSI'. No effect unless `local_ssd_count` is also set. | `string` | `"NVME"` | no | diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index f675325187..a874ddf825 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -30,7 +30,7 @@ variable "instance_image" { type = map(string) default = { project = "cloud-hpc-image-public" - family = "hpc-centos-7" + family = "hpc-rocky-linux-8" } validation { From a69f541b5698040f62267ce44a2aaba75c1f7ae9 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 9 Aug 2024 19:08:36 +0000 Subject: [PATCH 056/180] replacing centos7 with rocky8 in nfs-server modules --- community/modules/file-system/nfs-server/README.md | 2 +- community/modules/file-system/nfs-server/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/file-system/nfs-server/README.md b/community/modules/file-system/nfs-server/README.md index 198e5014b6..c2fe8bebfd 100644 --- a/community/modules/file-system/nfs-server/README.md +++ b/community/modules/file-system/nfs-server/README.md @@ -126,7 +126,7 @@ No modules. | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used as name of the NFS instance if no name is specified. | `string` | n/a | yes | | [disk\_size](#input\_disk\_size) | Storage size gb | `number` | `"100"` | no | | [image](#input\_image) | DEPRECATED: The VM image used by the nfs server | `string` | `null` | no | -| [instance\_image](#input\_instance\_image) | The VM image used by the nfs server.

Expected Fields:<br>name: The name of the image. Mutually exclusive with family.<br>family: The image family to use. Mutually exclusive with name.<br>project: The project where the image is hosted. | `map(string)` | <pre>{<br>  "family": "hpc-centos-7",<br>  "project": "cloud-hpc-image-public"<br>}</pre>
| no | +| [instance\_image](#input\_instance\_image) | The VM image used by the nfs server.

Expected Fields:<br>name: The name of the image. Mutually exclusive with family.<br>family: The image family to use. Mutually exclusive with name.<br>project: The project where the image is hosted. | `map(string)` | <pre>{<br>  "family": "hpc-rocky-linux-8",<br>  "project": "cloud-hpc-image-public"<br>}</pre>
| no | | [labels](#input\_labels) | Labels to add to the NFS instance. Key-value pairs. | `map(string)` | n/a | yes | | [local\_mounts](#input\_local\_mounts) | Mountpoint for this NFS compute instance | `list(string)` |
<pre>[<br>  "/data"<br>]</pre>
| no | | [machine\_type](#input\_machine\_type) | Type of the VM instance to use | `string` | `"n2d-standard-2"` | no | diff --git a/community/modules/file-system/nfs-server/variables.tf b/community/modules/file-system/nfs-server/variables.tf index c09564bc0e..c7db5b9cfa 100644 --- a/community/modules/file-system/nfs-server/variables.tf +++ b/community/modules/file-system/nfs-server/variables.tf @@ -72,7 +72,7 @@ variable "instance_image" { type = map(string) default = { project = "cloud-hpc-image-public" - family = "hpc-centos-7" + family = "hpc-rocky-linux-8" } validation { From 5d5778a3bc8d9eea6c9283ab6a24414c1b06287e Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 9 Aug 2024 19:25:26 +0000 Subject: [PATCH 057/180] update test files --- .../expectations/igc_pkr/one/image/variables.pkr.hcl | 2 +- .../expectations/text_escape/zero/lime/variables.pkr.hcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl index 0fd12991d4..3cede102ce 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl @@ -99,7 +99,7 @@ variable "source_image" { variable "source_image_family" { description = "Alternative to source_image. Specify image family to build from latest image in family" type = string - default = "hpc-centos-7" + default = "hpc-rocky-linux-8" } variable "service_account_email" { diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl index 0fd12991d4..3cede102ce 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl @@ -99,7 +99,7 @@ variable "source_image" { variable "source_image_family" { description = "Alternative to source_image. Specify image family to build from latest image in family" type = string - default = "hpc-centos-7" + default = "hpc-rocky-linux-8" } variable "service_account_email" { From 3b936f35f395d734b38cd716bdbaf5bb6c12f312 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 9 Aug 2024 19:38:59 +0000 Subject: [PATCH 058/180] Babysit. 
Show "less interesting" statuses first **Motivation:** in case of summary not fitting into a screen, do not hide "interesting" / actionable builds --- tools/cloud-build/babysit/cli_ui.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/cloud-build/babysit/cli_ui.py b/tools/cloud-build/babysit/cli_ui.py index 8831bbe070..503a9bef34 100644 --- a/tools/cloud-build/babysit/cli_ui.py +++ b/tools/cloud-build/babysit/cli_ui.py @@ -62,7 +62,10 @@ def sleep(self, sec: int) -> None: time.sleep(sec) def _render_summary(self, builds: Sequence[Build]) -> None: - order_fn = lambda bc: (bc.build.status, trig_name(bc.build)) + status_order = { # show success and pending first (as less interesting) + Status.SUCCESS: 0, + Status.PENDING: 1} + order_fn = lambda bc: (status_order.get(bc.build.status, 100), bc.build.status, trig_name(bc.build)) cnt = defaultdict(int) ordered = sorted(latest_by_trigger(builds).values(), key=order_fn) From 3353dcc45c12140aa7fd49dfe36f35a8477d2559 Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 30 Jul 2024 19:12:40 +0200 Subject: [PATCH 059/180] Defining replicas and regions for CloudSQL secret --- .../slurm-cloudsql-federation/README.md | 1 + .../slurm-cloudsql-federation/main.tf | 4 ++++ .../slurm-cloudsql-federation/outputs.tf | 9 ++++---- .../slurm-cloudsql-federation/variables.tf | 9 ++++++++ .../controller.tf | 22 ++++++++++++++++++- .../variables.tf | 5 +++++ 6 files changed, 45 insertions(+), 5 deletions(-) diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index e89f6764b6..5fa2782290 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -86,6 +86,7 @@ No modules. | [sql\_password](#input\_sql\_password) | Password for the SQL database. | `any` | `null` | no | | [sql\_username](#input\_sql\_username) | Username for the SQL database | `string` | `"slurm"` | no | | [tier](#input\_tier) | The machine type to use for the SQL instance | `string` | n/a | yes | +| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secret |
<pre>list(object({<br>  location = string<br>  kms_key_name = optional(string)<br>}))</pre>
| `[]` | no | ## Outputs diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index 09de939b72..6e2bfaceeb 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -19,6 +19,10 @@ locals { labels = merge(var.labels, { ghpc_module = "slurm-cloudsql-federation", ghpc_role = "database" }) } +locals { + user_managed_replication = var.user_managed_replication +} + resource "random_id" "resource_name_suffix" { byte_length = 4 } diff --git a/community/modules/database/slurm-cloudsql-federation/outputs.tf b/community/modules/database/slurm-cloudsql-federation/outputs.tf index 5f78c3adc8..21d8bbfcc9 100644 --- a/community/modules/database/slurm-cloudsql-federation/outputs.tf +++ b/community/modules/database/slurm-cloudsql-federation/outputs.tf @@ -18,9 +18,10 @@ output "cloudsql" { description = "Describes the cloudsql instance." sensitive = true value = { - server_ip = google_sql_database_instance.instance.ip_address[0].ip_address - user = google_sql_user.users.name - password = google_sql_user.users.password - db_name = google_sql_database.database.name + server_ip = google_sql_database_instance.instance.ip_address[0].ip_address + user = google_sql_user.users.name + password = google_sql_user.users.password + db_name = google_sql_database.database.name + user_managed_replication = local.user_managed_replication } } diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index 701f15d1ea..ec41c70e9d 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -96,3 +96,12 @@ variable "private_vpc_connection_peering" { type = string default = null } + +variable "user_managed_replication" { + type = list(object({ + location = string + kms_key_name = optional(string) + })) + description = "Replication parameters that will be used for defined secrets" + default = [] +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index d2c345cf8d..4e9e74c500 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -131,7 +131,27 @@ resource "google_secret_manager_secret" "cloudsql" { secret_id = "${local.slurm_cluster_name}-slurm-secret-cloudsql" replication { - auto {} + dynamic "auto" { + for_each = length(var.cloudsql.user_managed_replication) == 0 ? [1] : [] + content {} + } + dynamic "user_managed" { + for_each = length(var.cloudsql.user_managed_replication) == 0 ? 
[] : [1] + content { + dynamic "replicas" { + for_each = nonsensitive(var.cloudsql.user_managed_replication) + content { + location = replicas.value.location + dynamic "customer_managed_encryption" { + for_each = compact([replicas.value.kms_key_name]) + content { + kms_key_name = customer_managed_encryption.value + } + } + } + } + } + } } labels = { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 44ed33f994..f5765ef712 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -588,12 +588,17 @@ Use this database instead of the one on the controller. user : The user to access the database as. password : The password, given the user, to access the given database. (sensitive) db_name : The database to access. + user_managed_replication : The list of location and (optional) kms_key_name for secret EOD type = object({ server_ip = string user = string password = string # sensitive db_name = string + user_managed_replication = optional(list(object({ + location = string + kms_key_name = optional(string) + })), []) }) default = null sensitive = true From 0ba99e531847b6f083ad7fec9af146e354b66335 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 9 Aug 2024 16:26:13 -0500 Subject: [PATCH 060/180] Update README for Cloud SQL Slurm changes --- community/modules/database/slurm-cloudsql-federation/README.md | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index 5fa2782290..5fbee1bf2e 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -86,7 +86,7 @@ No modules. | [sql\_password](#input\_sql\_password) | Password for the SQL database. | `any` | `null` | no | | [sql\_username](#input\_sql\_username) | Username for the SQL database | `string` | `"slurm"` | no | | [tier](#input\_tier) | The machine type to use for the SQL instance | `string` | n/a | yes | -| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secret |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | +| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | ## Outputs diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 96c9f41272..c3599c61c3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -230,7 +230,7 @@ limitations under the License. | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | -| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | +| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of locations and (optional) kms\_key\_name values for the secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
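A minimal blueprint sketch, not taken from the patches themselves, of how the new `user_managed_replication` input could be supplied. The locations and KMS key name are hypothetical placeholders, and other required settings of the `slurm-cloudsql-federation` module (for example the network it attaches to) are omitted. Leaving the list empty keeps the default automatic secret replication that the `controller.tf` change above selects when the list is empty.

```yaml
  - id: slurm_db
    source: community/modules/database/slurm-cloudsql-federation
    use: [network1]  # hypothetical VPC module id
    settings:
      tier: db-n1-standard-2  # required input per the module README
      user_managed_replication:
        # replicate the generated secret to explicit locations
        - location: us-central1
        - location: europe-west1
          # optional CMEK key; must live in the same location as the replica
          kms_key_name: projects/my-project/locations/europe-west1/keyRings/my-ring/cryptoKeys/my-key
```

The controller module would then typically consume these values by listing the database module in its `use:` block, which populates the `cloudsql` input shown in this table.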
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | From ea22ee9650c2c14b5483942b1ffd4fde39f4c7bb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 21:57:19 +0000 Subject: [PATCH 061/180] Bump django from 4.2.11 to 4.2.15 in /community/front-end/ofe Bumps [django](https://github.com/django/django) from 4.2.11 to 4.2.15. - [Commits](https://github.com/django/django/compare/4.2.11...4.2.15) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 6f57f7f7fa..a3b1362c93 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.11 +Django==4.2.15 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.15.2 From 1c01914f90fc7bde1b9abc281390a7d2a7843364 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 10 Aug 2024 05:49:17 +0000 Subject: [PATCH 062/180] cleanup-build. Minor improvements * Look for `gcluster` in `PATH` to enable running in `/tmp`; * Harcode default `project_id` to simplify usage; **Breaking change:** removed positional argument `project_id` --- tools/cleanup-build.py | 52 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/tools/cleanup-build.py b/tools/cleanup-build.py index eec3384ef9..8061ae5958 100755 --- a/tools/cleanup-build.py +++ b/tools/cleanup-build.py @@ -12,15 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse -import os -import shutil -import tarfile - -# pip install google-cloud-storage -from google.cloud import storage - -DESCRIPTION = """ +""" This tool automates some manual tasks for cleaning up failed builds. When provided with the uri for a deployment folder this tool will: - download the tar locally @@ -29,9 +21,20 @@ - remove the tar and deployment folder Usage: -tools/cleanup-build.py my-project gs://my-bucket/test-name/build.tgz +tools/cleanup-build.py gs://my-bucket/test-name/build.tgz """ +import argparse +import os +import shutil +import tarfile +import shlex +import subprocess +import sys + +# pip install google-cloud-storage +from google.cloud import storage + def cp_from_gcs(gcs_source_uri: str, local_destination_path: str, project_id: str) -> str: """Downloads a file from Google Cloud Storage to a local destination. 
Args: @@ -53,10 +56,17 @@ def unpack_tgz(tar_file: str, destination_folder: str): with tarfile.open(tar_file, "r:gz") as tar: tar.extractall(destination_folder) +def gcluster_path() -> str: + gcluster = "gcluster" + if os.path.exists(gcluster): + return f"./{gcluster}" + if shutil.which(gcluster) is not None: + return gcluster # it's in PATH + raise RuntimeError(f"Could not find {gcluster} in PATH or current directory") + def destroy(deployment_folder: str) -> bool: - import subprocess - import sys - process = subprocess.Popen(["./ghpc" , "destroy", deployment_folder, "--auto-approve"], stdout=subprocess.PIPE) + cmd = f"{gcluster_path()} destroy {deployment_folder} --auto-approve" + process = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) for line in iter(lambda: process.stdout.read(1), b""): sys.stdout.buffer.write(line) process.wait() @@ -65,19 +75,13 @@ def destroy(deployment_folder: str) -> bool: stdout, stderr = process.communicate() print(f'stdout: {stdout}') print(f'stderr: {stderr}\n\n') - print("Deployment destroy failed. Command to manually destroy:") - print(f"./ghpc destroy {deployment_folder} --auto-approve") + print(f"Deployment destroy failed. Command to manually destroy:\n{cmd}") return False print("Deployment destroyed") return True -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("project_id", help="Your Google Cloud project ID.") - parser.add_argument("gcs_tar_path", help="The path to the GCS tar file.") - args = parser.parse_args() - +def main(args: argparse.Namespace) -> None: print('Downloading tgz file') tgz_file = cp_from_gcs(args.gcs_tar_path, ".", args.project_id) @@ -92,4 +96,8 @@ def main(): shutil.rmtree(deployment_folder) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--project_id", type=str, default="hpc-toolkit-dev", help="Your Google Cloud project ID.") + parser.add_argument("gcs_tar_path", help="The path to the GCS tar file.") + + main(parser.parse_args()) From 49596dbce3967b1d8808f7d30de9a3ac3f72c071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Sat, 10 Aug 2024 06:22:21 +0000 Subject: [PATCH 063/180] fix passing additional networks to login nodes --- .../scheduler/schedmd-slurm-gcp-v6-controller/login.tf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index e693dc22f8..8eda16a4e5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -72,10 +72,11 @@ module "slurm_login_instance" { labels = merge(each.value.labels, local.files_cs_labels) num_instances = each.value.num_instances - region = each.value.region - static_ips = each.value.static_ips - subnetwork = each.value.subnetwork - zone = each.value.zone + additional_networks = each.value.additional_networks + region = each.value.region + static_ips = each.value.static_ips + subnetwork = each.value.subnetwork + zone = each.value.zone # trigger replacement of login nodes when the controller instance is replaced replace_trigger = module.slurm_controller_instance.instances_self_links[0] From 587b26b289c058fd0b5a8d6d232a54ce6c8775ee Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 10 Aug 2024 06:13:22 +0000 Subject: [PATCH 064/180] Fix misuse of 
`log.exception` --- .../modules/slurm_files/scripts/setup.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index bee74a9cdf..e459102d31 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -173,8 +173,7 @@ def run_custom_scripts(): log.error(f"script {script} did not complete within timeout={timeout}") raise e except Exception as e: - log.error(f"script {script} encountered an exception") - log.exception(e) + log.exception(f"script {script} encountered an exception") raise e @@ -508,7 +507,6 @@ def main(): ) log.error("Aborting setup...") failed_motd() - except Exception as e: - log.exception(e) - log.error("Aborting setup...") + except Exception: + log.exception("Aborting setup...") failed_motd() From 8ce6c18bddbfaf7f82034a7f4044593d7c293e84 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 6 Aug 2024 16:12:30 -0700 Subject: [PATCH 065/180] Add setting `allow_automatic_updates: false` to examples --- community/examples/AMD/hpc-amd-slurm.yaml | 2 ++ community/examples/client-google-cloud-storage.yaml | 1 + community/examples/hpc-build-slurm-image.yaml | 1 + community/examples/hpc-slurm-gromacs.yaml | 1 + community/examples/hpc-slurm-local-ssd.yaml | 1 + community/examples/hpc-slurm-ramble-gromacs.yaml | 1 + community/examples/hpc-slurm-sharedvpc.yaml | 2 ++ community/examples/hpc-slurm6-apptainer.yaml | 1 + community/examples/hpc-slurm6-tpu-maxtext.yaml | 1 + community/examples/htc-htcondor.yaml | 2 ++ community/examples/htc-slurm.yaml | 4 ++++ community/examples/omnia-cluster.yaml | 1 + community/examples/tutorial-fluent.yaml | 1 + community/examples/tutorial-starccm.yaml | 1 + examples/cae/cae-slurm.yaml | 2 ++ examples/hcls-blueprint.yaml | 3 +++ examples/hpc-enterprise-slurm.yaml | 7 +++++++ examples/hpc-slurm-static.yaml | 2 ++ examples/hpc-slurm.yaml | 3 +++ examples/image-builder.yaml | 1 + examples/pfs-lustre.yaml | 1 + examples/ps-slurm.yaml | 1 + examples/serverless-batch-mpi.yaml | 4 +++- examples/serverless-batch.yaml | 3 ++- modules/scheduler/batch-job-template/README.md | 2 +- modules/scheduler/batch-job-template/variables.tf | 3 ++- tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml | 1 + 27 files changed, 49 insertions(+), 4 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 0eb1f71571..5decf96a2d 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -179,6 +179,7 @@ deployment_groups: node_count_dynamic_max: 10 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: low_cost_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -194,6 +195,7 @@ deployment_groups: node_count_dynamic_max: 50 bandwidth_tier: gvnic_enabled enable_placement: true + allow_automatic_updates: false # Because is_default is set to true, jobs will run on this partition unless an # alternative partition is specified using, for example, "srun -p lowcost" diff --git a/community/examples/client-google-cloud-storage.yaml b/community/examples/client-google-cloud-storage.yaml index e23abeed3a..b876b0b42b 100644 --- 
a/community/examples/client-google-cloud-storage.yaml +++ b/community/examples/client-google-cloud-storage.yaml @@ -57,6 +57,7 @@ deployment_groups: settings: name_prefix: workstation machine_type: e2-standard-2 + allow_automatic_updates: false - id: wait source: community/modules/scripts/wait-for-startup diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index f721019844..491cfb7c65 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -104,6 +104,7 @@ deployment_groups: settings: machine_type: n2d-standard-2 instance_image: $(vars.built_instance_image) + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index a6ccf8867c..af6b8864b8 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -97,6 +97,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml index f2e1b7e8e4..9921a5c621 100644 --- a/community/examples/hpc-slurm-local-ssd.yaml +++ b/community/examples/hpc-slurm-local-ssd.yaml @@ -73,6 +73,7 @@ deployment_groups: machine_type: c2-standard-4 node_count_dynamic_max: 5 node_count_static: 0 + allow_automatic_updates: false - id: partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 5396c4aef3..523b543c53 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -118,6 +118,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-sharedvpc.yaml b/community/examples/hpc-slurm-sharedvpc.yaml index 6f18ad040f..827824e432 100644 --- a/community/examples/hpc-slurm-sharedvpc.yaml +++ b/community/examples/hpc-slurm-sharedvpc.yaml @@ -62,6 +62,7 @@ deployment_groups: node_count_dynamic_max: 4 machine_type: n2-standard-2 enable_placement: false # the default is: true + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -77,6 +78,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 09a02fa9d4..6848b1b4f0 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -78,6 +78,7 @@ deployment_groups: instance_image: $(vars.custom_image) instance_image_custom: true bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm6-tpu-maxtext.yaml b/community/examples/hpc-slurm6-tpu-maxtext.yaml index 
ab88b6f2de..5e172cd5c2 100644 --- a/community/examples/hpc-slurm6-tpu-maxtext.yaml +++ b/community/examples/hpc-slurm6-tpu-maxtext.yaml @@ -100,6 +100,7 @@ deployment_groups: name: ns2 node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 08281bdaa3..ce93439b67 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -104,6 +104,7 @@ deployment_groups: name_prefix: grp1 instance_image: $(vars.new_image) min_idle: 2 + allow_automatic_updates: false - id: htcondor_execute_point_spot source: community/modules/compute/htcondor-execute-point @@ -117,6 +118,7 @@ deployment_groups: name_prefix: spot instance_image: $(vars.new_image) spot: true + allow_automatic_updates: false - id: htcondor_access source: community/modules/scheduler/htcondor-access-point diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index fb9be4c147..7165923bbb 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -80,6 +80,7 @@ deployment_groups: node_count_dynamic_max: 200 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: compute_nodeset_c2s30 source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset @@ -89,6 +90,7 @@ deployment_groups: machine_type: c2-standard-30 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -112,6 +114,7 @@ deployment_groups: node_count_dynamic_max: 10 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: low_cost_nodeset_n2s4 source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset @@ -122,6 +125,7 @@ deployment_groups: node_count_dynamic_max: 10 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: low_cost_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/omnia-cluster.yaml b/community/examples/omnia-cluster.yaml index a54a7d376e..89ecfcc263 100644 --- a/community/examples/omnia-cluster.yaml +++ b/community/examples/omnia-cluster.yaml @@ -89,6 +89,7 @@ deployment_groups: name_prefix: omnia-compute add_deployment_name_before_prefix: true instance_count: 2 + allow_automatic_updates: false # This module simply makes terraform wait until the startup script is complete - id: wait diff --git a/community/examples/tutorial-fluent.yaml b/community/examples/tutorial-fluent.yaml index 23341903b8..0ff70e009e 100644 --- a/community/examples/tutorial-fluent.yaml +++ b/community/examples/tutorial-fluent.yaml @@ -144,6 +144,7 @@ deployment_groups: vm_count: 4 # Note: should match instance count collocation: "COLLOCATED" availability_domain_count: null + allow_automatic_updates: false - id: login source: modules/compute/vm-instance diff --git a/community/examples/tutorial-starccm.yaml b/community/examples/tutorial-starccm.yaml index db86f35518..91a7af09ad 100644 --- a/community/examples/tutorial-starccm.yaml +++ b/community/examples/tutorial-starccm.yaml @@ -70,6 +70,7 @@ deployment_groups: vm_count: null collocation: "COLLOCATED" availability_domain_count: null + allow_automatic_updates: false - source: community/modules/scripts/wait-for-startup kind: terraform diff --git 
a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 920d1967ba..a3e9820ab9 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -163,6 +163,7 @@ deployment_groups: machine_type: h3-standard-88 disk_type: 'pd-balanced' bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: h3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -181,6 +182,7 @@ deployment_groups: machine_type: c3-highmem-176 disk_type: 'pd-balanced' bandwidth_tier: tier_1_enabled + allow_automatic_updates: false - id: c3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/hcls-blueprint.yaml b/examples/hcls-blueprint.yaml index 1c77626f66..ee55925236 100644 --- a/examples/hcls-blueprint.yaml +++ b/examples/hcls-blueprint.yaml @@ -265,6 +265,7 @@ deployment_groups: add_deployment_name_before_prefix: true threads_per_core: 2 machine_type: c2-standard-16 + allow_automatic_updates: false - group: cluster modules: @@ -300,6 +301,7 @@ deployment_groups: name: ns node_count_dynamic_max: 20 machine_type: c2-standard-60 + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -317,6 +319,7 @@ deployment_groups: node_count_dynamic_max: 20 machine_type: g2-standard-4 enable_placement: False + allow_automatic_updates: false - id: gpu_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 21dc9e15f9..3ef0ba990f 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -108,6 +108,7 @@ deployment_groups: machine_type: n2-standard-2 instance_image: $(vars.slurm_image) enable_placement: false # the default is: true + allow_automatic_updates: false - id: n2_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -130,6 +131,7 @@ deployment_groups: bandwidth_tier: tier_1_enabled disk_type: pd-ssd disk_size_gb: 100 + allow_automatic_updates: false # use `-p c2` to submit jobs to this partition: # ex: `srun -p c2 -N 1 hostname` @@ -151,6 +153,7 @@ deployment_groups: bandwidth_tier: tier_1_enabled disk_type: pd-ssd disk_size_gb: 100 + allow_automatic_updates: false - id: c2d_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -168,6 +171,7 @@ deployment_groups: bandwidth_tier: tier_1_enabled disk_type: pd-ssd disk_size_gb: 100 + allow_automatic_updates: false - id: c3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -191,6 +195,7 @@ deployment_groups: node_conf: SocketsPerBoard: 2 CoresPerSocket: 24 + allow_automatic_updates: false # use `-p a208` to submit jobs to this partition: # ex: `srun -p a208 --gpus-per-node=8 -N 1 nvidia-smi` @@ -220,6 +225,7 @@ deployment_groups: node_conf: SocketsPerBoard: 2 CoresPerSocket: 24 + allow_automatic_updates: false # use `-p a216` to submit jobs to this partition: # ex: `srun -p a216 --gpus-per-node=16 -N 1 nvidia-smi` @@ -246,6 +252,7 @@ deployment_groups: # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks disk_type: pd-balanced disk_size_gb: 100 + allow_automatic_updates: false # use `-p h3` to submit jobs to this partition: # ex: `srun -p h3 -N 1 hostname` diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml index 41f2aac52c..fff15e07dc 100644 --- a/examples/hpc-slurm-static.yaml +++ b/examples/hpc-slurm-static.yaml @@ -51,6 +51,7 @@ 
deployment_groups: reservation_name: $(vars.static_reservation_name) machine_type: $(vars.static_reservation_machine_type) instance_image: $(vars.slurm_instance_image) + allow_automatic_updates: false - id: static_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [static_nodeset] @@ -66,6 +67,7 @@ deployment_groups: machine_type: c2d-standard-112 node_count_dynamic_max: 100 instance_image: $(vars.slurm_instance_image) + allow_automatic_updates: false - id: dynamic_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [dynamic_nodeset] diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index c425c041df..0a90bdcc89 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -47,6 +47,7 @@ deployment_groups: node_count_dynamic_max: 4 machine_type: n2-standard-2 enable_placement: false # the default is: true + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -63,6 +64,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -82,6 +84,7 @@ deployment_groups: # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks disk_type: pd-balanced bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: h3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 6bc7f6161d..63f5d89fbd 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -77,6 +77,7 @@ deployment_groups: instance_image: $(vars.custom_image) instance_image_custom: true bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/pfs-lustre.yaml b/examples/pfs-lustre.yaml index 6354ead1b8..1da2de65c9 100644 --- a/examples/pfs-lustre.yaml +++ b/examples/pfs-lustre.yaml @@ -46,3 +46,4 @@ deployment_groups: add_deployment_name_before_prefix: true instance_count: 2 machine_type: n2-standard-2 + allow_automatic_updates: false diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml index 4a28802924..11c492d6e3 100644 --- a/examples/ps-slurm.yaml +++ b/examples/ps-slurm.yaml @@ -48,6 +48,7 @@ deployment_groups: node_count_dynamic_max: 4 machine_type: c2-standard-60 enable_placement: false # the default is: true + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index af156a2a83..9dd329d6b8 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -21,7 +21,7 @@ vars: region: us-central1 zone: us-central1-c instance_image: - family: hpc-centos-7 + family: hpc-rocky-linux-8 project: cloud-hpc-image-public deployment_groups: @@ -147,6 +147,7 @@ deployment_groups: name_prefix: spack-builder add_deployment_name_before_prefix: true machine_type: c2-standard-16 + allow_automatic_updates: false ### Batch Modules ### - id: batch-job @@ -158,6 +159,7 @@ deployment_groups: machine_type: c2-standard-60 task_count: 2 mpi_mode: true + allow_automatic_updates: false - id: batch-login source: modules/scheduler/batch-login-node diff --git a/examples/serverless-batch.yaml b/examples/serverless-batch.yaml index 
6931077248..538e7d9671 100644 --- a/examples/serverless-batch.yaml +++ b/examples/serverless-batch.yaml @@ -47,8 +47,9 @@ deployment_groups: task_count: 8 task_count_per_node: 4 instance_image: - family: batch-centos-7-official + family: batch-hpc-rocky-linux-8-official project: batch-custom-image + allow_automatic_updates: false - id: batch-login source: modules/scheduler/batch-login-node diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 345aaf638e..8cf6e6d276 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -157,7 +157,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, used for the job\_id | `string` | n/a | yes | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs | `bool` | `true` | no | | [gcloud\_version](#input\_gcloud\_version) | The version of the gcloud cli being used. Used for output instructions. Valid inputs are `"alpha"`, `"beta"` and "" (empty string for default version) | `string` | `""` | no | diff --git a/modules/scheduler/batch-job-template/variables.tf b/modules/scheduler/batch-job-template/variables.tf index da214e29ea..bfce75666e 100644 --- a/modules/scheduler/batch-job-template/variables.tf +++ b/modules/scheduler/batch-job-template/variables.tf @@ -235,5 +235,6 @@ variable "allow_automatic_updates" { https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml index dcc3d41bc8..54043e0beb 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml @@ -25,6 +25,7 @@ cli_deployment_vars: slurm_image: "{family: slurm-gcp-6-6-hpc-centos-7, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c + allow_automatic_updates: false zone: us-west4-c workspace: /workspace From 08a9ff58c663f372ba394bc7ffe02d0071638be7 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 10 Aug 2024 19:10:50 +0000 Subject: [PATCH 066/180] SlurmGCP. Remove global `cfg` --- .../modules/slurm_files/scripts/load_bq.py | 17 +++---- .../modules/slurm_files/scripts/resume.py | 10 ++-- .../modules/slurm_files/scripts/setup.py | 24 +-------- .../scripts/setup_network_storage.py | 24 ++++----- .../modules/slurm_files/scripts/util.py | 49 +++++++++---------- 5 files changed, 49 insertions(+), 75 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index f876827a4c..ec52d06ea7 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -28,8 +28,7 @@ from google.api_core import retry, exceptions import util -from util import run -from util import cfg +from util import lkp, run SACCT = "sacct" @@ -176,14 +175,14 @@ def schema_field(field_name, data_type, description, required=False): Job = namedtuple("Job", job_schema.keys()) client = bq.Client( - project=cfg.project, + project=lkp.cfg.project, credentials=util.default_credentials(), client_options=util.create_client_options(util.ApiEndpoint.BQ), ) -dataset_id = f"{cfg.slurm_cluster_name}_job_data" -dataset = bq.DatasetReference(project=cfg.project, dataset_id=dataset_id) +dataset_id = f"{lkp.cfg.slurm_cluster_name}_job_data" +dataset = bq.DatasetReference(project=lkp.project, dataset_id=dataset_id) table = bq.Table( - bq.TableReference(dataset, f"jobs_{cfg.slurm_cluster_name}"), schema_fields + bq.TableReference(dataset, f"jobs_{lkp.cfg.slurm_cluster_name}"), schema_fields ) @@ -198,8 +197,8 @@ def make_job_row(job): if field_name in job } job_row["entry_uuid"] 
= uuid.uuid4().hex - job_row["cluster_id"] = cfg.cluster_id - job_row["cluster_name"] = cfg.slurm_cluster_name + job_row["cluster_id"] = lkp.cfg.cluster_id + job_row["cluster_name"] = lkp.cfg.slurm_cluster_name return job_row @@ -310,7 +309,7 @@ def update_job_idx_cache(jobs, timestamp): def main(): - if not cfg.enable_bigquery_load: + if not lkp.cfg.enable_bigquery_load: print("bigquery load is not currently enabled") exit(0) init_table() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 8301ec84b4..4473b73932 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -41,7 +41,7 @@ trim_self_link, wait_for_operation, ) -from util import cfg, lkp, NSDict, TPU +from util import lkp, NSDict, TPU import slurm_gcp_plugins @@ -138,7 +138,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None nodeset = lkp.node_nodeset(model) template = lkp.node_template(model) region = lkp.node_region(model) - partition = cfg.partitions[partition_name] + partition = lkp.cfg.partitions[partition_name] log.debug(f"create_instances_request: {model} placement: {placement_group}") body = NSDict() @@ -182,7 +182,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None ) request = lkp.compute.regionInstances().bulkInsert( - project=cfg.project, region=region, body=body.to_dict() + project=lkp.project, region=region, body=body.to_dict() ) if log.isEnabledFor(logging.DEBUG): @@ -497,7 +497,7 @@ def create_placement_request(pg_name, region): lkp=lkp, pg_name=pg_name, region=region, request_body=config ) request = lkp.compute.resourcePolicies().insert( - project=cfg.project, region=region, body=config + project=lkp.project, region=region, body=config ) log_api_request(request) return request @@ -521,7 +521,7 @@ def create_nodeset_placement_groups(node_list: list, job_id=0): region = lkp.node_region(model) groups = { - f"{cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}-{job_id}-{i}": nodes + f"{lkp.cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}-{job_id}-{i}": nodes for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT)) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index bee74a9cdf..23d5c2df3c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -27,7 +27,6 @@ import util from util import ( lkp, - cfg, dirs, slurmdirs, run, @@ -177,20 +176,6 @@ def run_custom_scripts(): log.exception(e) raise e - -def setup_secondary_disks(): - """Format and mount secondary disk""" - run( - "sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb" - ) - with open("/etc/fstab", "a") as f: - f.write( - "\n/dev/sdb {0} ext4 discard,defaults,nofail 0 2".format( - dirs.secdisk - ) - ) - - def setup_jwt_key(): jwt_key = Path(slurmdirs.state / "jwt_hs256.key") @@ -326,14 +311,11 @@ def setup_controller(): setup_jwt_key() setup_munge_key() setup_sudoers() - - if 
cfg.controller_secondary_disk: - setup_secondary_disks() setup_network_storage() run_custom_scripts() - if not cfg.cloudsql_secret: + if not lkp.cfg.cloudsql_secret: configure_mysql() run("systemctl enable slurmdbd", timeout=30) @@ -344,7 +326,7 @@ def setup_controller(): sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i" result = run( - f"{sacctmgr} add cluster {cfg.slurm_cluster_name}", timeout=30, check=False + f"{sacctmgr} add cluster {lkp.cfg.slurm_cluster_name}", timeout=30, check=False ) if "already exists" in result.stdout: log.info(result.stdout) @@ -478,8 +460,6 @@ def main(): parser.add_argument("--slurmd-feature", dest="slurmd_feature", help="Unused, to be removed.") _ = util.init_log_and_parse(parser) - lkp = util.Lookup(cfg) # noqa F811 - try: main() except subprocess.TimeoutExpired as e: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index 65e5301481..889f4cf59e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -27,7 +27,7 @@ from addict import Dict as NSDict import util -from util import lkp, run, cfg, dirs, separate +from util import lkp, run, dirs, separate from more_executors import Executors, ExceptionRetryPolicy @@ -49,7 +49,7 @@ def resolve_network_storage(nodeset=None): nodeset = None # seed mounts with the default controller mounts - if cfg.disable_default_mounts: + if lkp.cfg.disable_default_mounts: default_mounts = [] else: default_mounts = [ @@ -73,9 +73,9 @@ def resolve_network_storage(nodeset=None): # On non-controller instances, entries in network_storage could overwrite # default exports from the controller. 
Be careful, of course - mounts.update(mounts_by_local(cfg.network_storage)) + mounts.update(mounts_by_local(lkp.cfg.network_storage)) if lkp.instance_role in ("login", "controller"): - mounts.update(mounts_by_local(cfg.login_network_storage)) + mounts.update(mounts_by_local(lkp.cfg.login_network_storage)) if nodeset is not None: mounts.update(mounts_by_local(nodeset.network_storage)) @@ -193,16 +193,16 @@ def mount_path(path): def munge_mount_handler(): - if not cfg.munge_mount: + if not lkp.cfg.munge_mount: log.error("Missing munge_mount in cfg") elif lkp.is_controller: return - mount = cfg.munge_mount + mount = lkp.cfg.munge_mount server_ip = ( mount.server_ip if mount.server_ip - else (cfg.slurm_control_addr or cfg.slurm_control_host) + else (lkp.cfg.slurm_control_addr or lkp.cfg.slurm_control_host) ) remote_mount = mount.remote_mount local_mount = Path("/mnt/munge") @@ -276,18 +276,18 @@ def setup_nfs_exports(): mounts.append( NSDict( { - "server_ip": cfg.munge_mount.server_ip, - "remote_mount": cfg.munge_mount.remote_mount, + "server_ip": lkp.cfg.munge_mount.server_ip, + "remote_mount": lkp.cfg.munge_mount.remote_mount, "local_mount": Path(f"{dirs.munge}_tmp"), - "fs_type": cfg.munge_mount.fs_type, - "mount_options": cfg.munge_mount.mount_options, + "fs_type": lkp.cfg.munge_mount.fs_type, + "mount_options": lkp.cfg.munge_mount.mount_options, } ) ) # controller mounts _, con_mounts = separate_external_internal_mounts(mounts) con_mounts = {m.remote_mount: m for m in con_mounts} - for nodeset in cfg.nodeset.values(): + for nodeset in lkp.cfg.nodeset.values(): # get internal mounts for each nodeset by calling # resolve_network_storage as from a node in each nodeset ns_mounts = resolve_network_storage(nodeset=nodeset) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index b4252a2560..a7ffcac378 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -89,8 +89,6 @@ def mkdirp(path: Path) -> None: p for p in (Path(__file__).parent, Path("/slurm/scripts")) if p.is_dir() ) -# slurm-gcp config object, could be empty if not available -cfg = NSDict() # caching Lookup object lkp = None @@ -419,9 +417,6 @@ def load_config_data(config): "mount_options": "defaults,hard,intr,_netdev", } ) - - if not cfg.enable_debug_logging and isinstance(cfg.enable_debug_logging, NSDict): - cfg.enable_debug_logging = False return cfg @@ -452,8 +447,7 @@ def new_config(config): def fetch_config_yaml(): """Fetch config.yaml from bucket""" config_yaml = blob_get("config.yaml").download_as_text() - cfg = new_config(yaml.safe_load(config_yaml)) - return cfg + return new_config(yaml.safe_load(config_yaml)) def fetch_config_yaml_md5(): @@ -489,7 +483,7 @@ def get_log_path() -> Path: Returns path to log file for the current script. e.g. 
resume.py -> /var/log/slurm/resume.log """ - log_dir = Path(cfg.slurm_log_dir or ".") + log_dir = Path(lkp.cfg.slurm_log_dir or ".") return (log_dir / Path(sys.argv[0]).name).with_suffix(".log") def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: @@ -511,10 +505,10 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: args = parser.parse_args() loglevel = args.loglevel - if cfg.enable_debug_logging: + if lkp.cfg.enable_debug_logging: loglevel = logging.DEBUG if args.trace_api: - cfg.extra_logging_flags["trace_api"] = True + lkp.cfg.extra_logging_flags["trace_api"] = True # Configure root logger logging.config.dictConfig({ @@ -555,7 +549,7 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: def log_api_request(request): """log.trace info about a compute API request""" - if not cfg.extra_logging_flags.get("trace_api"): + if not lkp.cfg.extra_logging_flags.get("trace_api"): return # output the whole request object as pretty yaml @@ -1337,7 +1331,7 @@ def create_node(self, nodename): echo "startup script not found > /var/log/startup_error.log" """ with open( - Path(cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" + Path(lkp.cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" ) as script: startup_script = script.read() if isinstance(nodename, list): @@ -1412,8 +1406,8 @@ def delete_node(self, nodename): class Lookup: """Wrapper class for cached data access""" - def __init__(self, cfg=None): - self._cfg = cfg or NSDict() + def __init__(self, cfg): + self._cfg = cfg self.template_cache_path = Path(__file__).parent / "template_info.cache" @property @@ -1446,7 +1440,7 @@ def endpoint_versions(self): @property def scontrol(self): - return Path(self.cfg.slurm_bin_dir if cfg else "") / "scontrol" + return Path(self.cfg.slurm_bin_dir or "") / "scontrol" @cached_property def instance_role(self): @@ -1873,15 +1867,16 @@ def scontrol_reconfigure(lkp: Lookup) -> None: log.info("Running scontrol reconfigure") run(f"{lkp.scontrol} reconfigure", timeout=30) -# Define late globals -lkp = Lookup() -cfg = load_config_file(CONFIG_FILE) -if not cfg: - try: - cfg = fetch_config_yaml() - except Exception as e: - log.warning(f"config not found in bucket: {e}") - if cfg: - save_config(cfg, CONFIG_FILE) - -lkp = Lookup(cfg) +def _init_lkp() -> None: + cfg = load_config_file(CONFIG_FILE) + if not cfg: + try: + cfg = fetch_config_yaml() + except Exception as e: + log.warning(f"config not found in bucket: {e}") + if cfg: + save_config(cfg, CONFIG_FILE) + global lkp + lkp = Lookup(cfg) + +_init_lkp() # TODO: remove this line after refactoring From b32c617c3a856e0f6c0b6ae67edf523393a294f5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:22:48 +0000 Subject: [PATCH 067/180] Bump google.golang.org/api from 0.190.0 to 0.191.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.190.0 to 0.191.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.190.0...v0.191.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 8 ++++---- go.sum | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 1c67acf666..1b56def0c7 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,7 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.190.0 + google.golang.org/api v0.191.0 ) require ( @@ -51,8 +51,8 @@ require ( go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect golang.org/x/mod v0.17.0 // indirect - golang.org/x/sync v0.7.0 // indirect - golang.org/x/time v0.5.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/time v0.6.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect @@ -96,7 +96,7 @@ require ( go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.25.0 // indirect golang.org/x/net v0.27.0 // indirect - golang.org/x/oauth2 v0.21.0 // indirect + golang.org/x/oauth2 v0.22.0 // indirect golang.org/x/sys v0.23.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.1 // indirect diff --git a/go.sum b/go.sum index 3c5627ba31..1d3799328c 100644 --- a/go.sum +++ b/go.sum @@ -646,8 +646,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= +golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -662,8 +662,8 @@ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -754,8 +754,8 @@ golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= +golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.190.0 h1:ASM+IhLY1zljNdLu19W1jTmU6A+gMk6M46Wlur61s+Q= -google.golang.org/api v0.190.0/go.mod h1:QIr6I9iedBLnfqoD6L6Vze1UvS5Hzj5r2aUBOaZnLHo= +google.golang.org/api v0.191.0 h1:cJcF09Z+4HAB2t5qTQM1ZtfL/PemsLFkcFG67qq2afk= +google.golang.org/api v0.191.0/go.mod h1:tD5dsFGxFza0hnQveGfVk9QQYKcfp+VzgRqyXFxE0+E= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= From aceb86feb41255710c99fb5802d3d47dd2f770ad Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:22:55 +0000 Subject: [PATCH 068/180] Bump golang.org/x/sys from 0.23.0 to 0.24.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.23.0 to 0.24.0. - [Commits](https://github.com/golang/sys/compare/v0.23.0...v0.24.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 1c67acf666..7f89bc6250 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( golang.org/x/crypto v0.25.0 // indirect golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.23.0 + golang.org/x/sys v0.24.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index 3c5627ba31..76815fc36a 100644 --- a/go.sum +++ b/go.sum @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= -golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From 63c19f2eba6861d48235013f78d7e7a096387883 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 22 Jul 2024 13:26:57 +0000 Subject: [PATCH 069/180] Added support multivpc with GKE --- examples/README.md | 22 +++++ examples/gke-multi-vpc-a3-megagpu-8g.yaml | 94 +++++++++++++++++++ modules/compute/gke-node-pool/README.md | 40 +++++++- modules/compute/gke-node-pool/main.tf | 11 +++ modules/compute/gke-node-pool/variables.tf | 24 +++++ modules/network/multivpc/main.tf | 2 +- modules/scheduler/gke-cluster/README.md | 13 ++- modules/scheduler/gke-cluster/main.tf | 17 ++-- modules/scheduler/gke-cluster/variables.tf | 6 ++ modules/scheduler/gke-cluster/versions.tf | 4 - .../pre-existing-gke-cluster/README.md | 29 ++++++ .../pre-existing-gke-cluster/main.tf | 54 +++++++++++ .../pre-existing-gke-cluster/variables.tf | 25 +++++ .../pre-existing-gke-cluster/versions.tf | 4 + 14 files changed, 328 insertions(+), 17 deletions(-) create mode 100644 examples/gke-multi-vpc-a3-megagpu-8g.yaml diff --git a/examples/README.md b/examples/README.md index b10c2c7b74..5b37cecff3 100644 --- a/examples/README.md +++ b/examples/README.md @@ -52,6 +52,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-gke.yaml](#hpc-gkeyaml--) ![community-badge] ![experimental-badge] * [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge] * [storage-gke](#storage-gkeyaml--) ![community-badge] ![experimental-badge] + * [multivpc-gke](#multivpc-gkeyaml--) ![community-badge] ![experimental-badge] * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml--) ![community-badge] ![experimental-badge] * [htc-slurm.yaml](#htc-slurmyaml-) ![community-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] @@ -1535,6 +1536,27 @@ cleaned up when the job is deleted. 
[storage-gke.yaml]: ../examples/storage-gke.yaml +### [multivpc-gke.yaml] ![community-badge] ![experimental-badge] + +This blueprint shows how to use multiple VPC networks with GKE in the toolkit. + +The blueprint contains the following: + +* Two deployment groups + * The primary to provision the cluster + * The second group to apply mltiple VPCs to the cluster + +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `pre-existing-gke-cluster` module needs access to the Kubernetes API server +> to apply a manifest. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. + +[multivpc-gke.yaml]: ../examples/gke-multi-vpc-a3-mgagpu-8g.yaml + ### [htc-htcondor.yaml] ![community-badge] ![experimental-badge] This blueprint provisions an auto-scaling [HTCondor][htcondor] pool based upon diff --git a/examples/gke-multi-vpc-a3-megagpu-8g.yaml b/examples/gke-multi-vpc-a3-megagpu-8g.yaml new file mode 100644 index 0000000000..78b1158189 --- /dev/null +++ b/examples/gke-multi-vpc-a3-megagpu-8g.yaml @@ -0,0 +1,94 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: gke-multi-vpc-a3-megagpu-8g + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gkemvpc + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + enable_multi_networking: true + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) + display_name: "kubectl-access-network" # Allows your machine run kubectl command. 
+ outputs: [instructions] + +- group: multivpcgke + modules: + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 8 + subnetwork_cidr_suffix: 24 + + - id: existing-gke-cluster + source: modules/scheduler/pre-existing-gke-cluster + use: [gpunets] + settings: + cluster_name: $(vars.deployment_name) + + - id: a3-megagpu_pool + source: modules/compute/gke-node-pool + use: [existing-gke-cluster, gpunets] + settings: + machine_type: a3-megagpu-8g + guest_accelerator: + - type: nvidia-h100-mega-80gb + count: 8 + gpu_driver_installation_config: + - gpu_driver_version: LATEST + static_node_count: 2 + disk_type: pd-ssd + zones: [$(vars.zone)] + local_ssd_count_ephemeral_storage: 16 + + - id: job-template + source: modules/compute/gke-job-template + use: [a3-megagpu_pool] + settings: + image: busybox + command: + - echo + - Hello World + node_count: 3 + outputs: [instructions] diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 2daf69a794..dc54cb67c3 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -73,6 +73,10 @@ use-cases). In this case, ensure that you turn off the [enable_secure_boot](#input\_enable\_secure\_boot) option to allow unsigned kernel modules to be loaded. +To maximize GPU network bandwidth, nodepools accept multiple VPCs. Pass a multivpc module to gke-node-pool module, and [install GPUDirect and configure NCCL] (https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl). +> **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. +> To create network objects in gke cluster pass, the multivpc module to a pre-existing-gke-cluster module as well or [apply a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment). + ### GPUs Examples There are several ways to add GPUs to a GKE node pool. 
See @@ -149,7 +153,7 @@ Following is an example of using a GPU attached to an `n1` machine: count: 2 ``` -Finally, the following is an example of using a GPU (with sharing config) attached to an `n1` machine: +The following is an example of using a GPU (with sharing config) attached to an `n1` machine: ```yaml - id: n1-t4-pool @@ -168,6 +172,39 @@ Finally, the following is an example of using a GPU (with sharing config) attach gpu_sharing_strategy: "TIME_SHARING" ``` +Finally, the following is adding multivpc to a node pool (with sharing config) attached to an `n1` machine: + +```yaml + - id: network + source: modules/network/vpc + + - id: multinetwork + source: modules/network/multivpc + settings: + network_name_prefix: multivpc-net + network_count: 8 + global_ip_address_range: 172.16.0.0/12 + subnetwork_cidr_suffix: 16 + + - id: existing-gke-cluster ## multinetworking must be enabled in advance when cluster creation + source: modules/scheduler/pre-existing-gke-cluster + use: [multinetwork] + settings: + cluster_name: $(vars.deployment_name) + + - id: a3-megagpu_pool + source: modules/compute/gke-node-pool + use: [existing-gke-cluster, multinetwork] + settings: + machine_type: a3-megagpu-8g + guest_accelerator: + - type: nvidia-h100-mega-80gb + count: 8 + gpu_driver_installation_config: + - gpu_driver_version: LATEST + ... +``` + ## License @@ -221,6 +258,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 551ba1f5a5..d644488e62 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -152,6 +152,17 @@ resource "google_container_node_pool" "node_pool" { } } + network_config { + dynamic "additional_node_network_configs" { + for_each = var.additional_networks + + content { + network = additional_node_network_configs.value.network + subnetwork = additional_node_network_configs.value.subnetwork + } + } + } + timeouts { create = var.timeout_create update = var.timeout_update diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 4cb4bf0af1..4f9374e0c6 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -265,3 +265,27 @@ variable "service_account" { error_message = "service_account is deprecated and replaced with service_account_email and scopes." } } +variable "additional_networks" { + description = "Additional network interface details for GCE, if any. Providing additional networks adds additional node networks to the node pool" + default = [] + type = list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + nic_type = string + stack_type = string + queue_count = number + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + alias_ip_range = list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })) + })) +} diff --git a/modules/network/multivpc/main.tf b/modules/network/multivpc/main.tf index f52b826808..fc7f2b6e9c 100644 --- a/modules/network/multivpc/main.tf +++ b/modules/network/multivpc/main.tf @@ -24,7 +24,7 @@ locals { maximum_subnetworks = pow(2, local.subnetwork_new_bits) additional_networks = [ for vpc in module.vpcs : - merge(var.network_interface_defaults, { subnetwork = vpc.subnetwork_name, subnetwork_project = var.project_id }) + merge(var.network_interface_defaults, { network = vpc.network_name, subnetwork = vpc.subnetwork_name, subnetwork_project = var.project_id }) ] } diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index e8bd757493..6b3dbfd5ef 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -42,6 +42,16 @@ are created in the VPC module. By default the `gke-cluster` module will look for ranges with the names `pods` and `services`. These names can be configured using the `pods_ip_range_name` and `services_ip_range_name` settings. +If you are going to add multiple networks to the cluster, you need to + +1. Set `enable_multi_networking` to true when creating the cluster (enables Dataplane V2, too). +2. Add a second deployment group to your blueprint and continue the below in the second deployment group. +3. Add a multivpc module. +4. Add a pre-existing-gke-cluster module and pass the multivpc to the cluster. +5. 
Add a gke-node-pool module and pass the multivpc to the nodepool, too. + +Find an example of multi networking in GKE [here](../../../examples/gke-multi-vpc-a3-megagpu-8g.yaml). + ### Cluster Limitations The current implementations has the following limitations: @@ -76,7 +86,6 @@ limitations under the License. | [terraform](#requirement\_terraform) | >= 1.0 | | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | -| [kubernetes](#requirement\_kubernetes) | ~> 2.23 | ## Providers @@ -103,7 +112,6 @@ limitations under the License. | [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | ## Inputs @@ -118,6 +126,7 @@ limitations under the License. | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. | `bool` | `false` | no | | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | +| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster). This setting is immutable on clusters and enables Dataplane V2. | `bool` | `false` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 59e6822a19..00ef26359b 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -30,6 +30,9 @@ locals { }] sa_email = var.service_account_email != null ? 
var.service_account_email : data.google_compute_default_service_account.default_sa.email + + #multi networking needs Dataplane v2 enabled + derived_enable_dataplane_v2 = var.enable_multi_networking ? true : var.enable_dataplane_v2 } data "google_compute_default_service_account" "default_sa" { @@ -85,7 +88,11 @@ resource "google_container_cluster" "gke_cluster" { autoscaling_profile = var.autoscaling_profile } - datapath_provider = var.enable_dataplane_v2 ? "ADVANCED_DATAPATH" : "LEGACY_DATAPATH" + datapath_provider = local.derived_enable_dataplane_v2 ? "ADVANCED_DATAPATH" : "LEGACY_DATAPATH" + + enable_multi_networking = var.enable_multi_networking + + networking_mode = "VPC_NATIVE" network_policy { # Enabling NetworkPolicy for clusters with DatapathProvider=ADVANCED_DATAPATH @@ -290,14 +297,6 @@ resource "google_project_iam_member" "node_service_account_artifact_registry" { member = "serviceAccount:${local.sa_email}" } -data "google_client_config" "default" {} - -provider "kubernetes" { - host = "https://${google_container_cluster.gke_cluster.endpoint}" - cluster_ca_certificate = base64decode(google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) - token = data.google_client_config.default.access_token -} - module "workload_identity" { count = var.configure_workload_identity_sa ? 1 : 0 source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 3cf7c7dc5f..b20ec4ffcc 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -296,3 +296,9 @@ variable "service_account" { error_message = "service_account is deprecated and replaced with service_account_email and scopes." } } + +variable "enable_multi_networking" { + description = "Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster). This setting is immutable on clusters and enables Dataplane V2." + type = bool + default = false +} diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 56db91d4db..d6b4233510 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -24,10 +24,6 @@ terraform { source = "hashicorp/google-beta" version = "> 5.0" } - kubernetes = { - source = "hashicorp/kubernetes" - version = "~> 2.23" - } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.37.2" diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index e4a40c2315..c93462d89b 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -31,6 +31,29 @@ GKE node pool will be created. > **_NOTE:_** The `project_id` and `region` settings would be inferred from the > deployment variables of the same name, but they are included here for clarity. +### Multi-networking + +To create network objects in GKE cluster, you can pass a multivpc module to a pre-existing-gke-cluster module instead of [applying a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment). 
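+
+If you would rather apply the manifest by hand, the objects this module creates amount to one `GKENetworkParamSet` and one `Network` per additional VPC. A minimal sketch of the equivalent manifest (the object, VPC, and subnetwork names below are illustrative and must match your own networks):
+
+```yaml
+apiVersion: networking.gke.io/v1
+kind: GKENetworkParamSet
+metadata:
+  name: additional-network-0
+spec:
+  vpc: multivpc-net-0
+  vpcSubnet: multivpc-net-0-subnet
+  deviceMode: NetDevice
+---
+apiVersion: networking.gke.io/v1
+kind: Network
+metadata:
+  name: additional-network-0
+spec:
+  type: Device
+  parametersRef:
+    group: networking.gke.io
+    kind: GKENetworkParamSet
+    name: additional-network-0
+```
+
+The following blueprint snippet shows the module-based approach: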
+ +```yaml + - id: network + source: modules/network/vpc + + - id: multinetwork + source: modules/network/multivpc + settings: + network_name_prefix: multivpc-net + network_count: 8 + global_ip_address_range: 172.16.0.0/12 + subnetwork_cidr_suffix: 16 + + - id: existing-gke-cluster ## multinetworking must be enabled in advance when cluster creation + source: modules/scheduler/pre-existing-gke-cluster + use: [multinetwork] + settings: + cluster_name: $(vars.deployment_name) +``` + ## License @@ -54,12 +77,14 @@ limitations under the License. |------|---------| | [terraform](#requirement\_terraform) | >= 1.0.0 | | [google](#requirement\_google) | > 5.0 | +| [kubernetes](#requirement\_kubernetes) | > 2.23 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | > 5.0 | +| [kubernetes.gke\_cluster](#provider\_kubernetes.gke\_cluster) | > 2.23 | ## Modules @@ -69,12 +94,16 @@ No modules. | Name | Type | |------|------| +| [kubernetes_manifest.additional_net_params](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | +| [kubernetes_manifest.additional_nets](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. Providing additional networks creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [cluster\_name](#input\_cluster\_name) | Name of the existing cluster | `string` | n/a | yes | | [project\_id](#input\_project\_id) | Project that hosts the existing cluster | `string` | n/a | yes | | [region](#input\_region) | Region in which to search for the cluster | `string` | n/a | yes | diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index c59e35e8da..bbbeb9f8b7 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -19,3 +19,57 @@ data "google_container_cluster" "existing_gke_cluster" { project = var.project_id location = var.region } + +data "google_client_config" "default" {} + +provider "kubernetes" { + alias = "gke_cluster" + host = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" #"https://34.27.120.195" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate) +} + +resource "kubernetes_manifest" "additional_net_params" { + for_each = { for idx, network_info in var.additional_networks : idx => network_info } + + depends_on = [data.google_container_cluster.existing_gke_cluster] + + manifest = { + "apiVersion" = "networking.gke.io/v1" + "kind" = "GKENetworkParamSet" + "metadata" = { + "name" = "additional-network-${each.key}" # Unique name for each GKENetworkParamSet + } + "spec" = { + "vpc" = each.value.network + "vpcSubnet" = each.value.subnetwork + "deviceMode" = "NetDevice" + } + } + + provider = kubernetes.gke_cluster +} + +resource "kubernetes_manifest" "additional_nets" { + for_each = { for idx, network_info in var.additional_networks : idx => network_info } + + depends_on = [data.google_container_cluster.existing_gke_cluster, kubernetes_manifest.additional_net_params] + + manifest = { + "apiVersion" = "networking.gke.io/v1" + "kind" = "Network" + "metadata" = { + "name" = "additional-network-${each.key}" # Unique name for each Network + } + "spec" = { + "parametersRef" = { + "group" = "networking.gke.io" + "kind" = "GKENetworkParamSet" + "name" = "additional-network-${each.key}" # Reference the corresponding param set + } + "type" = "Device" + } + } + + provider = kubernetes.gke_cluster +} diff --git a/modules/scheduler/pre-existing-gke-cluster/variables.tf b/modules/scheduler/pre-existing-gke-cluster/variables.tf index 5d2121ba69..af7d39d8f4 100644 --- a/modules/scheduler/pre-existing-gke-cluster/variables.tf +++ b/modules/scheduler/pre-existing-gke-cluster/variables.tf @@ -28,3 +28,28 @@ variable "region" { description = "Region in which to search for the cluster" type = string } + +variable "additional_networks" { + description = "Additional network interface details for GCE, if any. Providing additional networks creates relevat network objects on the cluster." 
+ default = [] + type = list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + nic_type = string + stack_type = string + queue_count = number + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + alias_ip_range = list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })) + })) +} diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 827bfc63a9..66738b62e1 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -20,6 +20,10 @@ terraform { source = "hashicorp/google" version = "> 5.0" } + kubernetes = { + source = "hashicorp/kubernetes" + version = "> 2.23" + } } provider_meta "google" { From 68773ae4964406515ebd15ba33fe3752594d5bcb Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Sun, 28 Jul 2024 19:03:11 +0000 Subject: [PATCH 070/180] multivpc support added to gke-cluster module. example and doc updated. --- examples/README.md | 17 +++--- examples/gke-multi-vpc-a3-megagpu-8g.yaml | 47 +++++++-------- modules/compute/gke-node-pool/README.md | 17 +++--- modules/compute/gke-node-pool/main.tf | 14 +++-- modules/scheduler/gke-cluster/README.md | 35 ++++++++--- modules/scheduler/gke-cluster/main.tf | 58 ++++++++++++++++++- modules/scheduler/gke-cluster/variables.tf | 27 ++++++++- modules/scheduler/gke-cluster/versions.tf | 4 ++ .../pre-existing-gke-cluster/README.md | 2 +- .../pre-existing-gke-cluster/main.tf | 2 +- .../pre-existing-gke-cluster/variables.tf | 2 +- 11 files changed, 166 insertions(+), 59 deletions(-) diff --git a/examples/README.md b/examples/README.md index 5b37cecff3..cd3f64186f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1536,26 +1536,27 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml -### [multivpc-gke.yaml] ![community-badge] ![experimental-badge] +### [gke-multi-vpc-a3-megagpu-8g.yaml] ![community-badge] ![experimental-badge] This blueprint shows how to use multiple VPC networks with GKE in the toolkit. -The blueprint contains the following: - -* Two deployment groups - * The primary to provision the cluster - * The second group to apply mltiple VPCs to the cluster +After provisioning the cluster and the nodepool, we need to do the following: +1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests > [!Note] > The Kubernetes API server will only allow requests from authorized networks. -> The `pre-existing-gke-cluster` module needs access to the Kubernetes API server +> The `gke-cluster` module needs access to the Kubernetes API server > to apply a manifest. 
**You must use > the `authorized_cidr` variable to supply an authorized network which contains > the IP address of the machine deploying the blueprint, for example > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. -[multivpc-gke.yaml]: ../examples/gke-multi-vpc-a3-mgagpu-8g.yaml +[gke-multi-vpc-a3-megagpu-8g.yaml]: ../examples/gke-multi-vpc-a3-megagpu-8g.yaml ### [htc-htcondor.yaml] ![community-badge] ![experimental-badge] diff --git a/examples/gke-multi-vpc-a3-megagpu-8g.yaml b/examples/gke-multi-vpc-a3-megagpu-8g.yaml index 78b1158189..e15a7776ae 100644 --- a/examples/gke-multi-vpc-a3-megagpu-8g.yaml +++ b/examples/gke-multi-vpc-a3-megagpu-8g.yaml @@ -40,19 +40,6 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 - - id: gke_cluster - source: modules/scheduler/gke-cluster - use: [network1] - settings: - enable_private_endpoint: false # Allows for access from authorized public IPs - enable_multi_networking: true - master_authorized_networks: - - cidr_block: $(vars.authorized_cidr) - display_name: "kubectl-access-network" # Allows your machine run kubectl command. - outputs: [instructions] - -- group: multivpcgke - modules: - id: gpunets source: modules/network/multivpc settings: @@ -61,15 +48,20 @@ deployment_groups: network_count: 8 subnetwork_cidr_suffix: 24 - - id: existing-gke-cluster - source: modules/scheduler/pre-existing-gke-cluster - use: [gpunets] + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets] settings: - cluster_name: $(vars.deployment_name) + enable_private_endpoint: false # Allows for access from authorized public IPs + enable_multi_networking: true + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) + display_name: "kubectl-access-network" # Allows your machine run kubectl command. 
+ outputs: [instructions] - id: a3-megagpu_pool source: modules/compute/gke-node-pool - use: [existing-gke-cluster, gpunets] + use: [gke_cluster, gpunets] settings: machine_type: a3-megagpu-8g guest_accelerator: @@ -77,18 +69,23 @@ deployment_groups: count: 8 gpu_driver_installation_config: - gpu_driver_version: LATEST - static_node_count: 2 - disk_type: pd-ssd + disk_type: pd-balanced + autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] - local_ssd_count_ephemeral_storage: 16 + +# We need to do the following here: +# 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +# 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +# 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +# 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +# 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests - id: job-template source: modules/compute/gke-job-template use: [a3-megagpu_pool] settings: - image: busybox + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 command: - - echo - - Hello World - node_count: 3 + - nvidia-smi + node_count: 2 outputs: [instructions] diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index dc54cb67c3..0fe3f291a4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -73,9 +73,10 @@ use-cases). In this case, ensure that you turn off the [enable_secure_boot](#input\_enable\_secure\_boot) option to allow unsigned kernel modules to be loaded. -To maximize GPU network bandwidth, nodepools accept multiple VPCs. Pass a multivpc module to gke-node-pool module, and [install GPUDirect and configure NCCL] (https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl). -> **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. -> To create network objects in gke cluster pass, the multivpc module to a pre-existing-gke-cluster module as well or [apply a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment). +To maximize GPU network bandwidth, nodepools accept multiple VPCs. Pass a multivpc module to gke-node-pool module, and [take these steps] (https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl) to install GPUDirect, configure NCCL, use recommended settings, and add GPUDirect to your pods. + +> **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. Passing the multivpc module to a gke-cluster module enables multi networking on the cluster creation. +> Passing the multivpc module to a gke-cluster or pre-existing-gke-cluster module creates required network objects on the cluster for multi networking. 
You can do so by [applying a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment), too. ### GPUs Examples @@ -172,7 +173,7 @@ The following is an example of using a GPU (with sharing config) attached to an gpu_sharing_strategy: "TIME_SHARING" ``` -Finally, the following is adding multivpc to a node pool (with sharing config) attached to an `n1` machine: +Finally, the following is adding multivpc to a node pool: ```yaml - id: network @@ -186,15 +187,15 @@ Finally, the following is adding multivpc to a node pool (with sharing config) a global_ip_address_range: 172.16.0.0/12 subnetwork_cidr_suffix: 16 - - id: existing-gke-cluster ## multinetworking must be enabled in advance when cluster creation - source: modules/scheduler/pre-existing-gke-cluster - use: [multinetwork] + - id: gke-cluster + source: modules/scheduler/gke-cluster + use: [network, multinetwork] settings: cluster_name: $(vars.deployment_name) - id: a3-megagpu_pool source: modules/compute/gke-node-pool - use: [existing-gke-cluster, multinetwork] + use: [gke-cluster, multinetwork] settings: machine_type: a3-megagpu-8g guest_accelerator: diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index d644488e62..53efcbe88d 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -104,12 +104,18 @@ resource "google_container_node_pool" "node_pool" { } } - ephemeral_storage_local_ssd_config { - local_ssd_count = var.local_ssd_count_ephemeral_storage + dynamic "ephemeral_storage_local_ssd_config" { + for_each = var.local_ssd_count_ephemeral_storage > 0 ? [1] : [] + content { + local_ssd_count = var.local_ssd_count_ephemeral_storage + } } - local_nvme_ssd_block_config { - local_ssd_count = var.local_ssd_count_nvme_block + dynamic "local_nvme_ssd_block_config" { + for_each = var.local_ssd_count_nvme_block > 0 ? [1] : [] + content { + local_ssd_count = var.local_ssd_count_nvme_block + } } shielded_instance_config { diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 6b3dbfd5ef..4f1e97246d 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -42,13 +42,28 @@ are created in the VPC module. By default the `gke-cluster` module will look for ranges with the names `pods` and `services`. These names can be configured using the `pods_ip_range_name` and `services_ip_range_name` settings. -If you are going to add multiple networks to the cluster, you need to +### Multi-networking -1. Set `enable_multi_networking` to true when creating the cluster (enables Dataplane V2, too). -2. Add a second deployment group to your blueprint and continue the below in the second deployment group. -3. Add a multivpc module. -4. Add a pre-existing-gke-cluster module and pass the multivpc to the cluster. -5. Add a gke-node-pool module and pass the multivpc to the nodepool, too. +To create network objects in GKE cluster, you can pass a multivpc module to gke-cluster module instead of [applying a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment). Passing a multivpc module enables multi networking and [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en) on the cluster. 
+ +```yaml + - id: network + source: modules/network/vpc + + - id: multinetwork + source: modules/network/multivpc + settings: + network_name_prefix: multivpc-net + network_count: 8 + global_ip_address_range: 172.16.0.0/12 + subnetwork_cidr_suffix: 16 + + - id: gke-cluster + source: modules/scheduler/gke-cluster + use: [network, multinetwork] ## enables multi networking and Dataplane V2 on cluster + settings: + cluster_name: $(vars.deployment_name) +``` Find an example of multi networking in GKE [here](../../../examples/gke-multi-vpc-a3-megagpu-8g.yaml). @@ -86,6 +101,7 @@ limitations under the License. | [terraform](#requirement\_terraform) | >= 1.0 | | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | +| [kubectl](#requirement\_kubectl) | >= 1.7.0 | ## Providers @@ -93,6 +109,7 @@ limitations under the License. |------|---------| | [google](#provider\_google) | > 5.0 | | [google-beta](#provider\_google-beta) | > 5.0 | +| [kubectl](#provider\_kubectl) | >= 1.7.0 | ## Modules @@ -112,12 +129,16 @@ limitations under the License. | [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [kubectl_manifest.additional_net_params](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.additional_nets](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | @@ -126,7 +147,7 @@ limitations under the License. | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. | `bool` | `false` | no | | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | -| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster). This setting is immutable on clusters and enables Dataplane V2. | `bool` | `false` | no | +| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). | `bool` | `false` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 00ef26359b..9af5ff8711 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -31,8 +31,11 @@ locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email - #multi networking needs Dataplane v2 enabled - derived_enable_dataplane_v2 = var.enable_multi_networking ? true : var.enable_dataplane_v2 + # additional VPCs enable multi networking + derived_enable_multi_networking = length(var.additional_networks) > 0 ? true : var.enable_multi_networking + + # multi networking needs enabled Dataplane v2 + derived_enable_dataplane_v2 = local.derived_enable_multi_networking ? 
true : var.enable_dataplane_v2 } data "google_compute_default_service_account" "default_sa" { @@ -90,7 +93,7 @@ resource "google_container_cluster" "gke_cluster" { datapath_provider = local.derived_enable_dataplane_v2 ? "ADVANCED_DATAPATH" : "LEGACY_DATAPATH" - enable_multi_networking = var.enable_multi_networking + enable_multi_networking = local.derived_enable_multi_networking networking_mode = "VPC_NATIVE" @@ -314,3 +317,52 @@ module "workload_identity" { google_container_cluster.gke_cluster ] } + +data "google_client_config" "default" {} + +provider "kubectl" { + host = "https://${google_container_cluster.gke_cluster.endpoint}" + cluster_ca_certificate = base64decode(google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) + token = data.google_client_config.default.access_token + load_config_file = false +} + +resource "kubectl_manifest" "additional_net_params" { + for_each = { for idx, network_info in var.additional_networks : idx => network_info } + + depends_on = [google_container_cluster.gke_cluster] + + yaml_body = < network_info } + + depends_on = [google_container_cluster.gke_cluster, kubectl_manifest.additional_net_params] + + yaml_body = < [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. Providing additional networks creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no |
+| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks creates relevant network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [cluster\_name](#input\_cluster\_name) | Name of the existing cluster | `string` | n/a | yes | | [project\_id](#input\_project\_id) | Project that hosts the existing cluster | `string` | n/a | yes | | [region](#input\_region) | Region in which to search for the cluster | `string` | n/a | yes | diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index bbbeb9f8b7..755a6164bf 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -24,7 +24,7 @@ data "google_client_config" "default" {} provider "kubernetes" { alias = "gke_cluster" - host = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" #"https://34.27.120.195" + host = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" token = data.google_client_config.default.access_token cluster_ca_certificate = base64decode(data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate) } diff --git a/modules/scheduler/pre-existing-gke-cluster/variables.tf b/modules/scheduler/pre-existing-gke-cluster/variables.tf index af7d39d8f4..67e7a24dca 100644 --- a/modules/scheduler/pre-existing-gke-cluster/variables.tf +++ b/modules/scheduler/pre-existing-gke-cluster/variables.tf @@ -30,7 +30,7 @@ variable "region" { } variable "additional_networks" { - description = "Additional network interface details for GCE, if any. Providing additional networks creates relevat network objects on the cluster." + description = "Additional network interface details for GKE, if any. Providing additional networks creates relevat network objects on the cluster." default = [] type = list(object({ network = string From f7fd9f707850e0343436e8fe5e595286327952e2 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Sun, 28 Jul 2024 19:41:10 +0000 Subject: [PATCH 071/180] a variable description corrected --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 0fe3f291a4..fee5355b6d 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -259,7 +259,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 4f9374e0c6..599cbb5d08 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -266,7 +266,7 @@ variable "service_account" { } } variable "additional_networks" { - description = "Additional network interface details for GCE, if any. Providing additional networks adds additional node networks to the node pool" + description = "Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool" default = [] type = list(object({ network = string From 08022c84cb673c18dd2759d55c9ef4313cb0b490 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 8 Aug 2024 15:58:03 +0000 Subject: [PATCH 072/180] Comments resolved. gke-a3-highgpu example added. --- .../htcondor-execute-point/gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + .../gpu_definition.tf | 1 + examples/README.md | 31 ++++++- examples/gke-a3-highgpu.yaml | 86 +++++++++++++++++++ ...a3-megagpu-8g.yaml => gke-a3-megagpu.yaml} | 15 ++-- modules/compute/gke-node-pool/README.md | 9 +- .../compute/gke-node-pool/gpu_definition.tf | 1 + modules/compute/gke-node-pool/main.tf | 6 +- modules/compute/gke-node-pool/variables.tf | 8 +- modules/compute/vm-instance/gpu_definition.tf | 1 + modules/scheduler/gke-cluster/README.md | 16 +++- modules/scheduler/gke-cluster/main.tf | 6 +- modules/scheduler/gke-cluster/variables.tf | 8 +- modules/scheduler/gke-cluster/versions.tf | 2 +- .../pre-existing-gke-cluster/README.md | 10 +-- .../pre-existing-gke-cluster/main.tf | 63 +++++++------- .../pre-existing-gke-cluster/versions.tf | 8 +- 23 files changed, 199 insertions(+), 79 deletions(-) create mode 100644 examples/gke-a3-highgpu.yaml rename examples/{gke-multi-vpc-a3-megagpu-8g.yaml => gke-a3-megagpu.yaml} (88%) diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ 
b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = 
"nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/examples/README.md b/examples/README.md index cd3f64186f..7efdc91d82 100644 --- a/examples/README.md +++ b/examples/README.md @@ -52,7 +52,8 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-gke.yaml](#hpc-gkeyaml--) ![community-badge] ![experimental-badge] * [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge] * [storage-gke](#storage-gkeyaml--) ![community-badge] ![experimental-badge] - * [multivpc-gke](#multivpc-gkeyaml--) ![community-badge] ![experimental-badge] + * [gke-a3-megagpu](#gke-a3-megagpuyaml--) ![community-badge] ![experimental-badge] + * [gke-a3-highgpu](#gke-a3-highgpuyaml--) ![community-badge] ![experimental-badge] * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml--) ![community-badge] ![experimental-badge] * [htc-slurm.yaml](#htc-slurmyaml-) ![community-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] @@ -1536,9 +1537,9 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml -### [gke-multi-vpc-a3-megagpu-8g.yaml] ![community-badge] ![experimental-badge] +### [gke-a3-megagpu.yaml] ![community-badge] ![experimental-badge] -This blueprint shows how to use multiple VPC networks with GKE in the toolkit. +This blueprint shows how to provision a GKE cluster with A3 Megagpu machines in the toolkit. 
After provisioning the cluster and the nodepool, we need to do the following: 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl @@ -1556,7 +1557,29 @@ After provisioning the cluster and the nodepool, we need to do the following: > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. -[gke-multi-vpc-a3-megagpu-8g.yaml]: ../examples/gke-multi-vpc-a3-megagpu-8g.yaml +[gke-a3-megagpu.yaml]: ../examples/gke-a3-megagpu.yaml + +### [gke-a3-highgpu.yaml] ![community-badge] ![experimental-badge] + +This blueprint shows how to provision a GKE cluster with A3 Highgpu machines in the toolkit. + +After provisioning the cluster and the nodepool, we need to do the following: +1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests + +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `gke-cluster` module needs access to the Kubernetes API server +> to apply a manifest. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. + +[gke-a3-highgpu.yaml]: ../examples/gke-a3-highgpu.yaml ### [htc-htcondor.yaml] ![community-badge] ![experimental-badge] diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml new file mode 100644 index 0000000000..0ff875cdb3 --- /dev/null +++ b/examples/gke-a3-highgpu.yaml @@ -0,0 +1,86 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: gke-a3-highgpu + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-a3-highgpu + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
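+  # You can use a service such as https://whatismyip.com to determine your IP address.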
+ authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 4 + subnetwork_cidr_suffix: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + enable_multi_networking: true + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. + display_name: "kubectl-access-network" + outputs: [instructions] + + - id: a3-highgpu_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets] + settings: + machine_type: a3-highgpu-8g + disk_type: pd-balanced + autoscaling_total_min_nodes: 2 + zones: [$(vars.zone)] + +# We need to do the following here: +# 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +# 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +# 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +# 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +# 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests + + - id: job-template + source: modules/compute/gke-job-template + use: [a3-highgpu_pool] + settings: + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 + command: + - nvidia-smi + node_count: 2 + outputs: [instructions] diff --git a/examples/gke-multi-vpc-a3-megagpu-8g.yaml b/examples/gke-a3-megagpu.yaml similarity index 88% rename from examples/gke-multi-vpc-a3-megagpu-8g.yaml rename to examples/gke-a3-megagpu.yaml index e15a7776ae..fb51cc0933 100644 --- a/examples/gke-multi-vpc-a3-megagpu-8g.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,11 +14,11 @@ --- -blueprint_name: gke-multi-vpc-a3-megagpu-8g +blueprint_name: gke-a3-mega vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gkemvpc + deployment_name: gke-a3-mega region: us-central1 zone: us-central1-c @@ -55,8 +55,8 @@ deployment_groups: enable_private_endpoint: false # Allows for access from authorized public IPs enable_multi_networking: true master_authorized_networks: - - cidr_block: $(vars.authorized_cidr) - display_name: "kubectl-access-network" # Allows your machine run kubectl command. + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. 
+ display_name: "kubectl-access-network" outputs: [instructions] - id: a3-megagpu_pool @@ -64,11 +64,6 @@ deployment_groups: use: [gke_cluster, gpunets] settings: machine_type: a3-megagpu-8g - guest_accelerator: - - type: nvidia-h100-mega-80gb - count: 8 - gpu_driver_installation_config: - - gpu_driver_version: LATEST disk_type: pd-balanced autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fee5355b6d..458f2b8e9b 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -198,11 +198,6 @@ Finally, the following is adding multivpc to a node pool: use: [gke-cluster, multinetwork] settings: machine_type: a3-megagpu-8g - guest_accelerator: - - type: nvidia-h100-mega-80gb - count: 8 - gpu_driver_installation_config: - - gpu_driver_version: LATEST ... ``` @@ -273,8 +268,8 @@ No modules. | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `0` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `0` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ b/modules/compute/gke-node-pool/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 53efcbe88d..5f6b02f9be 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -105,14 +105,14 @@ resource "google_container_node_pool" "node_pool" { } dynamic "ephemeral_storage_local_ssd_config" { - for_each = var.local_ssd_count_ephemeral_storage > 0 ? [1] : [] + for_each = var.local_ssd_count_ephemeral_storage != null ? [1] : [] content { local_ssd_count = var.local_ssd_count_ephemeral_storage } } dynamic "local_nvme_ssd_block_config" { - for_each = var.local_ssd_count_nvme_block > 0 ? [1] : [] + for_each = var.local_ssd_count_nvme_block != null ? [1] : [] content { local_ssd_count = var.local_ssd_count_nvme_block } @@ -183,7 +183,7 @@ resource "google_container_node_pool" "node_pool" { error_message = "static_node_count cannot be set with either autoscaling_total_min_nodes or autoscaling_total_max_nodes." } precondition { - condition = !(var.local_ssd_count_ephemeral_storage > 0 && var.local_ssd_count_nvme_block > 0) + condition = !(coalesce(var.local_ssd_count_ephemeral_storage, 0) > 0 && coalesce(var.local_ssd_count_nvme_block, 0) > 0) error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value." } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 599cbb5d08..9d3dd5d9e5 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -96,7 +96,11 @@ variable "local_ssd_count_ephemeral_storage" { [See above](#local-ssd-storage) for more info. EOT type = number - default = 0 + default = null + validation { + condition = !(var.local_ssd_count_ephemeral_storage != null && coalesce(var.local_ssd_count_ephemeral_storage, 0) > 0 && var.local_ssd_count_nvme_block != null && coalesce(var.local_ssd_count_nvme_block, 0) > 0) + error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set." 
+ } } variable "local_ssd_count_nvme_block" { @@ -107,7 +111,7 @@ variable "local_ssd_count_nvme_block" { EOT type = number - default = 0 + default = null } diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 4f1e97246d..d075418785 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -49,6 +49,14 @@ To create network objects in GKE cluster, you can pass a multivpc module to gke- ```yaml - id: network source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 - id: multinetwork source: modules/network/multivpc @@ -65,7 +73,7 @@ To create network objects in GKE cluster, you can pass a multivpc module to gke- cluster_name: $(vars.deployment_name) ``` -Find an example of multi networking in GKE [here](../../../examples/gke-multi-vpc-a3-megagpu-8g.yaml). +Find an example of multi networking in GKE [here](../../../examples/gke-a3-megagpu.yaml). ### Cluster Limitations @@ -98,7 +106,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0 | +| [terraform](#requirement\_terraform) | >= 1.9 | | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | | [kubectl](#requirement\_kubectl) | >= 1.7.0 | @@ -143,11 +151,11 @@ limitations under the License. | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. Used in the GKE cluster name by default and can be configured with `prefix_with_deployment_name`. | `string` | n/a | yes | -| [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. | `bool` | `false` | no | +| [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. | `bool` | `null` | no | | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. 
| `bool` | `false` | no | | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | -| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). | `bool` | `false` | no | +| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). | `bool` | `false` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 9af5ff8711..b79f0d7338 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -336,7 +336,7 @@ resource "kubectl_manifest" "additional_net_params" { apiVersion: networking.gke.io/v1 kind: GKENetworkParamSet metadata: - name: additional-network-${each.key} + name: vpc${each.key + 1} spec: vpc: ${each.value.network} vpcSubnet: ${each.value.subnetwork} @@ -355,12 +355,12 @@ resource "kubectl_manifest" "additional_nets" { apiVersion: networking.gke.io/v1 kind: Network metadata: - name: additional-network-${each.key} + name: vpc${each.key + 1} spec: parametersRef: group: networking.gke.io kind: GKENetworkParamSet - name: additional-network-${each.key} + name: vpc${each.key + 1} type: Device YAML diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 984a3099b9..352adccfd2 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -262,7 +262,7 @@ variable "authenticator_security_group" { variable "enable_dataplane_v2" { description = "Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters." type = bool - default = false + default = null } variable "labels" { @@ -298,9 +298,13 @@ variable "service_account" { } variable "enable_multi_networking" { - description = "Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster). 
This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en)." + description = "Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en)." type = bool default = false + validation { + condition = (var.enable_dataplane_v2 == null || coalesce(var.enable_dataplane_v2, false)) || !coalesce(var.enable_multi_networking, true) + error_message = "'enable_dataplane_v2' must be null or true when enabling multi networking." + } } variable "additional_networks" { diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 87b491767e..5fbe728e23 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -13,7 +13,7 @@ # limitations under the License. terraform { - required_version = ">= 1.0" + required_version = ">= 1.9" required_providers { google = { diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index d3102579a4..ccd50df286 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -75,16 +75,16 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [terraform](#requirement\_terraform) | >= 1.9 | | [google](#requirement\_google) | > 5.0 | -| [kubernetes](#requirement\_kubernetes) | > 2.23 | +| [kubectl](#requirement\_kubectl) | >= 1.7.0 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | > 5.0 | -| [kubernetes.gke\_cluster](#provider\_kubernetes.gke\_cluster) | > 2.23 | +| [kubectl](#provider\_kubectl) | >= 1.7.0 | ## Modules @@ -94,8 +94,8 @@ No modules. 
| Name | Type | |------|------| -| [kubernetes_manifest.additional_net_params](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | -| [kubernetes_manifest.additional_nets](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.additional_net_params](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.additional_nets](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index 755a6164bf..4a99e90487 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -22,54 +22,49 @@ data "google_container_cluster" "existing_gke_cluster" { data "google_client_config" "default" {} -provider "kubernetes" { - alias = "gke_cluster" +provider "kubectl" { host = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" token = data.google_client_config.default.access_token cluster_ca_certificate = base64decode(data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate) + load_config_file = false } -resource "kubernetes_manifest" "additional_net_params" { +resource "kubectl_manifest" "additional_net_params" { for_each = { for idx, network_info in var.additional_networks : idx => network_info } depends_on = [data.google_container_cluster.existing_gke_cluster] - manifest = { - "apiVersion" = "networking.gke.io/v1" - "kind" = "GKENetworkParamSet" - "metadata" = { - "name" = "additional-network-${each.key}" # Unique name for each GKENetworkParamSet - } - "spec" = { - "vpc" = each.value.network - "vpcSubnet" = each.value.subnetwork - "deviceMode" = "NetDevice" - } - } + yaml_body = < network_info } - depends_on = [data.google_container_cluster.existing_gke_cluster, kubernetes_manifest.additional_net_params] + depends_on = [data.google_container_cluster.existing_gke_cluster, kubectl_manifest.additional_net_params] - manifest = { - "apiVersion" = "networking.gke.io/v1" - "kind" = "Network" - "metadata" = { - "name" = "additional-network-${each.key}" # Unique name for each Network - } - "spec" = { - "parametersRef" = { - "group" = "networking.gke.io" - "kind" = "GKENetworkParamSet" - "name" = "additional-network-${each.key}" # Reference the corresponding param set - } - "type" = "Device" - } - } + yaml_body = < Date: Mon, 12 Aug 2024 13:24:48 +0000 Subject: [PATCH 073/180] NO_RESERVATION to be the default reservation type --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/main.tf | 3 --- modules/compute/gke-node-pool/variables.tf | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index e21a5f141a..e96c4f97dc 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -239,7 +239,7 @@ No modules. 
| [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `null` | no | +| [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 2cf366a7f3..b160eb1984 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -151,9 +151,6 @@ resource "google_container_node_pool" "node_pool" { } } - # TODO(arajmane): Default values for params in this block considering that - # this block need not be passed at all if reservation_affinity is not required - # Or, values of the params key and values are not required when any_reservation is to be used reservation_affinity { consume_reservation_type = var.reservation_type key = var.specific_reservation.key diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 4bec7e3c35..eab92d61e5 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -269,7 +269,7 @@ variable "service_account" { variable "reservation_type" { description = "Type of reservation to consume" type = string - default = null + default = "NO_RESERVATION" validation { condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_type) @@ -289,7 +289,7 @@ variable "specific_reservation" { } validation { - condition = (var.specific_reservation.key == "compute.googleapis.com/reservation-name" && length(var.specific_reservation.values) > 0) || (var.specific_reservation.key == null && var.specific_reservation.values == null) + condition = (var.specific_reservation.key == null && var.specific_reservation.values == null) || (var.specific_reservation.key == "compute.googleapis.com/reservation-name" && var.specific_reservation.values != null) error_message = "Value must be equal to `compute.googleapis.com/reservation-name` when targeting a SPECIFIC_RESERVATION. Otherwise, do not specify the value" } } From f04755ee5aee54d846be3931157aca849f597b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Mon, 12 Aug 2024 14:12:04 +0000 Subject: [PATCH 074/180] Remove additional_networks from template, as it is always when instance is created --- .../modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 8eda16a4e5..d9ce81db36 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -25,7 +25,6 @@ module "slurm_login_template" { name_prefix = each.value.name_prefix additional_disks = each.value.additional_disks - additional_networks = each.value.additional_networks bandwidth_tier = each.value.bandwidth_tier can_ip_forward = each.value.can_ip_forward disable_smt = each.value.disable_smt From 0516e53e9d2be7b7b1b92827d6c26c20205bf4f4 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 12 Aug 2024 15:24:12 +0000 Subject: [PATCH 075/180] variable validations replaced with precondition in gke modules. 
--- examples/README.md | 4 ++-- examples/gke-a3-highgpu.yaml | 1 - examples/gke-a3-megagpu.yaml | 1 - modules/compute/gke-node-pool/README.md | 10 +++++++++- modules/compute/gke-node-pool/variables.tf | 4 ---- modules/scheduler/gke-cluster/README.md | 6 +++--- modules/scheduler/gke-cluster/main.tf | 12 ++++++++++-- modules/scheduler/gke-cluster/variables.tf | 6 +----- modules/scheduler/gke-cluster/versions.tf | 2 +- modules/scheduler/pre-existing-gke-cluster/README.md | 2 +- .../scheduler/pre-existing-gke-cluster/versions.tf | 2 +- 11 files changed, 28 insertions(+), 22 deletions(-) diff --git a/examples/README.md b/examples/README.md index 7efdc91d82..e3a7181d2e 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1539,7 +1539,7 @@ cleaned up when the job is deleted. ### [gke-a3-megagpu.yaml] ![community-badge] ![experimental-badge] -This blueprint shows how to provision a GKE cluster with A3 Megagpu machines in the toolkit. +This blueprint shows how to provision a GKE cluster with A3 Mega machines in the toolkit. After provisioning the cluster and the nodepool, we need to do the following: 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl @@ -1561,7 +1561,7 @@ After provisioning the cluster and the nodepool, we need to do the following: ### [gke-a3-highgpu.yaml] ![community-badge] ![experimental-badge] -This blueprint shows how to provision a GKE cluster with A3 Highgpu machines in the toolkit. +This blueprint shows how to provision a GKE cluster with A3 High machines in the toolkit. After provisioning the cluster and the nodepool, we need to do the following: 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 0ff875cdb3..908b119d8f 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -64,7 +64,6 @@ deployment_groups: use: [gke_cluster, gpunets] settings: machine_type: a3-highgpu-8g - disk_type: pd-balanced autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index fb51cc0933..866d175d0f 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -64,7 +64,6 @@ deployment_groups: use: [gke_cluster, gpunets] settings: machine_type: a3-megagpu-8g - disk_type: pd-balanced autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 458f2b8e9b..373e00c634 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -76,7 +76,7 @@ kernel modules to be loaded. To maximize GPU network bandwidth, nodepools accept multiple VPCs. Pass a multivpc module to gke-node-pool module, and [take these steps] (https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl) to install GPUDirect, configure NCCL, use recommended settings, and add GPUDirect to your pods. > **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. Passing the multivpc module to a gke-cluster module enables multi networking on the cluster creation. 
-> Passing the multivpc module to a gke-cluster or pre-existing-gke-cluster module creates required network objects on the cluster for multi networking. You can do so by [applying a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment), too. +> Passing the multivpc module to a gke-cluster or pre-existing-gke-cluster module [creates required network objects](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment) on the cluster for multi networking. ### GPUs Examples @@ -178,6 +178,14 @@ Finally, the following is adding multivpc to a node pool: ```yaml - id: network source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 - id: multinetwork source: modules/network/multivpc diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 9d3dd5d9e5..4f8f33d330 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -97,10 +97,6 @@ variable "local_ssd_count_ephemeral_storage" { EOT type = number default = null - validation { - condition = !(var.local_ssd_count_ephemeral_storage != null && coalesce(var.local_ssd_count_ephemeral_storage, 0) > 0 && var.local_ssd_count_nvme_block != null && coalesce(var.local_ssd_count_nvme_block, 0) > 0) - error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set." - } } variable "local_ssd_count_nvme_block" { diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index d075418785..71bce049c7 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -44,7 +44,7 @@ the `pods_ip_range_name` and `services_ip_range_name` settings. ### Multi-networking -To create network objects in GKE cluster, you can pass a multivpc module to gke-cluster module instead of [applying a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment). Passing a multivpc module enables multi networking and [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en) on the cluster. +To [enable Multi-networking](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment), pass multivpc module to gke-cluster module as described in example below. Passing a multivpc module enables multi networking and [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en) on the cluster. ```yaml - id: network @@ -106,7 +106,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | | [kubectl](#requirement\_kubectl) | >= 1.7.0 | @@ -155,7 +155,7 @@ limitations under the License. | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. 
| `bool` | `false` | no | | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | -| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). | `bool` | `false` | no | +| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). | `bool` | `null` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index b79f0d7338..99f1dd9c9c 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -32,10 +32,10 @@ locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email # additional VPCs enable multi networking - derived_enable_multi_networking = length(var.additional_networks) > 0 ? true : var.enable_multi_networking + derived_enable_multi_networking = length(var.additional_networks) > 0 ? true : coalesce(var.enable_multi_networking, false) # multi networking needs enabled Dataplane v2 - derived_enable_dataplane_v2 = local.derived_enable_multi_networking ? true : var.enable_dataplane_v2 + derived_enable_dataplane_v2 = local.derived_enable_multi_networking ? true : coalesce(var.enable_dataplane_v2, false) } data "google_compute_default_service_account" "default_sa" { @@ -178,6 +178,14 @@ resource "google_container_cluster" "gke_cluster" { ignore_changes = [ node_config ] + precondition { + condition = !(!coalesce(var.enable_dataplane_v2, true) && (coalesce(var.enable_multi_networking, false) || length(var.additional_networks) > 0)) + error_message = "'enable_dataplane_v2' cannot be false when enabling multi networking." + } + precondition { + condition = !(!coalesce(var.enable_multi_networking, true) && length(var.additional_networks) > 0) + error_message = "'enable_multi_networking' cannot be false when passing multivpc module." 
+ } } logging_service = "logging.googleapis.com/kubernetes" diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 352adccfd2..ab1f465392 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -300,11 +300,7 @@ variable "service_account" { variable "enable_multi_networking" { description = "Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en)." type = bool - default = false - validation { - condition = (var.enable_dataplane_v2 == null || coalesce(var.enable_dataplane_v2, false)) || !coalesce(var.enable_multi_networking, true) - error_message = "'enable_dataplane_v2' must be null or true when enabling multi networking." - } + default = null } variable "additional_networks" { diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 5fbe728e23..47928fc116 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -13,7 +13,7 @@ # limitations under the License. terraform { - required_version = ">= 1.9" + required_version = ">= 1.3" required_providers { google = { diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index ccd50df286..e227450c81 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -75,7 +75,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | > 5.0 | | [kubectl](#requirement\_kubectl) | >= 1.7.0 | diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 24bfd82c9c..52fa5e10e2 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -30,5 +30,5 @@ terraform { module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.37.2" } - required_version = ">= 1.9" + required_version = ">= 1.3" } From 3cb5e2a514b22df5db0b52a71ce06006fa594db4 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 12 Aug 2024 15:49:24 +0000 Subject: [PATCH 076/180] enable_multi_networking removed from gke-a3 blueprints --- examples/gke-a3-highgpu.yaml | 1 - examples/gke-a3-megagpu.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 908b119d8f..0fb28d9098 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -53,7 +53,6 @@ deployment_groups: use: [network1, gpunets] settings: enable_private_endpoint: false # Allows for access from authorized public IPs - enable_multi_networking: true master_authorized_networks: - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. 
display_name: "kubectl-access-network" diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 866d175d0f..0ad89334d9 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -53,7 +53,6 @@ deployment_groups: use: [network1, gpunets] settings: enable_private_endpoint: false # Allows for access from authorized public IPs - enable_multi_networking: true master_authorized_networks: - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. display_name: "kubectl-access-network" From b4478806a03abf50fbbb29e12827c80cf55655ad Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Mon, 12 Aug 2024 19:10:12 +0100 Subject: [PATCH 077/180] OFE update --- community/front-end/ofe/README.md | 2 +- community/front-end/ofe/deploy.sh | 8 +- .../front-end/ofe/docs/developer_guide.md | 10 +- .../roles/c2_daemon/files/ghpcfe_c2daemon.py | 20 ++- .../roles/c2_daemon/tasks/main.yaml | 39 ++--- .../gcs_bucket/webserver/startup.sh | 29 ++-- community/front-end/ofe/tf/README.md | 2 +- community/front-end/ofe/tf/variables.tf | 2 +- .../ghpcfe/cluster_manager/cloud_info.py | 149 ++++++++++++++++++ .../ghpcfe/cluster_manager/clusterinfo.py | 26 ++- .../ghpcfe/cluster_manager/filesystem.py | 2 +- .../website/ghpcfe/cluster_manager/image.py | 2 +- .../front-end/ofe/website/ghpcfe/forms.py | 89 +++++++++++ .../front-end/ofe/website/ghpcfe/models.py | 5 +- .../blueprint/cluster_config.yaml.j2 | 32 ++-- .../blueprint/partition_config.yaml.j2 | 19 +-- .../ghpcfe/templates/cluster/update_form.html | 84 ++++++---- .../ofe/website/ghpcfe/views/clusters.py | 23 +++ community/front-end/ofe/website/nginx.conf | 2 +- .../front-end/ofe/website/website/settings.py | 2 +- 20 files changed, 423 insertions(+), 124 deletions(-) diff --git a/community/front-end/ofe/README.md b/community/front-end/ofe/README.md index 6d9a028122..8bcfb54a7d 100644 --- a/community/front-end/ofe/README.md +++ b/community/front-end/ofe/README.md @@ -15,7 +15,7 @@ steps: * Prepare the client side environment and secure sufficient IAM permissions for the system deployment. * When ready, clone this repository and run the deployment script at - `hpc-toolkit/community/front-end/ofe/deploy.sh` from a client machine or a Cloud + `cluster-toolkit/community/front-end/ofe/deploy.sh` from a client machine or a Cloud Shell. Follow instructions to complete the deployment. The whole process is automated via Terraform and should complete within 15 minutes. * Perform post-deployment configurations. diff --git a/community/front-end/ofe/deploy.sh b/community/front-end/ofe/deploy.sh index 7b9fd09c82..a546d9686d 100755 --- a/community/front-end/ofe/deploy.sh +++ b/community/front-end/ofe/deploy.sh @@ -504,7 +504,7 @@ deploy() { # -- Collect deployment files # # For a tarball deployment, it is important that the 'root' directory is - # named 'hpc-toolkit' as most of the install depends on it. + # named 'cluster-toolkit' as most of the install depends on it. # # Simplest way to ensure this is to build from a temporary copy that # definitely is named correctly. 
@@ -512,7 +512,7 @@ deploy() { if [ "${deployment_mode}" == "tarball" ]; then basedir=$(git rev-parse --show-toplevel) - tdir=/tmp/hpc-toolkit + tdir=/tmp/cluster-toolkit cp -R "${basedir}" ${tdir}/ ( @@ -523,7 +523,7 @@ deploy() { --exclude=.terraform.lock.hcl \ --exclude=tf \ --directory=/tmp \ - ./hpc-toolkit 2>/dev/null + ./cluster-toolkit 2>/dev/null ) rm -rf ${tdir} @@ -562,7 +562,7 @@ TFVARS fi if [ "${deployment_mode}" == "git" ]; then - echo "Will clone hpc-toolkit from github.com/${repo_fork}/hpc-toolkit.git ${repo_branch} branch." + echo "Will clone cluster-toolkit from github.com/${repo_fork}/cluster-toolkit.git ${repo_branch} branch." cat <<-END >>terraform.tfvars repo_fork = "${repo_fork}" diff --git a/community/front-end/ofe/docs/developer_guide.md b/community/front-end/ofe/docs/developer_guide.md index 63abc18a9a..1b6a4202a4 100644 --- a/community/front-end/ofe/docs/developer_guide.md +++ b/community/front-end/ofe/docs/developer_guide.md @@ -148,7 +148,7 @@ The home directory of the *gcluster* account is at `/opt/gcluster`. For a new de #### For cloud resources Run-time data to support creating and managing cloud resources are generated -and stored in the following sub-directories within `hpc-toolkit/frontend` on +and stored in the following sub-directories within `cluster-toolkit/frontend` on the service machine: - `clusters/cluster_\` - holding run-time data for a cluster. `\` here @@ -246,7 +246,7 @@ define the major components: | dir | description | |-----------------------------|-------------| -| `hpc-toolkit/frontend/` | Top level | +| `cluster-toolkit/frontend/` | Top level | | `.../cli/` | client commandline interface | | `.../docs/` | documentation | | `.../infrastructure_files/` | Support files for deploying cloud infrastructure | @@ -344,7 +344,7 @@ not currently support Vertex AI Workbenches. ### Infrastructure files Workbenches are created using a template configuration in -`hpc-toolkit/frontend/infrastructure_files/workbench_tf`. The Terraform +`cluster-toolkit/frontend/infrastructure_files/workbench_tf`. The Terraform template was originally based on the Terraform template provided by the [Google Cloud Platform Rad-Lab git repo](https://github.com/GoogleCloudPlatform/rad-lab) however the configuration diverged during early development. The main reason @@ -353,11 +353,11 @@ specific OSLogin user rather than the generic Jupyter user which would make it impossible to interact properly with any mounted shared storage. The process of creating the workbench files is mostly contained within the file -`hpc-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`. The +`cluster-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`. 
The `copy_terraform()` routine copies files from the `infrastructure_files` directory while the `prepare_terraform_vars()` routine creates a `terraform.tfvars` file within the -`hpc-toolkit/frontend/workbenches/workbench_##` directory to provide the +`cluster-toolkit/frontend/workbenches/workbench_##` directory to provide the following info gathered by the FrontEnd during the workbench creation process: - region diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py index 2a4a144e28..89256e514e 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py @@ -243,11 +243,17 @@ def _slurm_get_job_info(jobid): def _slurm_get_job_state(jobid): - """Returns the job state, or None if job isn't in the queue""" - # N.B - eventually, pyslurm might work with our version of Slurm, - # and this can be changed to something more sane. For now, call squeue - state = _slurm_get_job_info(jobid) - return state.get("job_state", None) if state else None + """Returns the job state, or None if the job isn't in the queue""" + state = _slurm_get_job_info(jobid) # Fetch job info using an external function + job_state = state.get("job_state", None) if state else None # Get the 'job_state' if available + + if job_state and isinstance(job_state, list) and job_state: + logger.info("Slurm returned job %s with state %s", jobid, job_state[0]) # Log the first state if available + return job_state[0] # Return the first element of the state list + else: + logger.info("No valid job state available for job %s", jobid) # Log when no valid state is found + + return None # Return None if there is no job state or it's not a list def _spack_submit_build(app_id, partition, app_name, spec, extra_sbatch=None): @@ -925,12 +931,14 @@ def cb_run_job(message, **kwargs): try: slurm_job_info = _slurm_get_job_info(slurm_jobid) response["job_runtime"] = ( - slurm_job_info["end_time"] - slurm_job_info["start_time"] + slurm_job_info["end_time"]["number"] - slurm_job_info["start_time"]["number"] ) except KeyError: logger.warning( "Job data from SLURM did not include start time and end time" ) + except Exception as E: + logger.error("Unexpected error: %s", E) kpi = job_dir / "kpi.json" if kpi.is_file(): diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml index 6ffe2cf2ae..8a5d8d4724 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml @@ -13,37 +13,30 @@ # limitations under the License. 
--- -- name: Set most recent Python version as default - ansible.builtin.shell: - cmd: | - latest_version=$(ls -1 /usr/bin/python3* | awk -F/ '{print $NF}' | grep -E 'python[0-9]+\.[0-9]+$' | sort -V | tail -1) - alternatives --set python3 /usr/bin/$latest_version - when: ansible_distribution == 'Rocky' +- name: Get default Python interpreter from update-alternatives + ansible.builtin.shell: > + update-alternatives --display python3 | + grep 'link currently points to' | + awk '{print $NF}' + register: default_python + changed_when: false -- name: Install pip3 - ansible.builtin.package: - name: python3-pip - state: present - become: true - when: ansible_distribution == 'Rocky' +- name: Set default Python interpreter for Ansible + ansible.builtin.set_fact: + ansible_python_interpreter: "{{ default_python.stdout }}" -- name: Install setuptools for Python 3.11 - ansible.builtin.command: - cmd: /usr/bin/python3.11 -m ensurepip --upgrade - become: true - when: ansible_distribution == 'Rocky' +- name: Verify Python interpreter + ansible.builtin.command: "{{ ansible_python_interpreter }} --version" + register: python_version -- name: Upgrade PIP3 - ansible.builtin.pip: - executable: pip3 - name: pip - state: forcereinstall +- name: Display Python version + ansible.builtin.debug: + msg: "The Python interpreter version is: {{ python_version.stdout }}" # Can't use the pip action here because we need to explicitly enable # a modern gcc from the dev_env role - name: Install FE C&C Dependencies ansible.builtin.pip: - executable: pip3 name: - requests - pexpect diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index ecce1e3a32..c43eab7380 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -35,7 +35,7 @@ deploy_mode=$(curl --silent --show-error http://metadata/computeMetadata/v1/inst # Exit if deployment already exists to stop startup script running on reboots # -if [[ -d /opt/gcluster/hpc-toolkit ]]; then +if [[ -d /opt/gcluster/cluster-toolkit ]]; then printf "It appears gcluster has already been deployed. 
Exiting...\n" exit 0 fi @@ -48,9 +48,10 @@ printf "####################\n#### Installing required packages\n############### dnf install -y epel-release dnf update -y --security dnf config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo +dnf install -y terraform-1.4.6 dnf install --best -y google-cloud-sdk nano make gcc python38-devel unzip git \ rsync wget nginx bind-utils policycoreutils-python-utils \ - terraform packer supervisor python3-certbot-nginx jq + packer supervisor python3-certbot-nginx jq curl --silent --show-error --location https://github.com/mikefarah/yq/releases/download/v4.13.4/yq_linux_amd64 --output /usr/local/bin/yq chmod +x /usr/local/bin/yq curl --silent --show-error --location https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz --output /tmp/shellcheck.tar.xz @@ -75,7 +76,7 @@ EOL dnf install -y grafana -# Packages for https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute +# Packages for https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute pip3.8 install google-api-python-client \ google-cloud-secret-manager \ google.cloud.pubsub \ @@ -136,7 +137,7 @@ fi useradd -r -m -d /opt/gcluster gcluster if [ "${deploy_mode}" == "git" ]; then - fetch_hpc_toolkit="git clone -b \"${repo_branch}\" https://github.com/${repo_fork}/hpc-toolkit.git" + fetch_hpc_toolkit="git clone -b \"${repo_branch}\" https://github.com/${repo_fork}/cluster-toolkit.git" elif [ "${deploy_mode}" == "tarball" ]; then printf "\n####################\n#### Download web application files\n####################\n" @@ -159,7 +160,7 @@ EOF # Install go version specified in go.mod file # # Note: go.mod doesn't reference minor version so we need to capture the latest -GO_MAJOR_VERSION=$(awk '/^go/ {print $2}' "/opt/gcluster/hpc-toolkit/go.mod") +GO_MAJOR_VERSION=$(awk '/^go/ {print $2}' "/opt/gcluster/cluster-toolkit/go.mod") GO_API_RESPONSE=$(curl --silent "https://go.dev/dl/?mode=json") GO_VERSION=$(echo "$GO_API_RESPONSE" | jq -r --arg major "go$GO_MAJOR_VERSION" '.[] | select(.version | startswith($major)).version' | sort -V | tail -n 1) GO_DOWNLOAD_URL="https://golang.org/dl/${GO_VERSION}.linux-amd64.tar.gz" @@ -171,7 +172,7 @@ rm -rf /usr/local/go && tar -C /usr/local -xzf "/tmp/${GO_VERSION}.linux-amd64.t echo 'export PATH=$PATH:/usr/local/go/bin:~/go/bin' >>/etc/bashrc sudo su - gcluster -c /bin/bash < configuration.yaml @@ -243,7 +244,7 @@ EOL printf "Creating supervisord service..." 
echo "[program:gcluster-uvicorn-background] process_name=%(program_name)s_%(process_num)02d -directory=/opt/gcluster/hpc-toolkit/community/front-end/ofe/website +directory=/opt/gcluster/cluster-toolkit/community/front-end/ofe/website command=/opt/gcluster/django-env/bin/uvicorn website.asgi:application --reload --host 127.0.0.1 --port 8001 autostart=true autorestart=true @@ -261,8 +262,8 @@ After=supervisord.service grafana-server.service [Service] Type=forking -ExecStart=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/hpc-toolkit/community/front-end/ofe/website/nginx.conf -ExecStop=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/hpc-toolkit/community/front-end/ofe/website/nginx.conf -s stop +ExecStart=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/cluster-toolkit/community/front-end/ofe/website/nginx.conf +ExecStop=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/cluster-toolkit/community/front-end/ofe/website/nginx.conf -s stop PIDFile=/opt/gcluster/run/nginx.pid Restart=no @@ -280,7 +281,7 @@ systemctl status gcluster.service # sudo su - gcluster -c /bin/bash <>"${tmpcron}" # .. if something more forceful/complete is needed: - # echo "0 12 * * * /usr/bin/certbot certonly --force-renew --quiet" --nginx --nginx-server-root=/opt/gcluster/hpc-toolkit/community/front-end/ofe/website --cert-name "${SERVER_HOSTNAME}" -m "${DJANGO_EMAIL}" >>"${tmpcron}" + # echo "0 12 * * * /usr/bin/certbot certonly --force-renew --quiet" --nginx --nginx-server-root=/opt/gcluster/cluster-toolkit/community/front-end/ofe/website --cert-name "${SERVER_HOSTNAME}" -m "${DJANGO_EMAIL}" >>"${tmpcron}" crontab -u root "${tmpcron}" rm "${tmpcron}" diff --git a/community/front-end/ofe/tf/README.md b/community/front-end/ofe/tf/README.md index faa58140ae..212979bfc1 100644 --- a/community/front-end/ofe/tf/README.md +++ b/community/front-end/ofe/tf/README.md @@ -61,7 +61,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | GCP Project in which to deploy the HPC Frontend. | `string` | n/a | yes | | [region](#input\_region) | GCP Region for HPC Frontend deployment. | `string` | n/a | yes | | [repo\_branch](#input\_repo\_branch) | git branch to checkout when deploying the HPC Frontend | `string` | `"main"` | no | -| [repo\_fork](#input\_repo\_fork) | GitHub repository name in which to find the hpc-toolkit repo | `string` | `"GoogleCloudPlatform"` | no | +| [repo\_fork](#input\_repo\_fork) | GitHub repository name in which to find the cluster-toolkit repo | `string` | `"GoogleCloudPlatform"` | no | | [server\_instance\_type](#input\_server\_instance\_type) | Instance size to use from HPC Frontend webserver | `string` | `"e2-standard-2"` | no | | [static\_ip](#input\_static\_ip) | Optional pre-configured static IP for HPC Frontend. | `string` | `""` | no | | [subnet](#input\_subnet) | Subnet in which to deploy HPC Frontend. 
| `string` | `""` | no | diff --git a/community/front-end/ofe/tf/variables.tf b/community/front-end/ofe/tf/variables.tf index fec65fd059..be06e2578a 100644 --- a/community/front-end/ofe/tf/variables.tf +++ b/community/front-end/ofe/tf/variables.tf @@ -94,7 +94,7 @@ variable "repo_branch" { variable "repo_fork" { default = "GoogleCloudPlatform" type = string - description = "GitHub repository name in which to find the hpc-toolkit repo" + description = "GitHub repository name in which to find the cluster-toolkit repo" } variable "deployment_key" { diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index b5708ef1f4..402ef39c92 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -83,6 +83,56 @@ def _get_gcp_client(credentials, service="compute", api_version="v1"): ) +def _get_vm_reservations(credentials, zone, ttl_hash=None): + try: + # logger.info(f"Fetching VM reservations for credentials: {credentials}, zone: {zone}") + project, client = _get_gcp_client(credentials) + + req = client.reservations().list(project=project, zone=zone) + resp = req.execute() + + if "items" not in resp: + # logger.info("No reservations found") + return {} + + data = { + reservation["name"]: { + "name": reservation["name"], + "specificReservationRequired": reservation.get("specificReservationRequired", False), + "status": reservation["status"], + "instanceProperties": { + "machineType": reservation + .get("specificReservation", {}) + .get("instanceProperties", {}) + .get("machineType", ""), + "minCpuPlatform": reservation + .get("specificReservation", {}) + .get("instanceProperties", {}) + .get("minCpuPlatform", ""), + "availableCount": int( + reservation + .get("specificReservation", {}) + .get("count", 0) + ) + }, + "shareSettings": reservation.get("shareSettings", {}), + } + for reservation in resp["items"] + } + + # logger.info(f"Reservations data: {data}") + return data + except Exception as e: + logger.error(f"Error fetching VM reservations: {e}") + return {} + +def get_vm_reservations(cloud_provider, credentials, unused_region, zone): + if cloud_provider == "GCP": + return _get_vm_reservations(credentials, zone, ttl_hash=_get_ttl_hash()) + else: + raise Exception(f'Unsupported Cloud Provider "{cloud_provider}"') + + @lru_cache def _get_gcp_disk_types( credentials, zone, ttl_hash=None @@ -116,6 +166,10 @@ def _get_gcp_machine_types( ): # pylint: disable=unused-argument (project, client) = _get_gcp_client(credentials) + # Fetch disk types dynamically + disk_types = _get_gcp_disk_types(credentials, zone, ttl_hash=ttl_hash) + disk_type_names = [disk_type["name"] for disk_type in disk_types] + req = client.machineTypes().list( project=project, zone=zone, filter="isSharedCpu=False" ) @@ -124,6 +178,98 @@ def _get_gcp_machine_types( if "items" not in resp: return [] + invalid_disk_types = { + "c4-": [ + "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", + "pd-extreme", "hyperdisk-ml", "hyperdisk-throughput" + ], + "c3-": [ + "pd-extreme", "pd-standard" + ], + "c3d-": [ + "pd-standard", "pd-extreme", "hyperdisk-extreme" + ], + "n4-": [ + "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", + "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", + "hyperdisk-throughput" + ], + "n2-": [ + "hyperdisk-balanced", "hyperdisk-ml" + ], + "n2d-": [ + "pd-extreme", "hyperdisk-ml", "hyperdisk-balanced", + 
"hyperdisk-extreme" + ], + "n1-": [ + "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", + "hyperdisk-throughput", "hyperdisk-balanced" + ], + "t2d-": [ + "pd-extreme", "local-ssd", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme" + ], + "t2a-": [ + "local-ssd", "pd-extreme", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-throughput" + ], + "e2-": [ + "local-ssd", "pd-extreme", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-throughput" + ], + "z3-": [ + "pd-extreme", "pd-standard", "hyperdisk-balanced", + "hyperdisk-ml" + ], + "h3-": [ + "local-ssd", "pd-standard", "pd-ssd", "pd-extreme", + "hyperdisk-ml", "hyperdisk-extreme" + ], + "c2-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-extreme", + "hyperdisk-ml", "hyperdisk-throughput" + ], + "c2d-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-extreme", + "hyperdisk-ml", "hyperdisk-throughput" + ], + "x4-": [ + "local-ssd", "pd-ssd", "pd-standard", "pd-balanced", + "pd-extreme", "hyperdisk-ml", "hyperdisk-throughput" + ], + "m3-": [ + "hyperdisk-throughput", "hyperdisk-ml", "pd-standard" + ], + "m2-": [ + "local-ssd", "hyperdisk-ml", "hyperdisk-throughput" + ], + "m1-": [ + "hyperdisk-ml", "hyperdisk-throughput" + ], + "n1-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-ml", + "hyperdisk-extreme", "hyperdisk-throughput" + ], + "a3-": [ + "pd-extreme", "pd-standard", + "hyperdisk-balanced" + ], + "a2-": [ + "pd-extreme", "hyperdisk-throughput", + "hyperdisk-balanced", "hyperdisk-extreme" + ], + "g2-": [ + "pd-extreme", "pd-standard", "hyperdisk-balanced", + "hyperdisk-extreme" + ] + } + + def get_invalid_disk_types(machine_type_name): + family = machine_type_name.split("-")[0] + "-" + return invalid_disk_types.get(family, []) + data = { mt["name"]: { "name": mt["name"], @@ -138,6 +284,7 @@ def _get_gcp_machine_types( } for acc in mt.get("accelerators", []) }, + "invalid_disk_types": get_invalid_disk_types(mt["name"]) } for mt in resp["items"] } @@ -174,6 +321,8 @@ def _get_gcp_machine_types( items[0]["description"] ) + # logger.info(data) + return data diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index 516d791701..5868922241 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -55,7 +55,7 @@ class ClusterInfo: def __init__(self, cluster): self.config = utils.load_config() - self.ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" + self.ghpc_path = "/opt/gcluster/cluster-toolkit/ghpc" self.cluster = cluster self.cluster_dir = ( @@ -281,6 +281,18 @@ def _prepare_ghpc_yaml(self): } rendered_yaml = template.render(context) + if self.cluster.controller_node_image is not None: + context["controller_image_yaml"] = f"""instance_image: + family: image-{self.cluster.controller_node_image.family} + project: {self.cluster.project_id} + """ + + if self.cluster.login_node_image is not None: + context["login_image_yaml"] = f"""instance_image: + family: image-{self.cluster.login_node_image.family} + project: {self.cluster.project_id} + """ + with yaml_file.open("w") as f: f.write(rendered_yaml) @@ -369,6 +381,9 @@ def _get_tf_state_resource(self, state, filters): Returns each match """ + print(state["resources"]) + print(filters) + def matches(x): try: for k, v in filters.items(): @@ -381,6 +396,7 @@ def matches(x): return list(filter(matches, state["resources"])) 
def _create_model_instances_from_tf_state(self, state, filters): + print(self._get_tf_state_resource(state, filters)) tf_nodes = self._get_tf_state_resource(state, filters)[0]["instances"] def model_from_tf(tf): @@ -434,14 +450,14 @@ def _get_service_accounts(self, tf_state): # controller & login until we start setting them. filters = { - "module": "module.slurm_controller.module.slurm_controller_instance.module.slurm_controller_instance", #pylint:disable=line-too-long + "module": "module.slurm_controller.module.slurm_controller_instance", #pylint:disable=line-too-long "name": "slurm_instance", } tf_node = self._get_tf_state_resource(tf_state, filters)[0]["instances"][0] #pylint:disable=line-too-long ctrl_sa = tf_node["attributes"]["service_account"][0]["email"] filters = { - "module": "module.slurm_login.module.slurm_login_instance.module.slurm_login_instance", #pylint:disable=line-too-long + "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]', #pylint:disable=line-too-long "name": "slurm_instance", } tf_node = self._get_tf_state_resource(tf_state, filters)[0]["instances"][0] #pylint:disable=line-too-long @@ -518,7 +534,7 @@ def _apply_terraform(self): mgmt_nodes = self._create_model_instances_from_tf_state( state, { - "module": "module.slurm_controller.module.slurm_controller_instance.module.slurm_controller_instance", # pylint: disable=line-too-long + "module": "module.slurm_controller.module.slurm_controller_instance", # pylint: disable=line-too-long "name": "slurm_instance", }, ) @@ -539,7 +555,7 @@ def _apply_terraform(self): login_nodes = self._create_model_instances_from_tf_state( state, { - "module": "module.slurm_login.module.slurm_login_instance.module.slurm_login_instance", # pylint: disable=line-too-long + "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]', # pylint: disable=line-too-long "name": "slurm_instance", }, ) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py index f735107123..aaf460c4c4 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py @@ -90,7 +90,7 @@ def create_filesystem(fs: Filesystem) -> None: def _run_ghpc(target_dir: Path, cred_env: dict) -> None: - ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" + ghpc_path = "/opt/gcluster/cluster-toolkit/ghpc" try: logger.info("Invoking ghpc create") diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py index 363029db9b..27efbd552e 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -33,7 +33,7 @@ class ImageBackend: def __init__(self, image): self.config = utils.load_config() - self.ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" + self.ghpc_path = "/opt/gcluster/cluster-toolkit/ghpc" self.image = image self.image_dir = ( diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index d6db9c4618..dc4d7046d0 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -248,6 +248,7 @@ class Meta: "dynamic_node_count", "static_node_count", "reservation_name", + "exclusive", "enable_placement", "enable_hyperthreads", "enable_tier1_networking", @@ -316,6 +317,94 @@ def 
clean(self): raise ValidationError( "SlurmGCP does not support Placement Groups for selected instance type" # pylint: disable=line-too-long ) + + # schedmd-slurm-gcp-v6-partition/outputs.tf + if cleaned_data["dynamic_node_count"] > 0 and not cleaned_data[ + "exclusive" + ]: + raise ValidationError( + "If any non-static nodesets have enable placement set to true, exclusive must be true." + ) + + if cleaned_data["static_node_count"] > 0 and cleaned_data[ + "exclusive" + ]: + raise ValidationError( + "Can't use static nodes within partition with exclusive set to true." + ) + + # schedmd-slurm-gcp-v6-nodeset/outputs.tf + if cleaned_data["reservation_name"] and cleaned_data[ + "enable_placement" + ]: + raise ValidationError("If a reservation is specified, placement must be false.") + + if cleaned_data["enable_placement"] and cleaned_data[ + "static_node_count" + ] > 0 and cleaned_data[ + "dynamic_node_count" + ] > 0: + raise ValidationError( + "Cannot use placement with static and auto-scaling nodes in the same node set." + ) + + # Reservation validation logic + reservation_name = cleaned_data.get("reservation_name") + if reservation_name: + try: + cluster = cleaned_data.get('cluster') + cloud_credential = cluster.cloud_credential.detail + cloud_zone = cluster.cloud_zone + + # logger.info(f"Cluster: {cluster}") + # logger.info(f"Cloud Credential: {cloud_credential}") + # logger.info(f"Cloud Zone: {cloud_zone}") + + reservations = cloud_info.get_vm_reservations("GCP", cloud_credential, None, cloud_zone) + + if not reservations: + raise ValidationError("No reservations found for the specified zone.") + + matching_reservation = reservations.get(reservation_name) + + if not matching_reservation: + raise ValidationError( + f"Reservation {reservation_name} does not exist in the specified zone." + ) + + if matching_reservation[ + "instanceProperties" + ][ + "machineType" + ] != cleaned_data["machine_type"]: + raise ValidationError( + f"Reservation {reservation_name} does not support the specified machine type. " + f"Machine type: {cleaned_data['machine_type']}." + ) + + total_requested_nodes = cleaned_data["dynamic_node_count"] + cleaned_data["static_node_count"] + available_nodes = matching_reservation.get("instanceProperties", {}).get("availableCount", 0) + + if total_requested_nodes > available_nodes: + raise ValidationError( + f"Reservation {reservation_name} does not have enough available nodes." + f"Requested: {total_requested_nodes}, Available: {available_nodes}" + ) + + specific_reservation = matching_reservation.get("specificReservationRequired") + if specific_reservation == False: + raise ValidationError( + f"You must use a 'specific' reservation type." + f"Please read the following URL for more information about setting up reservations:" + f"https://cloud.google.com/compute/docs/instances/reservations-overview#how-reservations-work" + ) + + except Exception as e: + logger.error(f"Error validating reservation: {reservation_name}. Exception: {e}") + raise ValidationError( + f"Error validating reservation: {reservation_name}. 
Exception: {str(e)}" + ) + return cleaned_data diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 12ea18ff62..075f4587f5 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -604,7 +604,7 @@ class Image(CloudResource): max_length=60, help_text="Enter a source image family", blank=False, - default="schedmd-v5-slurm-22-05-8-rocky-linux-8", + default="slurm-gcp-6-5-hpc-rocky-linux-8", ) startup_script = models.ManyToManyField( @@ -919,6 +919,9 @@ class ClusterPartition(models.Model): enable_hyperthreads = models.BooleanField( default=False, help_text="Enable Hyperthreads (SMT)" ) + exclusive = models.BooleanField( + default=True, help_text="Exclusive job access to nodes." + ) enable_tier1_networking = models.BooleanField( default=False, help_text=( diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 index a22569d024..ac33085f4e 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 @@ -5,9 +5,7 @@ vars: deployment_name: {{ cluster.cloud_id }} region: {{ cluster.cloud_region }} zone: {{ cluster.cloud_zone }} - enable_reconfigure: True - enable_cleanup_compute: False - enable_cleanup_subscriptions: True + enable_cleanup_compute: True enable_bigquery_load: {{ cluster.use_bigquery }} instance_image_custom: True labels: @@ -47,7 +45,7 @@ deployment_groups: {{ cloudsql_yaml | safe }} - - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller kind: terraform id: slurm_controller settings: @@ -61,9 +59,8 @@ deployment_groups: disk_type: {{ cluster.controller_disk_type }} disk_size_gb: {{ cluster.controller_disk_size }} {{ controller_image_yaml | safe }} - service_account: - email: $(hpc_service_account.service_account_email) - scopes: + service_account_email: $(hpc_service_account.service_account_email) + service_account_scopes: - https://www.googleapis.com/auth/cloud-platform - https://www.googleapis.com/auth/monitoring.write - https://www.googleapis.com/auth/logging.write @@ -74,31 +71,30 @@ deployment_groups: echo "******************************************** CALLING CONTROLLER STARTUP" gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_controller.sh - | bash compute_startup_script: | + echo "******************************************** CALLING COMPUTE STARTUP" #!/bin/bash gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_compute.sh - | bash + login_startup_script: | + #!/bin/bash + echo "******************************************** CALLING LOGIN STARTUP" + gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_login.sh - | bash use: + - slurm_login {{ controller_uses | safe }} - - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login + - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login kind: terraform id: slurm_login settings: num_instances: {{ cluster.num_login_nodes }} - subnetwork_self_link: {{ cluster.subnet.cloud_id }} + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.cloud_region }}/subnetworks/{{ cluster.subnet.cloud_id }}" machine_type: {{ cluster.login_node_instance_type }} disk_type: {{ 
cluster.login_node_disk_type }} disk_size_gb: {{ cluster.login_node_disk_size }} {{ login_image_yaml | safe }} - service_account: - email: $(hpc_service_account.service_account_email) - scopes: + service_account_email: $(hpc_service_account.service_account_email) + service_account_scopes: - https://www.googleapis.com/auth/cloud-platform - https://www.googleapis.com/auth/monitoring.write - https://www.googleapis.com/auth/logging.write - https://www.googleapis.com/auth/devstorage.read_write - startup_script: | - #!/bin/bash - echo "******************************************** CALLING LOGIN STARTUP" - gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_login.sh - | bash - use: - - slurm_controller diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index 86ade8151c..9951079cf2 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -1,24 +1,25 @@ -- source: community/modules/compute/schedmd-slurm-gcp-v5-partition +- source: community/modules/compute/schedmd-slurm-gcp-v6-partition kind: terraform id: {{ part_id }} use: - - {{ part_id }}-group -{{ uses_str }} + - {{ part_id }}-nodeset settings: partition_name: {{ part.name }} - subnetwork_self_link: {{ cluster.subnet.cloud_id }} - enable_placement: {{ part.enable_placement }} - exclusive: {{ exclusive }} + exclusive: {{ part.exclusive }} + resume_timeout: 500 -- source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - id: {{ part_id }}-group +- source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + id: {{ part_id }}-nodeset use: +{{ uses_str }} settings: bandwidth_tier: {% if part.enable_tier1_networking %}tier_1_enabled{% else %}platform_default{% endif %} + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.cloud_region }}/subnetworks/{{ cluster.subnet.cloud_id }}" enable_smt: {{ part.enable_hyperthreads }} + enable_placement: {{ part.enable_placement }} machine_type: {{ part.machine_type }} {% if part.reservation_name %} - reservation_name: {{ part.reservation_name }} + reservation_name: "projects/{{ cluster.project_id }}/reservations/{{ part.reservation_name }}" {% endif %} node_count_dynamic_max: {{ part.dynamic_node_count }} node_count_static: {{ part.static_node_count }} diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 8423fbc3ee..391df0d939 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -100,7 +100,7 @@

{{ title }}

Image - + Controller
@@ -132,7 +132,7 @@

{{ title }}

- + Login Nodes
@@ -443,7 +443,7 @@

{{ title }}

             region = cloudRegionInput.options[cloudRegionInput.selectedIndex].text;
         } else {
             zone = cloudZoneInput.value;
-            region = cloudZoneInput.value;
+            region = cloudRegionInput.value;
         }
 
         $.ajax({
@@ -452,40 +452,60 @@

{{ title }}

dataType: "json", headers: { 'X-CSRFToken': $.cookie("csrftoken") } }).done(function (data) { - $(".part_formset_row").each(function () { + $(".part_formset_row, .login_row, .controller_row").each(function () { var formRow = $(this); var machineTypeSelect = formRow.find('.machine_type_select'); var machineType = machineTypeSelect.val(); + var invalidDiskTypes = []; + + // Fetch the invalid disk types for the selected machine type + $.ajax({ + url: "{% url 'api-instancetype-list' %}" + machineType + "/?cluster={{ object.id }}®ion=" + region + "&zone=" + zone, + type: "GET", + dataType: "json", + async: false, // To ensure we get the data before proceeding + headers: { 'X-CSRFToken': $.cookie("csrftoken") } + }).done(function(machineData) { + invalidDiskTypes = machineData.invalid_disk_types || []; + }); + formRow.find(".disk_type_select").each(function (pos, selObj) { - var curVal = selObj.value; - $(selObj).empty(); - - if (machineType && machineType.startsWith('c4-')) { - var option = document.createElement("option"); - option.text = "Hyperdisk Balanced Persistent Disk"; - option.setAttribute("value", "hyperdisk-balanced"); - selObj.appendChild(option); - } else { - var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); - if (additionalDisk.endsWith("additional")) { - $.each(data.disks, function (i, disk_info) { - var option = document.createElement("option"); - option.text = disk_info.description; - option.setAttribute("value", disk_info.name); - selObj.appendChild(option); - }); - } else { - $.each(data.disks, function (i, disk_info) { - if (disk_info.name === 'local-ssd' || disk_info.name.startsWith("pd-")) { - var option = document.createElement("option"); - option.text = disk_info.description; - option.setAttribute("value", disk_info.name); - selObj.appendChild(option); - } - }); - } - } + var curVal = selObj.value; + $(selObj).empty(); + + if (machineType && + (machineType.startsWith('c4-') || + machineType.startsWith('n4-') || + machineType.startsWith('x4-'))) { + var option = document.createElement("option"); + option.text = "Hyperdisk Balanced Persistent Disk"; + option.setAttribute("value", "hyperdisk-balanced"); + selObj.appendChild(option); + } else { + var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); + if (additionalDisk.endsWith("additional")) { + $.each(data.disks, function (i, disk_info) { + if (invalidDiskTypes.indexOf(disk_info.name) === -1) { + var option = document.createElement("option"); + option.text = disk_info.description; + option.setAttribute("value", disk_info.name); + selObj.appendChild(option); + } + }); + } else { + $.each(data.disks, function (i, disk_info) { + if ((disk_info.name === 'local-ssd' || + disk_info.name.startsWith("pd-")) && + invalidDiskTypes.indexOf(disk_info.name) === -1) { + var option = document.createElement("option"); + option.text = disk_info.description; + option.setAttribute("value", disk_info.name); + selObj.appendChild(option); + } + }); + } + } var id_prefix = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); var disk_size_sel = $(selObj).parentsUntil("tbody").find("#" + id_prefix + "_disk_size")[0]; diff --git a/community/front-end/ofe/website/ghpcfe/views/clusters.py b/community/front-end/ofe/website/ghpcfe/views/clusters.py index 733fade339..898601e5ef 100644 --- a/community/front-end/ofe/website/ghpcfe/views/clusters.py +++ b/community/front-end/ofe/website/ghpcfe/views/clusters.py @@ -468,6 +468,7 @@ def form_valid(self, form): parts = partitions.save() try: + 
total_nodes_requested = {} for part in parts: part.vCPU_per_node = machine_info[part.machine_type]["vCPU"] // (1 if part.enable_hyperthreads else 2) cpu_count = machine_info[part.machine_type]["vCPU"] @@ -507,6 +508,28 @@ def form_valid(self, form): raise ValidationError( f"Invalid combination: machine_type {part.machine_type} cannot be used with disk_type {disk_type}." ) + + # Sum the total nodes for each reservation + if part.reservation_name: + if part.reservation_name not in total_nodes_requested: + total_nodes_requested[part.reservation_name] = 0 + total_nodes_requested[part.reservation_name] += part.dynamic_node_count + part.static_node_count + + # Validate total requested nodes against available nodes + for reservation_name, requested_nodes in total_nodes_requested.items(): + reservation = cloud_info.get_vm_reservations( + "GCP", + self.object.cloud_credential.detail, + None, + self.object.cloud_zone + ) + matching_reservation = reservation.get(reservation_name) + available_nodes = int(matching_reservation["instanceProperties"].get("availableCount", 0)) + if requested_nodes > available_nodes: + raise ValidationError(f"Reservation {reservation_name} does not have enough available nodes." + f"Requested: {requested_nodes}, Available: {available_nodes}" + ) + except KeyError as err: raise ValidationError("Error in Partition - invalid machine type: " f"{part.machine_type}") from err diff --git a/community/front-end/ofe/website/nginx.conf b/community/front-end/ofe/website/nginx.conf index 6c4d691f2a..457edc43d5 100644 --- a/community/front-end/ofe/website/nginx.conf +++ b/community/front-end/ofe/website/nginx.conf @@ -40,7 +40,7 @@ http { } location /static/ { - alias ../hpc-toolkit/community/front-end/ofe/website/static/; + alias ../cluster-toolkit/community/front-end/ofe/website/static/; } location / { diff --git a/community/front-end/ofe/website/website/settings.py b/community/front-end/ofe/website/website/settings.py index d0616e6151..5ac2a911ed 100644 --- a/community/front-end/ofe/website/website/settings.py +++ b/community/front-end/ofe/website/website/settings.py @@ -81,7 +81,7 @@ def get_site_name(): # Build paths inside the project like this: BASE_DIR / 'subdir'. 
BASE_DIR = Path(__file__).resolve().parent.parent -MEDIA_ROOT = "/opt/gcluster/hpc-toolkit/community/front-end/ofe/website/startup-scripts:" +MEDIA_ROOT = "/opt/gcluster/cluster-toolkit/community/front-end/ofe/website/startup-scripts:" # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ From 0227b90892304092bf7f154b3c863179343c2b5b Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Mon, 12 Aug 2024 20:03:54 +0100 Subject: [PATCH 078/180] duplication fix --- .../website/ghpcfe/cluster_manager/clusterinfo.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index 3bb56716f2..0f7eff76ba 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -279,19 +279,6 @@ def _prepare_ghpc_yaml(self): "controller_sa": "sa", "startup_bucket": self.config["server"]["gcs_bucket"] } - - if self.cluster.controller_node_image is not None: - context["controller_image_yaml"] = f"""instance_image: - family: image-{self.cluster.controller_node_image.family} - project: {self.cluster.project_id} - """ - - if self.cluster.login_node_image is not None: - context["login_image_yaml"] = f"""instance_image: - family: image-{self.cluster.login_node_image.family} - project: {self.cluster.project_id} - """ - rendered_yaml = template.render(context) if self.cluster.controller_node_image is not None: From 16a4c378e8e5eef8d61342ba2f3e4d5656494543 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Mon, 12 Aug 2024 21:12:33 +0100 Subject: [PATCH 079/180] duplication fix --- .../ofe/website/ghpcfe/cluster_manager/clusterinfo.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index 0f7eff76ba..e01acf1602 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -383,10 +383,6 @@ def _get_tf_state_resource(self, state, filters): print(state["resources"]) print(filters) - - print(state["resources"]) - print(filters) - def matches(x): try: for k, v in filters.items(): From a2949e19fd5a023dcac058548c19f9a27c7fd1bf Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 12 Aug 2024 20:27:57 +0000 Subject: [PATCH 080/180] Copy `script_nodes.py` along slurm binaries (`/usr/local/bin`) --- .../modules/slurm_files/scripts/setup.py | 5 +++++ .../modules/slurm_files/scripts/sort_nodes.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 23d5c2df3c..9754f051bd 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -301,6 +301,11 @@ def configure_dirs(): scripts_log.unlink() scripts_log.symlink_to(dirs.log) + for f in ("sort_nodes.py",): # copy auxiliary scripts + dst = Path(lkp.cfg.slurm_bin_dir) / f + shutil.copyfile(util.scripts_dir / f, dst) + os.chmod(dst, 0o755) + def 
setup_controller(): """Run controller setup""" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py index bfac018830..1747742d1e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py @@ -29,7 +29,7 @@ #SBATCH --ntasks-per-node=8 #SBATCH --nodes=64 -export SLURM_HOSTFILE=$(/slurm/scripts/sort_nodes.py) +export SLURM_HOSTFILE=$(sort_nodes.py) srun -l hostname | sort ``` From 3519b6d54fd3317a57df24add25e61f90ce02c87 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 13 Aug 2024 11:11:46 +0000 Subject: [PATCH 081/180] link changed in examples readme doc --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index e3a7181d2e..b1816c1be8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1564,7 +1564,7 @@ After provisioning the cluster and the nodepool, we need to do the following: This blueprint shows how to provision a GKE cluster with A3 High machines in the toolkit. After provisioning the cluster and the nodepool, we need to do the following: -1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#gpudirect-tcpx_2 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl From 94748157d2b802abb6b4d6a1a5b9233ba08d487b Mon Sep 17 00:00:00 2001 From: annuay-google Date: Tue, 13 Aug 2024 16:55:40 +0530 Subject: [PATCH 082/180] Remove redundant pip install for pytest --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index b9e4ef089a..02bf23f7cd 100644 --- a/Makefile +++ b/Makefile @@ -71,7 +71,6 @@ install-dev-deps: warn-terraform-version warn-packer-version check-pre-commit ch go install honnef.co/go/tools/cmd/staticcheck@latest pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt - pip install pytest # RULES SUPPORTING THE ABOVE From 475c2ff702e4acd885c0d146e6518f403bc1a4b0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 13 Aug 2024 17:22:22 +0000 Subject: [PATCH 083/180] SlurmGCP. Minor refactoring. 
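
The status handlers in slurmsync are now dispatched by indexing the handler
table directly (an unhandled status raises KeyError instead of calling None),
and util gains a machine_type_family() helper used by the socket/SMT lookups.
A rough standalone sketch of that family-based lookup, with simplified
signatures rather than the real template objects:

```python
# Simplified sketch of the family-based lookup (not the real util module).

def machine_type_family(mt: str) -> str:
    """'c2d-standard-112' -> 'c2d' (N1 custom machine types are not handled)."""
    return mt.split("-")[0]


def machine_type_sockets(machine_type: str, guest_cpus: int) -> int:
    """Best-effort socket count keyed on machine family, defaulting to 1 socket."""
    family = machine_type_family(machine_type)
    if family == "c2d":
        return 2 if guest_cpus > 56 else 1
    return {"h3": 2, "a3": 2}.get(family, 1)


assert machine_type_family("h3-standard-88") == "h3"
assert machine_type_sockets("c2d-standard-112", guest_cpus=112) == 2
assert machine_type_sockets("n2-standard-8", guest_cpus=8) == 1
```
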
--- .../modules/slurm_files/scripts/slurmsync.py | 26 +++-- .../modules/slurm_files/scripts/util.py | 95 +++++++------------ 2 files changed, 46 insertions(+), 75 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index e648d6b80c..fc97c3b94a 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -289,21 +289,17 @@ def nodes_unknown(): inst = lkp.instance(first) log.error(f"{first} state: {state}, instance status:{inst.status}") - update = dict.get( - { - NodeStatus.orphan: nodes_delete, - NodeStatus.power_down: nodes_power_down, - NodeStatus.preempted: lambda: (nodes_down(), nodes_restart()), - NodeStatus.restore: nodes_idle, - NodeStatus.resume: nodes_resume, - NodeStatus.terminated: nodes_down, - NodeStatus.unbacked: nodes_down, - NodeStatus.unchanged: lambda: None, - NodeStatus.unknown: nodes_unknown, - }, - status, - ) - update() + { + NodeStatus.orphan: nodes_delete, + NodeStatus.power_down: nodes_power_down, + NodeStatus.preempted: lambda: (nodes_down(), nodes_restart()), + NodeStatus.restore: nodes_idle, + NodeStatus.resume: nodes_resume, + NodeStatus.terminated: nodes_down, + NodeStatus.unbacked: nodes_down, + NodeStatus.unchanged: lambda: None, + NodeStatus.unknown: nodes_unknown, + }[status]() def delete_placement_groups(placement_groups): diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index a7ffcac378..9e33104f4e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1078,55 +1078,39 @@ def get_insert_operations(group_ids): return get_filtered_operations(" AND ".join(f"({f})" for f in filters if f)) -def machine_type_sockets(template): - pattern = re.compile("^(?P[^-]+)") - m = pattern.match(template.machineType) - if not m: - raise Exception(f"template {template} does not match expected regex") - family = m.group("family") - guestCpus: int = int(template.machine_info.guestCpus) - socket_count = dict.get( - { - "h3": 2, - "c2d": 2 if guestCpus > 56 else 1, - "a3": 2, - }, - family, - 1, # assume 1 socket for all other families - ) - return socket_count +def machine_type_family(mt: str) -> str: + """get machine type family from machine type""" + # TODO: doesn't work with N1 custom machine types + # See https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#create + return mt.split("-")[0] -def isSmt(template): - machineType: str = template.machineType +def machine_type_sockets(template) -> int: guestCpus: int = int(template.machine_info.guestCpus) + return { + "h3": 2, + "c2d": 2 if guestCpus > 56 else 1, + "a3": 2, + }.get( + machine_type_family(template.machineType), + 1, # assume 1 socket for all other families + ) - pattern = re.compile("^(?P[^-]+)") - matches = pattern.match(machineType) - machineTypeFamily: str = matches["family"] +def isSmt(template) -> bool: # https://cloud.google.com/compute/docs/cpu-platforms - noSmtFamily = [ - "t2a", - "t2d", - "h3", - ] - if machineTypeFamily 
in noSmtFamily: + noSmtFamily = ("t2a", "t2d", "h3",) + if machine_type_family(template.machineType) in noSmtFamily: return False - elif guestCpus == 1: + if template.machine_info.guestCpus == 1: return False return True -def getThreadsPerCore(template): - threadsPerCore: int = template.advancedMachineFeatures.threadsPerCore - +def getThreadsPerCore(template) -> int: if not isSmt(template): return 1 - elif threadsPerCore: - return threadsPerCore - else: - return 2 + return template.advancedMachineFeatures.threadsPerCore or 2 @retry( @@ -1709,20 +1693,17 @@ def reservation(self, name: str, zone: str) -> object: ) @lru_cache(maxsize=1) - def machine_types(self, project=None): - project = project or self.project + def machine_types(self): field_names = "name,zone,guestCpus,memoryMb,accelerators" fields = f"items.zones.machineTypes({field_names}),nextPageToken" machines = defaultdict(dict) act = self.compute.machineTypes() - op = act.aggregatedList(project=project, fields=fields) + op = act.aggregatedList(project=self.project, fields=fields) while op is not None: result = ensure_execute(op) machine_iter = chain.from_iterable( - m["machineTypes"] - for m in result["items"].values() - if "machineTypes" in m + scope.get("machineTypes", []) for scope in result["items"].values() ) for machine in machine_iter: name = machine["name"] @@ -1732,20 +1713,13 @@ def machine_types(self, project=None): op = act.aggregatedList_next(op, result) return machines - def machine_type(self, machine_type, project=None, zone=None): + def machine_type(self, machine_type: str): """ """ custom_patt = re.compile( r"((?P\w+)-)?custom-(?P\d+)-(?P\d+)" ) custom_match = custom_patt.match(machine_type) - if zone: - project = project or self.project - machine_info = ensure_execute( - self.compute.machineTypes().get( - project=project, zone=zone, machineType=machine_type - ) - ) - elif custom_match is not None: + if custom_match is not None: groups = custom_match.groupdict() cpus, mem = (groups[k] for k in ["cpus", "mem"]) machine_info = { @@ -1753,18 +1727,20 @@ def machine_type(self, machine_type, project=None, zone=None): "memoryMb": int(mem), } else: - machines = self.machine_types(project=project) - machine_info = next(iter(machines[machine_type].values()), None) - if machine_info is None: + machines = self.machine_types() + if machine_type not in machines: raise Exception(f"machine type {machine_type} not found") + per_zone = machines[machine_type] + assert per_zone + machine_info = next(iter(per_zone.values())) # pick the first/any zone return NSDict(machine_info) - def template_machine_conf(self, template_link, project=None, zone=None): + def template_machine_conf(self, template_link): template = self.template_info(template_link) if not template.machineType: temp_name = trim_self_link(template_link) raise Exception(f"instance template {temp_name} has no machine type") - template.machine_info = self.machine_type(template.machineType, zone=zone) + template.machine_info = self.machine_type(template.machineType) machine = template.machine_info machine_conf = NSDict() @@ -1810,8 +1786,7 @@ def template_cache(self, writeback=False): cache.close() @lru_cache(maxsize=None) - def template_info(self, template_link, project=None): - project = project or self.project + def template_info(self, template_link): template_name = trim_self_link(template_link) # split read and write access to minimize write-lock. This might be a # bit slower? 
TODO measure @@ -1822,7 +1797,7 @@ def template_info(self, template_link, project=None): template = ensure_execute( self.compute.instanceTemplates().get( - project=project, instanceTemplate=template_name + project=self.project, instanceTemplate=template_name ) ).get("properties") template = NSDict(template) @@ -1833,7 +1808,7 @@ def template_info(self, template_link, project=None): # del template.metadata # translate gpus into an easier-to-read format - machine_info = self.machine_type(template.machineType, project=project) + machine_info = self.machine_type(template.machineType) if machine_info.accelerators: template.gpu_type = machine_info.accelerators[0].guestAcceleratorType template.gpu_count = machine_info.accelerators[0].guestAcceleratorCount From fe8fb7bbbb9d06f66f7823889736e137bea168c3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 12 Aug 2024 20:10:40 +0000 Subject: [PATCH 084/180] SlurmGCP. Stop using global mutable `lkp` --- .../slurm_files/scripts/get_tpu_vmcount.py | 6 +- .../modules/slurm_files/scripts/load_bq.py | 16 ++-- .../modules/slurm_files/scripts/resume.py | 68 +++++++-------- .../modules/slurm_files/scripts/setup.py | 52 ++++++------ .../scripts/setup_network_storage.py | 38 ++++----- .../modules/slurm_files/scripts/slurmsync.py | 80 +++++++++--------- .../modules/slurm_files/scripts/suspend.py | 22 ++--- .../slurm_files/scripts/tests/test_util.py | 4 +- .../modules/slurm_files/scripts/util.py | 84 +++++++++---------- 9 files changed, 184 insertions(+), 186 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py index 0e6a5074ca..354ec81ad3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py @@ -20,8 +20,8 @@ def get_vmcount_of_tpu_part(part): res = 0 - for ns in util.lkp.cfg.partitions[part].partition_nodeset_tpu: - tpu_obj = util.TPU(util.lkp.cfg.nodeset_tpu[ns]) + for ns in util.lookup().cfg.partitions[part].partition_nodeset_tpu: + tpu_obj = util.TPU(util.lookup().cfg.nodeset_tpu[ns]) if res == 0: res = tpu_obj.vmcount else: @@ -53,7 +53,7 @@ def get_vmcount_of_tpu_part(part): # valid equals to 0 means that we are ok, otherwise it will be set to one of the previously defined exit codes valid = 0 for part in args.partitions.split(","): - if part not in util.lkp.cfg.partitions: + if part not in util.lookup().cfg.partitions: valid = PART_INVALID break else: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index ec52d06ea7..d48f1346a2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -28,7 +28,7 @@ from google.api_core import retry, exceptions import util -from util import lkp, run +from util import lookup, run SACCT = "sacct" @@ -175,14 +175,14 @@ def schema_field(field_name, data_type, description, required=False): Job = namedtuple("Job", job_schema.keys()) client = bq.Client( - project=lkp.cfg.project, + project=lookup().cfg.project, 
credentials=util.default_credentials(), client_options=util.create_client_options(util.ApiEndpoint.BQ), ) -dataset_id = f"{lkp.cfg.slurm_cluster_name}_job_data" -dataset = bq.DatasetReference(project=lkp.project, dataset_id=dataset_id) +dataset_id = f"{lookup().cfg.slurm_cluster_name}_job_data" +dataset = bq.DatasetReference(project=lookup().project, dataset_id=dataset_id) table = bq.Table( - bq.TableReference(dataset, f"jobs_{lkp.cfg.slurm_cluster_name}"), schema_fields + bq.TableReference(dataset, f"jobs_{lookup().cfg.slurm_cluster_name}"), schema_fields ) @@ -197,8 +197,8 @@ def make_job_row(job): if field_name in job } job_row["entry_uuid"] = uuid.uuid4().hex - job_row["cluster_id"] = lkp.cfg.cluster_id - job_row["cluster_name"] = lkp.cfg.slurm_cluster_name + job_row["cluster_id"] = lookup().cfg.cluster_id + job_row["cluster_name"] = lookup().cfg.slurm_cluster_name return job_row @@ -309,7 +309,7 @@ def update_job_idx_cache(jobs, timestamp): def main(): - if not lkp.cfg.enable_bigquery_load: + if not lookup().cfg.enable_bigquery_load: print("bigquery load is not currently enabled") exit(0) init_table() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 4473b73932..e669dd1dca 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -41,7 +41,7 @@ trim_self_link, wait_for_operation, ) -from util import lkp, NSDict, TPU +from util import lookup, NSDict, TPU import slurm_gcp_plugins @@ -61,8 +61,8 @@ def instance_properties(nodeset, model, placement_group, labels=None): props = NSDict() if labels: # merge in extra labels on instance and disks - template = lkp.node_template(model) - template_info = lkp.template_info(template) + template = lookup().node_template(model) + template_info = lookup().template_info(template) props.labels = {**template_info.labels, **labels} @@ -85,7 +85,7 @@ def instance_properties(nodeset, model, placement_group, labels=None): zones = list(nodeset.zone_policy_allow or []) assert len(zones) == 1, "Only single zone is supported if using a reservation" - reservation = lkp.reservation(reservation_name, zones[0]) + reservation = lookup().reservation(reservation_name, zones[0]) props.reservationAffinity = { "consumeReservationType": "SPECIFIC_RESERVATION", @@ -135,10 +135,10 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None # model here indicates any node that can be used to describe the rest model = next(iter(nodes)) - nodeset = lkp.node_nodeset(model) - template = lkp.node_template(model) - region = lkp.node_region(model) - partition = lkp.cfg.partitions[partition_name] + nodeset = lookup().node_nodeset(model) + template = lookup().node_template(model) + region = lookup().node_region(model) + partition = lookup().cfg.partitions[partition_name] log.debug(f"create_instances_request: {model} placement: {placement_group}") body = NSDict() @@ -173,16 +173,16 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None } body.locationPolicy.targetShape = nodeset.zone_target_shape - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.pre_instance_bulk_insert( - lkp=lkp, + lkp=lookup(), nodes=nodes, placement_group=placement_group, request_body=body, ) 
- request = lkp.compute.regionInstances().bulkInsert( - project=lkp.project, region=region, body=body.to_dict() + request = lookup().compute.regionInstances().bulkInsert( + project=lookup().project, region=region, body=body.to_dict() ) if log.isEnabledFor(logging.DEBUG): @@ -228,7 +228,7 @@ def group_nodes_bulk(nodes, resume_data=None): ) jobless_nodes_tpu = [] for jobless_node in jobless_nodes[:]: - if lkp.node_is_tpu(jobless_node): + if lookup().node_is_tpu(jobless_node): jobless_nodes.remove(jobless_node) jobless_nodes_tpu.append(jobless_node) @@ -268,7 +268,7 @@ def group_nodes_bulk(nodes, resume_data=None): for job_id, job in jobs.items() if not job.tpu for placement_group, pg_nodes in job.placement_groups.items() - for prefix, nodes in util.groupby_unsorted(pg_nodes, lkp.node_prefix) + for prefix, nodes in util.groupby_unsorted(pg_nodes, lookup().node_prefix) for i, chunk_nodes in enumerate(chunked(nodes, n=BULK_INSERT_LIMIT)) ] grouped_nodes_tpu = [ @@ -281,8 +281,8 @@ def group_nodes_bulk(nodes, resume_data=None): ) for job_id, job in jobs.items() if job.tpu - for prefix, nodes in util.groupby_unsorted(job.nodes_resume, lkp.node_prefix) - for i, chunk_nodes in enumerate(lkp.chunk_tpu_nodes(list(nodes))) + for prefix, nodes in util.groupby_unsorted(job.nodes_resume, lookup().node_prefix) + for i, chunk_nodes in enumerate(lookup().chunk_tpu_nodes(list(nodes))) ] def group_name(chunk: BulkChunk): @@ -339,7 +339,7 @@ def resume_nodes(nodes: List[str], resume_data=None): if resume_data is None and global_resume_data is not None: resume_data = global_resume_data.deepcopy() - nodes = sorted(nodes, key=lkp.node_prefix) + nodes = sorted(nodes, key=lookup().node_prefix) grouped_nodes, grouped_tpu_nodes = group_nodes_bulk(nodes, resume_data) if log.isEnabledFor(logging.DEBUG): @@ -365,7 +365,7 @@ def resume_nodes(nodes: List[str], resume_data=None): # do not create multiple tpu_objs if nodes with the same prefix are used if chunk.prefix not in tpu_objs.keys(): model = chunk.nodes[0] - tpu_objs[chunk.prefix] = TPU(lkp.node_nodeset(model)) + tpu_objs[chunk.prefix] = TPU(lookup().node_nodeset(model)) tpu_start_data.append({"tpu": tpu_objs[chunk.prefix], "node": chunk.nodes}) @@ -466,8 +466,8 @@ def update_job_comment(nodelist: str, comment: str): if any(map(lambda node: node in nodes, util.to_hostnames(job.nodelist_resume))) ) for job in job_list: - run(f"{lkp.scontrol} update jobid={job.job_id} admincomment='{comment}'") - run(f"{lkp.scontrol} notify {job.job_id} '{comment}'") + run(f"{lookup().scontrol} update jobid={job.job_id} admincomment='{comment}'") + run(f"{lookup().scontrol} notify {job.job_id} '{comment}'") def down_nodes(nodelist, reason): @@ -475,13 +475,13 @@ def down_nodes(nodelist, reason): if isinstance(nodelist, list): nodelist = util.to_hostlist(nodelist) update_job_comment(nodelist, reason) - run(f"{lkp.scontrol} update nodename={nodelist} state=down reason='{reason}'") + run(f"{lookup().scontrol} update nodename={nodelist} state=down reason='{reason}'") def hold_job(job_id, reason): """hold job, set comment to reason""" - run(f"{lkp.scontrol} hold jobid={job_id}") - run(f"{lkp.scontrol} update jobid={job_id} comment='{reason}'") + run(f"{lookup().scontrol} hold jobid={job_id}") + run(f"{lookup().scontrol} update jobid={job_id} comment='{reason}'") def create_placement_request(pg_name, region): @@ -492,12 +492,12 @@ def create_placement_request(pg_name, region): "collocation": "COLLOCATED", }, } - if lkp.cfg.enable_slurm_gcp_plugins: + if 
lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.pre_placement_group_insert( - lkp=lkp, pg_name=pg_name, region=region, request_body=config + lkp=lookup(), pg_name=pg_name, region=region, request_body=config ) - request = lkp.compute.resourcePolicies().insert( - project=lkp.project, region=region, body=config + request = lookup().compute.resourcePolicies().insert( + project=lookup().project, region=region, body=config ) log_api_request(request) return request @@ -505,7 +505,7 @@ def create_placement_request(pg_name, region): def create_placement_groups(node_list: list, job_id=0): pgs = {} - node_map = lkp.nodeset_map(node_list) + node_map = lookup().nodeset_map(node_list) for _, nodes in node_map.items(): pgs.update(create_nodeset_placement_groups(nodes, job_id=job_id)) return pgs @@ -513,15 +513,15 @@ def create_placement_groups(node_list: list, job_id=0): def create_nodeset_placement_groups(node_list: list, job_id=0): model = next(iter(node_list)) - nodeset = lkp.node_nodeset(model) + nodeset = lookup().node_nodeset(model) if not nodeset.enable_placement: return {None: node_list} if not valid_placement_nodes(node_list): return {None: node_list} - region = lkp.node_region(model) + region = lookup().node_region(model) groups = { - f"{lkp.cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}-{job_id}-{i}": nodes + f"{lookup().cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}-{job_id}-{i}": nodes for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT)) } @@ -579,7 +579,7 @@ def classify_result(item): def valid_placement_nodes(nodelist): invalid_types = frozenset(["e2", "t2d", "n1", "t2a", "m1", "m2", "m3"]) for node in nodelist: - mt = lkp.node_template_info(node).machineType + mt = lookup().node_template_info(node).machineType if mt.split("-")[0] in invalid_types: log.warn(f"Unsupported machine type for placement policy: {mt}.") log.warn( @@ -608,7 +608,7 @@ def main(nodelist): log.debug(f"ResumeProgram {nodelist}") # Filter out nodes not in config.yaml other_nodes, pm_nodes = separate( - lkp.is_power_managed_node, util.to_hostnames(nodelist) + lookup().is_power_managed_node, util.to_hostnames(nodelist) ) if other_nodes: log.debug( @@ -626,7 +626,7 @@ def main(nodelist): resume_nodes(pm_nodes, global_resume_data) # TODO only run below if resume_nodes succeeds but # resume_nodes does not currently return any status. - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.post_main_resume_nodes( nodelist=nodelist, global_resume_data=global_resume_data ) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 9754f051bd..781b5f62ee 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -26,7 +26,7 @@ import util from util import ( - lkp, + lookup, dirs, slurmdirs, run, @@ -101,10 +101,10 @@ def end_motd(broadcast=True): return run( - "wall -n '*** Slurm {} setup complete ***'".format(lkp.instance_role), + "wall -n '*** Slurm {} setup complete ***'".format(lookup().instance_role), timeout=30, ) - if lkp.instance_role != "controller": + if not lookup().is_controller: run( """wall -n ' /home on the controller was mounted over the existing /home. 
@@ -125,13 +125,13 @@ def failed_motd(): def run_custom_scripts(): """run custom scripts based on instance_role""" custom_dir = dirs.custom_scripts - if lkp.is_controller: + if lookup().is_controller: # controller has all scripts, but only runs controller.d custom_dirs = [custom_dir / "controller.d"] - elif lkp.instance_role == "compute": + elif lookup().instance_role == "compute": # compute setup with compute.d and nodeset.d custom_dirs = [custom_dir / "compute.d", custom_dir / "nodeset.d"] - elif lkp.instance_role == "login": + elif lookup().instance_role == "login": # login setup with only login.d custom_dirs = [custom_dir / "login.d"] else: @@ -149,11 +149,11 @@ def run_custom_scripts(): try: for script in custom_scripts: if "/controller.d/" in str(script): - timeout = lkp.cfg.get("controller_startup_scripts_timeout", 300) + timeout = lookup().cfg.get("controller_startup_scripts_timeout", 300) elif "/compute.d/" in str(script) or "/nodeset.d/" in str(script): - timeout = lkp.cfg.get("compute_startup_scripts_timeout", 300) + timeout = lookup().cfg.get("compute_startup_scripts_timeout", 300) elif "/login.d/" in str(script): - timeout = lkp.cfg.get("login_startup_scripts_timeout", 300) + timeout = lookup().cfg.get("login_startup_scripts_timeout", 300) else: timeout = 300 timeout = None if not timeout or timeout < 0 else timeout @@ -265,13 +265,13 @@ def configure_mysql(): timeout=30, ) run( - f"""{mysql} "drop user 'slurm'@'{lkp.control_host}'";""", + f"""{mysql} "drop user 'slurm'@'{lookup().control_host}'";""", timeout=30, check=False, ) - run(f"""{mysql} "create user 'slurm'@'{lkp.control_host}'";""", timeout=30) + run(f"""{mysql} "create user 'slurm'@'{lookup().control_host}'";""", timeout=30) run( - f"""{mysql} "grant all on slurm_acct_db.* TO 'slurm'@'{lkp.control_host}'";""", + f"""{mysql} "grant all on slurm_acct_db.* TO 'slurm'@'{lookup().control_host}'";""", timeout=30, ) @@ -302,7 +302,7 @@ def configure_dirs(): scripts_log.symlink_to(dirs.log) for f in ("sort_nodes.py",): # copy auxiliary scripts - dst = Path(lkp.cfg.slurm_bin_dir) / f + dst = Path(lookup().cfg.slurm_bin_dir) / f shutil.copyfile(util.scripts_dir / f, dst) os.chmod(dst, 0o755) @@ -312,7 +312,7 @@ def setup_controller(): log.info("Setting up controller") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) install_custom_scripts() - conf.gen_controller_configs(lkp) + conf.gen_controller_configs(lookup()) setup_jwt_key() setup_munge_key() setup_sudoers() @@ -320,7 +320,7 @@ def setup_controller(): run_custom_scripts() - if not lkp.cfg.cloudsql_secret: + if not lookup().cfg.cloudsql_secret: configure_mysql() run("systemctl enable slurmdbd", timeout=30) @@ -331,7 +331,7 @@ def setup_controller(): sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i" result = run( - f"{sacctmgr} add cluster {lkp.cfg.slurm_cluster_name}", timeout=30, check=False + f"{sacctmgr} add cluster {lookup().cfg.slurm_cluster_name}", timeout=30, check=False ) if "already exists" in result.stdout: log.info(result.stdout) @@ -369,11 +369,11 @@ def setup_controller(): def setup_login(): """run login node setup""" log.info("Setting up login") - slurmctld_host = f"{lkp.control_host}" - if lkp.control_addr: - slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" + slurmctld_host = f"{lookup().control_host}" + if lookup().control_addr: + slurmctld_host = f"{lookup().control_host}({lookup().control_addr})" slurmd_options = [ - f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', + 
f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', f'--conf="Feature={conf.login_nodeset}"', "-Z", ] @@ -401,11 +401,11 @@ def setup_compute(): """run compute node setup""" log.info("Setting up compute") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) - slurmctld_host = f"{lkp.control_host}" - if lkp.control_addr: - slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" + slurmctld_host = f"{lookup().control_host}" + if lookup().control_addr: + slurmctld_host = f"{lookup().control_host}({lookup().control_addr})" slurmd_options = [ - f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', + f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', ] try: @@ -454,8 +454,8 @@ def main(): "compute": setup_compute, "login": setup_login, }.get( - lkp.instance_role, - lambda: log.fatal(f"Unknown node role: {lkp.instance_role}"))() + lookup().instance_role, + lambda: log.fatal(f"Unknown node role: {lookup().instance_role}"))() end_motd() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index 889f4cf59e..b544d4220e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -27,7 +27,7 @@ from addict import Dict as NSDict import util -from util import lkp, run, dirs, separate +from util import lookup, run, dirs, separate from more_executors import Executors, ExceptionRetryPolicy @@ -41,21 +41,21 @@ def mounts_by_local(mounts): def resolve_network_storage(nodeset=None): """Combine appropriate network_storage fields to a single list""" - if lkp.instance_role == "compute": + if lookup().instance_role == "compute": try: - nodeset = lkp.node_nodeset() + nodeset = lookup().node_nodeset() except Exception: # External nodename, skip lookup nodeset = None # seed mounts with the default controller mounts - if lkp.cfg.disable_default_mounts: + if lookup().cfg.disable_default_mounts: default_mounts = [] else: default_mounts = [ NSDict( { - "server_ip": lkp.control_addr or lkp.control_host, + "server_ip": lookup().control_addr or lookup().control_host, "remote_mount": str(path), "local_mount": str(path), "fs_type": "nfs", @@ -73,9 +73,9 @@ def resolve_network_storage(nodeset=None): # On non-controller instances, entries in network_storage could overwrite # default exports from the controller. 
Be careful, of course - mounts.update(mounts_by_local(lkp.cfg.network_storage)) - if lkp.instance_role in ("login", "controller"): - mounts.update(mounts_by_local(lkp.cfg.login_network_storage)) + mounts.update(mounts_by_local(lookup().cfg.network_storage)) + if lookup().instance_role in ("login", "controller"): + mounts.update(mounts_by_local(lookup().cfg.login_network_storage)) if nodeset is not None: mounts.update(mounts_by_local(nodeset.network_storage)) @@ -89,7 +89,7 @@ def internal_mount(mount): # NOTE: Valid Lustre server_ip can take the form of '@tcp' server_ip = mount.server_ip.split("@")[0] mount_addr = util.host_lookup(server_ip) - return mount_addr == lkp.control_host_addr + return mount_addr == lookup().control_host_addr return separate(internal_mount, mounts) @@ -102,7 +102,7 @@ def setup_network_storage(): all_mounts = resolve_network_storage() ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts) - if lkp.is_controller: + if lookup().is_controller: mounts = ext_mounts else: mounts = ext_mounts + int_mounts @@ -193,16 +193,16 @@ def mount_path(path): def munge_mount_handler(): - if not lkp.cfg.munge_mount: + if not lookup().cfg.munge_mount: log.error("Missing munge_mount in cfg") - elif lkp.is_controller: + elif lookup().is_controller: return - mount = lkp.cfg.munge_mount + mount = lookup().cfg.munge_mount server_ip = ( mount.server_ip if mount.server_ip - else (lkp.cfg.slurm_control_addr or lkp.cfg.slurm_control_host) + else (lookup().cfg.slurm_control_addr or lookup().cfg.slurm_control_host) ) remote_mount = mount.remote_mount local_mount = Path("/mnt/munge") @@ -276,18 +276,18 @@ def setup_nfs_exports(): mounts.append( NSDict( { - "server_ip": lkp.cfg.munge_mount.server_ip, - "remote_mount": lkp.cfg.munge_mount.remote_mount, + "server_ip": lookup().cfg.munge_mount.server_ip, + "remote_mount": lookup().cfg.munge_mount.remote_mount, "local_mount": Path(f"{dirs.munge}_tmp"), - "fs_type": lkp.cfg.munge_mount.fs_type, - "mount_options": lkp.cfg.munge_mount.mount_options, + "fs_type": lookup().cfg.munge_mount.fs_type, + "mount_options": lookup().cfg.munge_mount.mount_options, } ) ) # controller mounts _, con_mounts = separate_external_internal_mounts(mounts) con_mounts = {m.remote_mount: m for m in con_mounts} - for nodeset in lkp.cfg.nodeset.values(): + for nodeset in lookup().cfg.nodeset.values(): # get internal mounts for each nodeset by calling # resolve_network_storage as from a node in each nodeset ns_mounts = resolve_network_storage(nodeset=nodeset) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index e648d6b80c..770b8fbe38 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -45,7 +45,7 @@ TPU, chunked, ) -from util import lkp, CONFIG_FILE +from util import lookup, CONFIG_FILE from suspend import delete_instances from resume import start_tpu import conf @@ -72,9 +72,9 @@ def start_instance_op(inst): - return lkp.compute.instances().start( - project=lkp.project, - zone=lkp.instance(inst).zone, + return lookup().compute.instances().start( + project=lookup().project, + zone=lookup().instance(inst).zone, instance=inst, ) @@ -82,16 +82,14 @@ def start_instance_op(inst): def start_instances(node_list): log.info("{} 
instances to start ({})".format(len(node_list), ",".join(node_list))) - normal, tpu_nodes = separate(lkp.node_is_tpu, node_list) - invalid, valid = separate(lambda inst: bool(lkp.instance), normal) - - ops = {inst: start_instance_op(inst) for inst in valid} + normal, tpu_nodes = separate(lookup().node_is_tpu, node_list) + ops = {inst: start_instance_op(inst) for inst in normal} done, failed = batch_execute(ops) tpu_start_data = [] - for ns, nodes in util.groupby_unsorted(tpu_nodes, lkp.node_nodeset_name): - tpuobj = TPU(lkp.cfg.nodeset_tpu[ns]) + for ns, nodes in util.groupby_unsorted(tpu_nodes, lookup().node_nodeset_name): + tpuobj = TPU(lookup().cfg.nodeset_tpu[ns]) for snodes in chunked(nodes, n=tpuobj.vmcount): tpu_start_data.append({"tpu": tpuobj, "node": snodes}) execute_with_futures(start_tpu, tpu_start_data) @@ -105,14 +103,14 @@ def _find_dynamic_node_status() -> NodeStatus: def _find_tpu_node_status(nodename, state): - ns = lkp.node_nodeset(nodename) + ns = lookup().node_nodeset(nodename) tpuobj = TPU(ns) inst = tpuobj.get_node(nodename) # If we do not find the node but it is from a Tpu that has multiple vms look for the master node if inst is None and tpuobj.vmcount > 1: # Get the tpu slurm nodelist of the nodes in the same tpu group as nodename nodelist = run( - f"{lkp.scontrol} show topo {nodename}" + f"{lookup().scontrol} show topo {nodename}" + " | awk -F'=' '/Level=0/ { print $NF }'", shell=True, ).stdout @@ -142,7 +140,7 @@ def _find_tpu_node_status(nodename, state): & state.flags ): return NodeStatus.unbacked - if lkp.is_static_node(nodename): + if lookup().is_static_node(nodename): return NodeStatus.resume elif ( state is not None @@ -166,16 +164,16 @@ def _find_tpu_node_status(nodename, state): def find_node_status(nodename): """Determine node/instance status that requires action""" - state = lkp.slurm_node(nodename) + state = lookup().slurm_node(nodename) - if lkp.node_is_dyn(nodename): + if lookup().node_is_dyn(nodename): return _find_dynamic_node_status() - if lkp.node_is_tpu(nodename): + if lookup().node_is_tpu(nodename): return _find_tpu_node_status(nodename, state) # split below is workaround for VMs whose hostname is FQDN - inst = lkp.instance(nodename.split(".")[0]) + inst = lookup().instance(nodename.split(".")[0]) power_flags = frozenset( ("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN") ) & (state.flags if state is not None else set()) @@ -193,7 +191,7 @@ def find_node_status(nodename): return NodeStatus.unbacked if state.base == "DOWN" and not power_flags: return NodeStatus.power_down - if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename): + if "POWERED_DOWN" in state.flags and lookup().is_static_node(nodename): return NodeStatus.resume elif ( state is not None @@ -252,7 +250,7 @@ def nodes_down(): f"{count} nodes set down due to node status '{status.name}' ({hostlist})" ) run( - f"{lkp.scontrol} update nodename={hostlist} state=down reason='Instance stopped/deleted'" + f"{lookup().scontrol} update nodename={hostlist} state=down reason='Instance stopped/deleted'" ) def nodes_restart(): @@ -263,12 +261,12 @@ def nodes_restart(): def nodes_idle(): """idle nodes""" log.info(f"{count} nodes to idle ({hostlist})") - run(f"{lkp.scontrol} update nodename={hostlist} state=resume") + run(f"{lookup().scontrol} update nodename={hostlist} state=resume") def nodes_resume(): """resume nodes via scontrol""" log.info(f"{count} instances to resume ({hostlist})") - run(f"{lkp.scontrol} update nodename={hostlist} state=power_up") + 
run(f"{lookup().scontrol} update nodename={hostlist} state=power_up") def nodes_delete(): """delete instances for nodes""" @@ -278,15 +276,15 @@ def nodes_delete(): def nodes_power_down(): """power_down node in slurm""" log.info(f"{count} instances to power down ({hostlist})") - run(f"{lkp.scontrol} update nodename={hostlist} state=power_down") + run(f"{lookup().scontrol} update nodename={hostlist} state=power_down") def nodes_unknown(): """Error status, nodes shouldn't get in this status""" log.error(f"{count} nodes have unexpected status: ({hostlist})") first = next(iter(nodes)) - state = lkp.slurm_node(first) + state = lookup().slurm_node(first) state = "{}+{}".format(state.base, "+".join(state.flags)) if state else "None" - inst = lkp.instance(first) + inst = lookup().instance(first) log.error(f"{first} state: {state}, instance status:{inst.status}") update = dict.get( @@ -308,8 +306,8 @@ def nodes_unknown(): def delete_placement_groups(placement_groups): def delete_placement_request(pg_name, region): - return lkp.compute.resourcePolicies().delete( - project=lkp.project, region=region, resourcePolicy=pg_name + return lookup().compute.resourcePolicies().delete( + project=lookup().project, region=region, resourcePolicy=pg_name ) requests = { @@ -348,18 +346,18 @@ def sync_placement_groups(): keep_jobs = { str(job["job_id"]) - for job in json.loads(run(f"{lkp.scontrol} show jobs --json").stdout)["jobs"] + for job in json.loads(run(f"{lookup().scontrol} show jobs --json").stdout)["jobs"] if "job_state" in job and set(job["job_state"]) & keep_states } keep_jobs.add("0") # Job 0 is a placeholder for static node placement fields = "items.regions.resourcePolicies,nextPageToken" - flt = f"name={lkp.cfg.slurm_cluster_name}-*" - act = lkp.compute.resourcePolicies() - op = act.aggregatedList(project=lkp.project, fields=fields, filter=flt) + flt = f"name={lookup().cfg.slurm_cluster_name}-*" + act = lookup().compute.resourcePolicies() + op = act.aggregatedList(project=lookup().project, fields=fields, filter=flt) placement_groups = {} pg_regex = re.compile( - rf"{lkp.cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" + rf"{lookup().cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" ) while op is not None: result = ensure_execute(op) @@ -384,9 +382,9 @@ def sync_placement_groups(): def sync_slurm(): compute_instances = [ - name for name, inst in lkp.instances().items() if inst.role == "compute" + name for name, inst in lookup().instances().items() if inst.role == "compute" ] - slurm_nodes = list(lkp.slurm_nodes().keys()) + slurm_nodes = list(lookup().slurm_nodes().keys()) all_nodes = list( set( @@ -444,20 +442,20 @@ def reconfigure_slurm(): save_hash(CONFIG_HASH, hash_new.hexdigest()) save_config(cfg_new, CONFIG_FILE) cfg_new = load_config_file(CONFIG_FILE) - lkp = Lookup(cfg_new) - util.lkp = lkp - if lkp.is_controller: - conf.gen_controller_configs(lkp) + util._lkp = Lookup(cfg_new) + + if lookup().is_controller: + conf.gen_controller_configs(lookup()) log.info("Restarting slurmctld to make changes take effect.") try: # TODO: consider removing "restart" since "reconfigure" should restart slurmctld as well run("sudo systemctl restart slurmctld.service", check=False) - util.scontrol_reconfigure(lkp) + util.scontrol_reconfigure(lookup()) except Exception: log.exception("failed to reconfigure slurmctld") util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") - elif lkp.instance_role_safe in ["compute", "login"]: + elif lookup().instance_role_safe in ["compute", "login"]: 
log.info("Restarting slurmd to make changes take effect.") run("systemctl restart slurmd") util.run(f"wall '{update_msg}'", timeout=30) @@ -478,7 +476,7 @@ def main(): except Exception: log.exception("failed to reconfigure slurm") - if lkp.is_controller: + if lookup().is_controller: try: sync_slurm() except Exception: @@ -488,7 +486,7 @@ def main(): except Exception: log.exception("failed to sync placement groups") try: - update_topology(lkp) + update_topology(lookup()) except Exception: log.exception("failed to update topology") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index 9848e5a995..1d439b22d3 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -29,7 +29,7 @@ separate, execute_with_futures, ) -from util import lkp, TPU +from util import lookup, TPU import slurm_gcp_plugins @@ -49,9 +49,9 @@ def truncate_iter(iterable, max_count): def delete_instance_request(instance): - request = lkp.compute.instances().delete( - project=lkp.project, - zone=lkp.instance(instance).zone, + request = lookup().compute.instances().delete( + project=lookup().project, + zone=lookup().instance(instance).zone, instance=instance, ) log_api_request(request) @@ -74,10 +74,10 @@ def stop_tpu(data): def delete_tpu_instances(instances): stop_data = [] - for prefix, nodes in util.groupby_unsorted(instances, lkp.node_prefix): + for prefix, nodes in util.groupby_unsorted(instances, lookup().node_prefix): log.info(f"Deleting TPU nodes from prefix {prefix}") lnodes = list(nodes) - tpu_nodeset = lkp.node_nodeset(lnodes[0]) + tpu_nodeset = lookup().node_nodeset(lnodes[0]) tpu = TPU(tpu_nodeset) stop_data.extend( [{"tpu": tpu, "node": node, "nodeset": tpu_nodeset} for node in lnodes] @@ -87,7 +87,7 @@ def delete_tpu_instances(instances): def delete_instances(instances): """delete instances individually""" - invalid, valid = separate(lambda inst: bool(lkp.instance(inst)), instances) + invalid, valid = separate(lambda inst: bool(lookup.instance(inst)), instances) if len(invalid) > 0: log.debug("instances do not exist: {}".format(",".join(invalid))) if len(valid) == 0: @@ -109,7 +109,7 @@ def delete_instances(instances): def suspend_nodes(nodes: List[str]) -> None: tpu_nodes, other_nodes = [], [] for node in nodes[:]: - if lkp.node_is_tpu(node): + if lookup().node_is_tpu(node): tpu_nodes.append(node) else: other_nodes.append(node) @@ -124,7 +124,7 @@ def main(nodelist): # Filter out nodes not in config.yaml other_nodes, pm_nodes = separate( - lkp.is_power_managed_node, util.to_hostnames(nodelist) + lookup().is_power_managed_node, util.to_hostnames(nodelist) ) if other_nodes: log.debug( @@ -137,8 +137,8 @@ def main(nodelist): return log.info(f"suspend {nodelist}") - if lkp.cfg.enable_slurm_gcp_plugins: - slurm_gcp_plugins.pre_main_suspend_nodes(lkp=lkp, nodelist=nodelist) + if lookup().cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.pre_main_suspend_nodes(lkp=lookup(), nodelist=nodelist) suspend_nodes(pm_nodes) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index f74804250a..4dd3c8a17b 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -58,7 +58,7 @@ ], ) def test_node_desc(name, expected): - assert util.lkp._node_desc(name) == expected + assert util.lookup()._node_desc(name) == expected @pytest.mark.parametrize( @@ -69,7 +69,7 @@ def test_node_desc(name, expected): ) def test_node_desc_fail(name): with pytest.raises(Exception): - util.lkp._node_desc(name) + util.lookup()._node_desc(name) @pytest.mark.parametrize( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index a7ffcac378..06c4af9abf 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -89,8 +89,6 @@ def mkdirp(path: Path) -> None: p for p in (Path(__file__).parent, Path("/slurm/scripts")) if p.is_dir() ) -# caching Lookup object -lkp = None # load all directories as Paths into a dict-like namespace dirs = NSDict( @@ -159,7 +157,7 @@ def universe_domain() -> str: def endpoint_version(api: ApiEndpoint) -> Optional[str]: - return lkp.endpoint_versions.get(api.value, None) + return lookup().endpoint_versions.get(api.value, None) @lru_cache(maxsize=1) @@ -302,9 +300,9 @@ def install_custom_scripts(check_hash=False): """download custom scripts from gcs bucket""" compute_tokens = ["compute", "prolog", "epilog"] - if lkp.instance_role == "compute": + if lookup().instance_role == "compute": try: - compute_tokens.append(f"nodeset-{lkp.node_nodeset_name()}") + compute_tokens.append(f"nodeset-{lookup().node_nodeset_name()}") except Exception as e: log.error(f"Failed to lookup nodeset: {e}") @@ -314,7 +312,7 @@ def install_custom_scripts(check_hash=False): "compute": compute_tokens, "controller": ["controller", "prolog", "epilog"], }, - lkp.instance_role, + lookup().instance_role, [], ) prefixes = [f"slurm-{tok}-script" for tok in prefix_tokens] @@ -483,7 +481,7 @@ def get_log_path() -> Path: Returns path to log file for the current script. e.g. 
resume.py -> /var/log/slurm/resume.log """ - log_dir = Path(lkp.cfg.slurm_log_dir or ".") + log_dir = Path(lookup().cfg.slurm_log_dir or ".") return (log_dir / Path(sys.argv[0]).name).with_suffix(".log") def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: @@ -505,10 +503,10 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: args = parser.parse_args() loglevel = args.loglevel - if lkp.cfg.enable_debug_logging: + if lookup().cfg.enable_debug_logging: loglevel = logging.DEBUG if args.trace_api: - lkp.cfg.extra_logging_flags["trace_api"] = True + lookup().cfg.extra_logging_flags["trace_api"] = True # Configure root logger logging.config.dictConfig({ @@ -549,7 +547,7 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: def log_api_request(request): """log.trace info about a compute API request""" - if not lkp.cfg.extra_logging_flags.get("trace_api"): + if not lookup().cfg.extra_logging_flags.get("trace_api"): return # output the whole request object as pretty yaml @@ -828,7 +826,7 @@ def to_hostlist(nodenames) -> str: tmp_file.writelines("\n".join(sorted(nodenames, key=natural_sort))) tmp_file.close() - hostlist = run(f"{lkp.scontrol} show hostlist {tmp_file.name}").stdout.rstrip() + hostlist = run(f"{lookup().scontrol} show hostlist {tmp_file.name}").stdout.rstrip() os.remove(tmp_file.name) return hostlist @@ -893,7 +891,7 @@ def cur_repr(): def part_is_tpu(part): """check if partition with name part contains a nodeset of type tpu""" - return len(lkp.cfg.partitions[part].partition_nodeset_tpu) > 0 + return len(lookup().cfg.partitions[part].partition_nodeset_tpu) > 0 def to_hostnames(nodelist: str) -> List[str]: """make list of hostnames from hostlist expression""" @@ -903,7 +901,7 @@ def to_hostnames(nodelist: str) -> List[str]: hostlist = nodelist else: hostlist = ",".join(nodelist) - hostnames = run(f"{lkp.scontrol} show hostnames {hostlist}").stdout.splitlines() + hostnames = run(f"{lookup().scontrol} show hostnames {hostlist}").stdout.splitlines() return hostnames @@ -968,7 +966,7 @@ def batch_callback(rid, resp, exc): done[rid] = resp def batch_request(reqs): - batch = lkp.compute.new_batch_http_request(callback=batch_callback) + batch = lookup().compute.new_batch_http_request(callback=batch_callback) for rid, req in reqs: batch.add(req, request_id=rid) return batch @@ -1003,19 +1001,19 @@ def batch_request(reqs): def wait_request(operation, project: str): """makes the appropriate wait request for a given operation""" if "zone" in operation: - req = lkp.compute.zoneOperations().wait( + req = lookup().compute.zoneOperations().wait( project=project, zone=trim_self_link(operation["zone"]), operation=operation["name"], ) elif "region" in operation: - req = lkp.compute.regionOperations().wait( + req = lookup().compute.regionOperations().wait( project=project, region=trim_self_link(operation["region"]), operation=operation["name"], ) else: - req = lkp.compute.globalOperations().wait( + req = lookup().compute.globalOperations().wait( project=project, operation=operation["name"] ) return req @@ -1044,7 +1042,7 @@ def wait_for_operations(operations): def get_filtered_operations(op_filter): """get list of operations associated with group id""" - project = lkp.project + project = lookup().project operations = [] def get_aggregated_operations(items): @@ -1055,7 +1053,7 @@ def get_aggregated_operations(items): ) ) - act = lkp.compute.globalOperations() + act = lookup().compute.globalOperations() op = 
act.aggregatedList( project=project, filter=op_filter, fields="items.*.operations,nextPageToken" ) @@ -1178,7 +1176,7 @@ def __init__(self, nodeset): if not can_tpu: raise Exception("TPU pip package not installed") self._nodeset = nodeset - self._parent = f"projects/{lkp.project}/locations/{nodeset.zone}" + self._parent = f"projects/{lookup().project}/locations/{nodeset.zone}" co = create_client_options(ApiEndpoint.TPU) self._client = tpu.TpuClient(client_options=co) self.data_disks = [] @@ -1306,7 +1304,7 @@ def get_node(self, nodename): def _register_node(self, nodename, ip_addr): dns_name = socket.getnameinfo((ip_addr, 0), 0)[0] run( - f"{lkp.scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" + f"{lookup().scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" ) def create_node(self, nodename): @@ -1331,7 +1329,7 @@ def create_node(self, nodename): echo "startup script not found > /var/log/startup_error.log" """ with open( - Path(lkp.cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" + Path(lookup().cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" ) as script: startup_script = script.read() if isinstance(nodename, list): @@ -1348,12 +1346,12 @@ def create_node(self, nodename): "slurm_docker_image": self.nodeset.docker_image, "startup-script": startup_script, "slurm_instance_role": "compute", - "slurm_cluster_name": lkp.cfg.slurm_cluster_name, - "slurm_bucket_path": lkp.cfg.bucket_path, + "slurm_cluster_name": lookup().cfg.slurm_cluster_name, + "slurm_bucket_path": lookup().cfg.bucket_path, "slurm_names": ";".join(slurm_names), "universe_domain": universe_domain(), } - node.tags = [lkp.cfg.slurm_cluster_name] + node.tags = [lookup().cfg.slurm_cluster_name] if self.nodeset.service_account: node.service_account.email = self.nodeset.service_account.email node.service_account.scope = self.nodeset.service_account.scopes @@ -1393,7 +1391,7 @@ def delete_node(self, nodename): # not been found, and if it ends in 0, it means that is the master node and it should have been found, and in consequence # log an error nodehostname = yaml.safe_load( - run(f"{lkp.scontrol} --yaml show node {nodename}").stdout.rstrip() + run(f"{lookup().scontrol} --yaml show node {nodename}").stdout.rstrip() )["nodes"][0]["hostname"] if nodehostname.split("-")[-1] == "0": log.error(f"TPU master node {nodename} not found") @@ -1643,9 +1641,9 @@ def instances(self, project=None, slurm_cluster_name=None): # "deletionProtection", # "startRestricted", ] - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.register_instance_information_fields( - lkp=lkp, + lkp=lookup(), project=project, slurm_cluster_name=slurm_cluster_name, instance_information_fields=instance_information_fields, @@ -1863,20 +1861,22 @@ def nodeset_map(self, hostnames: list): def etc_dir(self) -> Path: return Path(self.cfg.output_dir or slurmdirs.etc) +_lkp: Optional[Lookup] = None + +def lookup() -> Lookup: + global _lkp + if _lkp is None: + cfg = load_config_file(CONFIG_FILE) + if not cfg: + try: + cfg = fetch_config_yaml() + except Exception as e: + log.warning(f"config not found in bucket: {e}") + if cfg: + save_config(cfg, CONFIG_FILE) + _lkp = Lookup(cfg) + return _lkp + def scontrol_reconfigure(lkp: Lookup) -> None: log.info("Running scontrol reconfigure") run(f"{lkp.scontrol} reconfigure", timeout=30) - -def _init_lkp() -> None: - cfg = load_config_file(CONFIG_FILE) - if not cfg: - try: - cfg = fetch_config_yaml() - except Exception 
as e: - log.warning(f"config not found in bucket: {e}") - if cfg: - save_config(cfg, CONFIG_FILE) - global lkp - lkp = Lookup(cfg) - -_init_lkp() # TODO: remove this line after refactoring From 4413f9529958dd15c167172750b039d771d6ed30 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 7 Aug 2024 18:56:27 +0000 Subject: [PATCH 085/180] Update batch-job-template image to hpc-rocky-linux-8 * Updated batch-job-template image to hpc-rocky-linux-8 from batch-hpc-centos-7-official * Verified both batch test pass --- examples/serverless-batch.yaml | 3 --- modules/scheduler/batch-job-template/README.md | 2 +- modules/scheduler/batch-job-template/variables.tf | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/serverless-batch.yaml b/examples/serverless-batch.yaml index 538e7d9671..9c0f89c0b9 100644 --- a/examples/serverless-batch.yaml +++ b/examples/serverless-batch.yaml @@ -46,9 +46,6 @@ deployment_groups: machine_type: n2-standard-4 task_count: 8 task_count_per_node: 4 - instance_image: - family: batch-hpc-rocky-linux-8-official - project: batch-custom-image allow_automatic_updates: false - id: batch-login diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 8cf6e6d276..229c1cfab5 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -162,7 +162,7 @@ limitations under the License. | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs | `bool` | `true` | no | | [gcloud\_version](#input\_gcloud\_version) | The version of the gcloud cli being used. Used for output instructions. Valid inputs are `"alpha"`, `"beta"` and "" (empty string for default version) | `string` | `""` | no | | [image](#input\_image) | DEPRECATED: Google Cloud Batch compute node image. Ignored if `instance_template` is provided. | `any` | `null` | no | -| [instance\_image](#input\_instance\_image) | Google Cloud Batch compute node image. Ignored if `instance_template` is provided.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "batch-hpc-centos-7-official",
"project": "batch-custom-image"
}
| no | +| [instance\_image](#input\_instance\_image) | Google Cloud Batch compute node image. Ignored if `instance_template` is provided.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no | | [instance\_template](#input\_instance\_template) | Compute VM instance template self-link to be used for Google Cloud Batch compute node. If provided, a number of other variables will be ignored as noted by `Ignored if instance_template is provided` in descriptions. | `string` | `null` | no | | [job\_filename](#input\_job\_filename) | The filename of the generated job template file. Will default to `cloud-batch-.json` if not specified | `string` | `null` | no | | [job\_id](#input\_job\_id) | An id for the Google Cloud Batch job. Used for output instructions and file naming. Automatically populated by the module id if not set. If setting manually, ensure a unique value across all jobs. | `string` | n/a | yes | diff --git a/modules/scheduler/batch-job-template/variables.tf b/modules/scheduler/batch-job-template/variables.tf index bfce75666e..f65fbd111e 100644 --- a/modules/scheduler/batch-job-template/variables.tf +++ b/modules/scheduler/batch-job-template/variables.tf @@ -197,8 +197,8 @@ variable "instance_image" { EOD type = map(string) default = { - project = "batch-custom-image" - family = "batch-hpc-centos-7-official" + project = "cloud-hpc-image-public" + family = "hpc-rocky-linux-8" } validation { From 7be95ea1e21419197dda79b462915c579730f85c Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 13 Aug 2024 17:03:50 -0700 Subject: [PATCH 086/180] Revert "Remove `docs/slurm-dws-flex.md`" --- docs/slurm-dws-flex.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 docs/slurm-dws-flex.md diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md new file mode 100644 index 0000000000..ffea0bec16 --- /dev/null +++ b/docs/slurm-dws-flex.md @@ -0,0 +1,32 @@ +# Obtaining SlurmGCP nodes with DWS Flex + +[Dynamic Workload Scheduler](https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler) Flex Start mode is designed for fine-tuning models, experimentation, shorter training jobs, distillation, offline inference, and batch jobs. + +With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity request for your AI/ML jobs by indicating how many you need, a duration, and your preferred region. It supports capacity requests for up to seven days, with no minimum duration requirement. You can request capacity for as little as a few minutes or hours; typically, the scheduler can fulfill shorter requests more quickly than longer ones. + +> [!IMPORTANT] +> The project needs to be allowlisted for private preview access. +> Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). + +In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: + +```yaml + - id: flex_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + instance_properties: + reservationAffinity: + consumeReservationType: NO_RESERVATION + scheduling: + maxRunDuration: { seconds: $(2 * 60 * 60) } # 2 hours + onHostMaintenance: TERMINATE + instanceTerminationAction: DELETE + # the rest of the settings, e.g. node_count_static, machine_type, additional_disks, etc. +``` + +**All** fields in `instance_properties` should match provided values, except for `maxRunDuration`, which should be set to the desired duration in seconds (up to 604800 = 7 days). 
+ +> [!WARNING] +> The use of the `instance_properties` setting directly overrides bulkInsert API parameters. While the documented sample +> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. From 90731712945d9a875d1c87ee7590296ad05e4c82 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 14 Aug 2024 04:49:15 +0000 Subject: [PATCH 087/180] SlurmGCP. Fix typo-bug `lookup -> lookup()` --- .../modules/slurm_files/scripts/suspend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index 1d439b22d3..4866dffb1e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -87,7 +87,7 @@ def delete_tpu_instances(instances): def delete_instances(instances): """delete instances individually""" - invalid, valid = separate(lambda inst: bool(lookup.instance(inst)), instances) + invalid, valid = separate(lambda inst: bool(lookup().instance(inst)), instances) if len(invalid) > 0: log.debug("instances do not exist: {}".format(",".join(invalid))) if len(valid) == 0: From 1411676fc42732c5d344a9d09bdd1c1ffdf61215 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 14 Aug 2024 10:03:27 +0000 Subject: [PATCH 088/180] use preconditions for validating different variables together --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/main.tf | 13 +++++++++++-- modules/compute/gke-node-pool/variables.tf | 10 ++++------ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index e96c4f97dc..b143d4e896 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -243,7 +243,7 @@ No modules. | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | -| [specific\_reservation](#input\_specific\_reservation) | Reservation resources to consume when targeting SPECIFIC\_RESERVATION. Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value. |
object({
key = string
values = list(string)
})
|
{
"key": null,
"values": null
}
| no | +| [specific\_reservation](#input\_specific\_reservation) | Reservation resources to consume when targeting SPECIFIC\_RESERVATION.
Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value. |
object({
key = string
values = list(string)
})
|
{
"key": null,
"values": null
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | | [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index b160eb1984..9045bb3f5a 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -154,8 +154,7 @@ resource "google_container_node_pool" "node_pool" { reservation_affinity { consume_reservation_type = var.reservation_type key = var.specific_reservation.key - # TODO(arajmane): ensure the reservation exists through dependencies? - values = var.specific_reservation.values + values = var.specific_reservation.values } } @@ -176,6 +175,16 @@ resource "google_container_node_pool" "node_pool" { condition = !(var.local_ssd_count_ephemeral_storage > 0 && var.local_ssd_count_nvme_block > 0) error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value." } + precondition { + condition = ( + (var.reservation_type != "SPECIFIC_RESERVATION" && var.specific_reservation.key == null && var.specific_reservation.values == null) || + (var.reservation_type == "SPECIFIC_RESERVATION" && var.specific_reservation.key == "compute.googleapis.com/reservation-name" && var.specific_reservation.values != null) + ) + error_message = <<-EOT + When using NO_RESERVATION or ANY_RESERVATION as the reservation type, `specific_reservation` cannot be set. + On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. + EOT + } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index eab92d61e5..40376c928b 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -278,7 +278,10 @@ variable "reservation_type" { } variable "specific_reservation" { - description = "Reservation resources to consume when targeting SPECIFIC_RESERVATION. Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value." + description = <<-EOT + Reservation resources to consume when targeting SPECIFIC_RESERVATION. + Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value. + EOT type = object({ key = string values = list(string) @@ -287,9 +290,4 @@ variable "specific_reservation" { key = null values = null } - - validation { - condition = (var.specific_reservation.key == null && var.specific_reservation.values == null) || (var.specific_reservation.key == "compute.googleapis.com/reservation-name" && var.specific_reservation.values != null) - error_message = "Value must be equal to `compute.googleapis.com/reservation-name` when targeting a SPECIFIC_RESERVATION. Otherwise, do not specify the value" - } } From eb9fd07a0c04f4055a50747cfcedf444f5334347 Mon Sep 17 00:00:00 2001 From: Scott Gordon Date: Wed, 14 Aug 2024 17:21:26 +0100 Subject: [PATCH 089/180] OFE: new go version released has moved previous package to archive. 
update to download url --- .../ofe/infrastructure_files/gcs_bucket/webserver/startup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index c43eab7380..dd92f7641f 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -161,7 +161,7 @@ EOF # # Note: go.mod doesn't reference minor version so we need to capture the latest GO_MAJOR_VERSION=$(awk '/^go/ {print $2}' "/opt/gcluster/cluster-toolkit/go.mod") -GO_API_RESPONSE=$(curl --silent "https://go.dev/dl/?mode=json") +GO_API_RESPONSE=$(curl --silent "https://go.dev/dl/?mode=json&include=all") GO_VERSION=$(echo "$GO_API_RESPONSE" | jq -r --arg major "go$GO_MAJOR_VERSION" '.[] | select(.version | startswith($major)).version' | sort -V | tail -n 1) GO_DOWNLOAD_URL="https://golang.org/dl/${GO_VERSION}.linux-amd64.tar.gz" curl --silent --show-error --location "${GO_DOWNLOAD_URL}" --output "/tmp/${GO_VERSION}.linux-amd64.tar.gz" From b23f6bfa7ba88218df718a1c0aabd6f296894106 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 14 Aug 2024 09:54:29 -0700 Subject: [PATCH 090/180] Add dependabot scanning for Slurm v6 Controller module --- .github/dependabot.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index eb9008bdaa..bd25a3a71f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -45,3 +45,18 @@ updates: # Disable version updates, do security updates only # See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file open-pull-requests-limit: 0 +- package-ecosystem: pip + directory: /community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/ + labels: + - dependencies + - python + - release-chore + schedule: + interval: weekly + day: monday + time: "03:00" + timezone: America/Los_Angeles + target-branch: develop + # Disable version updates, do security updates only + # See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file + open-pull-requests-limit: 0 From 356b488ad5d7eff2fdf895823656898f16ea026c Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 6 Aug 2024 22:12:55 +0000 Subject: [PATCH 091/180] Add reservation support in slurm sync for scheduled maintenance --- .../modules/slurm_files/scripts/slurmsync.py | 88 ++++++++++++++++++- .../modules/slurm_files/scripts/util.py | 12 ++- 2 files changed, 95 insertions(+), 5 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 770b8fbe38..ff42fc4f18 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -26,6 +26,8 @@ from itertools import chain from pathlib import Path import yaml +from datetime import datetime +from typing import List, Dict, Tuple import util from util import ( @@ -443,7 +445,7 @@ def 
reconfigure_slurm(): save_config(cfg_new, CONFIG_FILE) cfg_new = load_config_file(CONFIG_FILE) util._lkp = Lookup(cfg_new) - + if lookup().is_controller: conf.gen_controller_configs(lookup()) log.info("Restarting slurmctld to make changes take effect.") @@ -470,6 +472,82 @@ def update_topology(lkp: util.Lookup) -> None: log.debug("Topology configuration updated. Reconfiguring Slurm.") util.scontrol_reconfigure(lkp) + +def delete_reservation(lkp: util.Lookup, reservation_name: str) -> None: + util.run(f"{lkp.scontrol} delete reservation {reservation_name}") + + +def create_reservation(lkp: util.Lookup, reservation_name: str, node: str, start_time: datetime) -> None: + # Format time to be compatible with slurm reservation. + formatted_start_time = start_time.strftime('%Y-%m-%dT%H:%M:%S') + util.run(f"{lkp.scontrol} create reservation user=slurm starttime={formatted_start_time} duration=180 nodes={node} reservationname={reservation_name}") + + +def get_slurm_reservation_maintenance(lkp: util.Lookup) -> Dict[str, datetime]: + res = util.run(f"{lkp.scontrol} show reservation --json") + all_reservations = json.loads(res.stdout) + reservation_map = {} + + for reservation in all_reservations['reservations']: + name = reservation.get('name') + nodes = reservation.get('node_list') + time_epoch = reservation.get('start_time', {}).get('number') + + if name is None or nodes is None or time_epoch is None: + continue + + if reservation.get('node_count') != 1: + continue + + if name != f"{nodes}_maintenance": + continue + + reservation_map[name] = datetime.fromtimestamp(time_epoch) + + return reservation_map + + +def get_upcoming_maintenance(lkp: util.Lookup) -> Dict[str, Tuple[str, datetime]]: + upc_maint_map = {} + + for node, properties in lkp.instances().items(): + if 'upcomingMaintenance' in properties: + start_time = datetime.strptime(properties['upcomingMaintenance']['startTimeWindow']['earliest'], '%Y-%m-%dT%H:%M:%S%z') + upc_maint_map[node + "_maintenance"] = (node, start_time) + + return upc_maint_map + + +def sync_maintenance_reservation(lkp: util.Lookup) -> None: + upc_maint_map = get_upcoming_maintenance(lkp) # map reservation_name -> (node_name, time) + log.debug(f"upcoming-maintenance-vms: {upc_maint_map}") + + curr_reservation_map = get_slurm_reservation_maintenance(lkp) # map reservation_name -> time + log.debug(f"curr-reservation-map: {curr_reservation_map}") + + del_reservation = set(curr_reservation_map.keys() - upc_maint_map.keys()) + create_reservation_map = {} + + for res_name, (node, start_time) in upc_maint_map.items(): + if res_name in curr_reservation_map: + diff = curr_reservation_map[res_name] - start_time + if abs(diff) <= datetime.timedelta(seconds=1): + continue + else: + del_reservation.add(res_name) + create_reservation_map[res_name] = (node, start_time) + else: + create_reservation_map[res_name] = (node, start_time) + + log.debug(f"del-reservation: {del_reservation}") + for res_name in del_reservation: + delete_reservation(lkp, res_name) + + log.debug(f"create-reservation-map: {create_reservation_map}") + for res_name, (node, start_time) in create_reservation_map.items(): + create_reservation(lkp, res_name, node, start_time) + + def main(): try: reconfigure_slurm() @@ -481,15 +559,23 @@ def main(): sync_slurm() except Exception: log.exception("failed to sync instances") + try: sync_placement_groups() except Exception: log.exception("failed to sync placement groups") + try: update_topology(lookup()) except Exception: log.exception("failed to update topology") + ## TODO: 
Enable reservation for scheduled maintenance. + # try: + # sync_maintenance_reservation(lookup()) + # except Exception: + # log.exception("failed to sync slurm reservation for scheduled maintenance") + try: install_custom_scripts(check_hash=True) except Exception: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 06c4af9abf..f876e21fc1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -337,7 +337,7 @@ def install_custom_scripts(check_hash=False): chown_slurm(dirs.custom_scripts / par) need_update = True if check_hash and fullpath.exists(): - # TODO: MD5 reported by gcloud may differ from the one calculated here (e.g. if blob got gzipped), + # TODO: MD5 reported by gcloud may differ from the one calculated here (e.g. if blob got gzipped), # consider using gCRC32C need_update = hash_file(fullpath) != blob.md5_hash if need_update: @@ -501,7 +501,6 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: help="Enable detailed api request output", ) args = parser.parse_args() - loglevel = args.loglevel if lookup().cfg.enable_debug_logging: loglevel = logging.DEBUG @@ -549,7 +548,6 @@ def log_api_request(request): """log.trace info about a compute API request""" if not lookup().cfg.extra_logging_flags.get("trace_api"): return - # output the whole request object as pretty yaml # the body is nested json, so load it as well rep = json.loads(request.to_json()) @@ -1648,6 +1646,12 @@ def instances(self, project=None, slurm_cluster_name=None): slurm_cluster_name=slurm_cluster_name, instance_information_fields=instance_information_fields, ) + + # TODO: Merge this with all fields when upcoming maintenance is + # supported in beta. + if endpoint_version(ApiEndpoint.COMPUTE) == 'alpha': + instance_information_fields.append("upcomingMaintenance") + instance_information_fields = sorted(set(instance_information_fields)) instance_fields = ",".join(instance_information_fields) fields = f"items.zones.instances({instance_fields}),nextPageToken" @@ -1675,7 +1679,7 @@ def properties(inst): instance_iter = ( (inst["name"], properties(inst)) for inst in chain.from_iterable( - m["instances"] for m in result.get("items", {}).values() + zone.get("instances", []) for zone in result.get("items", {}).values() ) ) instances.update( From cbe838f82a431142464ee1b435c8aa15f740cdfa Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 13 Aug 2024 01:00:32 +0000 Subject: [PATCH 092/180] SlurmGCP. Fetch config once during setup. 
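
With this change config.yaml is pulled from the bucket explicitly: setup.py downloads it
once at boot (retrying until the bucket is reachable) and slurmsync.py re-fetches it only
when the stored hash no longer matches the bucket copy, while lookup() itself just reads
the locally cached file. A condensed sketch of the intended call pattern is below;
`bootstrap_config` and `maybe_reconfigure` are illustrative wrapper names only, whereas
`util.fetch_config()` and `util.update_config()` are the helpers introduced by this patch.

```python
import time

import util


def bootstrap_config(retry_seconds: int = 5) -> None:
    # setup.py path: keep retrying until config.yaml can be fetched from the
    # bucket, then cache it locally and rebuild the cached Lookup object.
    while True:
        try:
            _, cfg = util.fetch_config()   # returns (changed, parsed config)
            util.update_config(cfg)        # refresh the cached Lookup
            return
        except Exception:
            time.sleep(retry_seconds)


def maybe_reconfigure() -> None:
    # slurmsync.py path: act only when the bucket copy actually changed.
    changed, cfg = util.fetch_config()
    if changed:
        util.update_config(cfg)
        # ... regenerate Slurm configs and restart/reconfigure daemons ...
```

Change detection relies on a hash stored at /slurm/scripts/.config.hash being compared
against the bucket object, as implemented in util.py below.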
--- .../modules/slurm_files/scripts/setup.py | 12 ++- .../modules/slurm_files/scripts/slurmsync.py | 77 ++++++----------- .../modules/slurm_files/scripts/util.py | 86 +++++++++---------- 3 files changed, 77 insertions(+), 98 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 3045bc6690..4bf86f176e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -445,8 +445,18 @@ def setup_compute(): def main(): start_motd() - configure_dirs() + + sleep_seconds = 5 + while True: + try: + _, cfg = util.fetch_config() + util.update_config(cfg) + break + except Exception as e: + log.exception(f"could not fetch config, sleeping for {sleep_seconds}s") + time.sleep(sleep_seconds) + configure_dirs() # call the setup function for the instance type { "controller": setup_controller, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index c6d588b78d..81733dbb10 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -17,7 +17,6 @@ import argparse import datetime import fcntl -import hashlib import json import logging import re @@ -34,12 +33,8 @@ batch_execute, ensure_execute, execute_with_futures, - fetch_config_yaml, - fetch_config_yaml_md5, install_custom_scripts, - load_config_file, run, - save_config, separate, to_hostlist_fast, Lookup, @@ -47,7 +42,7 @@ TPU, chunked, ) -from util import lookup, CONFIG_FILE +from util import lookup from suspend import delete_instances from resume import start_tpu import conf @@ -409,55 +404,35 @@ def sync_slurm(): do_node_update(status, nodes) -def read_hash(filename): - filename = Path(filename) - if not filename.exists(): - return None - with open(filename, "r", encoding="utf-8") as file: - return file.readline() - - -def save_hash(filename, hash): - with open(filename, "w+", encoding="utf-8") as file: - file.write(hash) - - def reconfigure_slurm(): - CONFIG_HASH = Path("/slurm/scripts/.config.hash") update_msg = "*** slurm configuration was updated ***" - cfg_old = load_config_file(CONFIG_FILE) - - if cfg_old.hybrid: + if lookup().cfg.hybrid: # terraform handles generating the config.yaml, don't do it here return - - hash_new: hashlib.md5 = fetch_config_yaml_md5() - hash_old: str = read_hash(CONFIG_HASH) - - if hash_new.hexdigest() != hash_old: - log.debug("Delta detected. 
Reconfiguring Slurm now.") - cfg_new = fetch_config_yaml() - save_hash(CONFIG_HASH, hash_new.hexdigest()) - save_config(cfg_new, CONFIG_FILE) - cfg_new = load_config_file(CONFIG_FILE) - util._lkp = Lookup(cfg_new) - - if lookup().is_controller: - conf.gen_controller_configs(lookup()) - log.info("Restarting slurmctld to make changes take effect.") - try: - # TODO: consider removing "restart" since "reconfigure" should restart slurmctld as well - run("sudo systemctl restart slurmctld.service", check=False) - util.scontrol_reconfigure(lookup()) - except Exception: - log.exception("failed to reconfigure slurmctld") - util.run(f"wall '{update_msg}'", timeout=30) - log.debug("Done.") - elif lookup().instance_role_safe in ["compute", "login"]: - log.info("Restarting slurmd to make changes take effect.") - run("systemctl restart slurmd") - util.run(f"wall '{update_msg}'", timeout=30) - log.debug("Done.") + + upd, cfg_new = util.fetch_config() + if not upd: + log.debug("No changes in config detected.") + return + log.debug("Changes in config detected. Reconfiguring Slurm now.") + util.update_config(cfg_new) + + if lookup().is_controller: + conf.gen_controller_configs(lookup()) + log.info("Restarting slurmctld to make changes take effect.") + try: + # TODO: consider removing "restart" since "reconfigure" should restart slurmctld as well + run("sudo systemctl restart slurmctld.service", check=False) + util.scontrol_reconfigure(lookup()) + except Exception: + log.exception("failed to reconfigure slurmctld") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") + elif lookup().instance_role_safe in ["compute", "login"]: + log.info("Restarting slurmd to make changes take effect.") + run("systemctl restart slurmd") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") def update_topology(lkp: util.Lookup) -> None: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index fd78d6da35..98a280967d 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -78,7 +78,6 @@ else: CONFIG_FILE = Path(__file__).with_name("config.yaml") API_REQ_LIMIT = 2000 -URI_REGEX = r"[a-z]([-a-z0-9]*[a-z0-9])?" 
def mkdirp(path: Path) -> None: @@ -394,9 +393,7 @@ def storage_client() -> storage.Client: return storage.Client(client_options=ClientOptions(**co)) -def load_config_data(config): - """load dict-like data into a config object""" - cfg = NSDict(config) +def _fill_cfg_defaults(cfg: NSDict) -> NSDict: if not cfg.slurm_log_dir: cfg.slurm_log_dir = dirs.log if not cfg.slurm_bin_dir: @@ -415,15 +412,7 @@ def load_config_data(config): "mount_options": "defaults,hard,intr,_netdev", } ) - return cfg - - -def new_config(config): - """initialize a new config object - necessary defaults are handled here - """ - cfg = load_config_data(config) - + network_storage_iter = filter( None, ( @@ -442,34 +431,35 @@ def new_config(config): return cfg -def fetch_config_yaml(): - """Fetch config.yaml from bucket""" - config_yaml = blob_get("config.yaml").download_as_text() - return new_config(yaml.safe_load(config_yaml)) - - -def fetch_config_yaml_md5(): - """Fetch config.yaml blob md5 from bucket""" +def _fetch_config(old_hash: Optional[str]) -> Optional[Tuple[NSDict, str]]: + """Fetch config from bucket, returns None if no changes are detected.""" blob = blob_get("config.yaml") blob.reload() # Populate blob with metadata - hash_str = str(blob.md5_hash).encode(encoding="utf-8") - return hashlib.md5(hash_str) + if old_hash == blob.md5_hash: + return None - -def load_config_file(path): - """load config from file""" - content = None - try: - content = yaml.safe_load(Path(path).read_text()) - except FileNotFoundError: - log.warning(f"config file not found: {path}") - return NSDict() - return load_config_data(content) + cfg = NSDict(yaml.safe_load(blob.download_as_text())) + return _fill_cfg_defaults(cfg), blob.md5_hash -def save_config(cfg, path): - """save given config to file at path""" - Path(path).write_text(yaml.dump(cfg, Dumper=Dumper)) +def fetch_config() -> Tuple[bool, NSDict]: + """ + Fetches config from bucket and saves it locally + Returns True if new (updated) config was fetched + """ + hash_file = Path("/slurm/scripts/.config.hash") + old_hash = hash_file.read_text() if hash_file.exists() else None + + cfg_and_hash = _fetch_config(old_hash=old_hash) + if not cfg_and_hash: + return False, _load_config() + + cfg, hash = cfg_and_hash + hash_file.write_text(hash) + chown_slurm(hash_file) + CONFIG_FILE.write_text(yaml.dump(cfg, Dumper=Dumper)) + chown_slurm(CONFIG_FILE) + return True, cfg def owned_file_handler(filename): """create file handler""" @@ -481,7 +471,8 @@ def get_log_path() -> Path: Returns path to log file for the current script. e.g. 
resume.py -> /var/log/slurm/resume.log """ - log_dir = Path(lookup().cfg.slurm_log_dir or ".") + cfg_log_dir = lookup().cfg.slurm_log_dir + log_dir = Path(cfg_log_dir) if cfg_log_dir else dirs.log return (log_dir / Path(sys.argv[0]).name).with_suffix(".log") def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: @@ -506,7 +497,6 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: loglevel = logging.DEBUG if args.trace_api: lookup().cfg.extra_logging_flags["trace_api"] = True - # Configure root logger logging.config.dictConfig({ "version": 1, @@ -1842,20 +1832,24 @@ def etc_dir(self) -> Path: _lkp: Optional[Lookup] = None +def _load_config() -> NSDict: + return NSDict(yaml.safe_load(CONFIG_FILE.read_text())) + def lookup() -> Lookup: global _lkp if _lkp is None: - cfg = load_config_file(CONFIG_FILE) - if not cfg: - try: - cfg = fetch_config_yaml() - except Exception as e: - log.warning(f"config not found in bucket: {e}") - if cfg: - save_config(cfg, CONFIG_FILE) + try: + cfg = _load_config() + except FileNotFoundError: + log.error(f"config file not found: {CONFIG_FILE}") + cfg = NSDict() # TODO: fail here, once all code paths are covered (mainly init_logging) _lkp = Lookup(cfg) return _lkp +def update_config(cfg: NSDict) -> None: + global _lkp + _lkp = Lookup(cfg) + def scontrol_reconfigure(lkp: Lookup) -> None: log.info("Running scontrol reconfigure") run(f"{lkp.scontrol} reconfigure", timeout=30) From 19327fb1d7a6a28a52359adee147549284b163f5 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 3 Aug 2024 23:37:22 +0000 Subject: [PATCH 093/180] Store nodeset and partition config in separate files --- .../modules/slurm_files/README.md | 2 + .../modules/slurm_files/main.tf | 29 ++++--- .../scripts/setup_network_storage.py | 5 +- .../modules/slurm_files/scripts/slurmsync.py | 3 +- .../modules/slurm_files/scripts/util.py | 81 +++++++++++++++---- 5 files changed, 92 insertions(+), 28 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 5414e24c8b..8789eab115 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -46,7 +46,9 @@ No modules. 
| [google_storage_bucket_object.devel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.epilog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.login_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.nodeset_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.nodeset_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.parition_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.prolog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [random_uuid.cluster_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/uuid) | resource | | [archive_file.slurm_gcp_devel_zip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index c25748dc48..0547b15f05 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -67,12 +67,6 @@ locals { epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] cloud_parameters = var.cloud_parameters - partitions = { for p in var.partitions : p.partition_name => p } - nodeset = { - for n in var.nodeset : n.nodeset_name => merge(n, { - instance_properties = jsondecode(n.instance_properties_json) - }) - } nodeset_dyn = { for n in var.nodeset_dyn : n.nodeset_name => n } nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } @@ -97,9 +91,6 @@ locals { endpoint_versions = var.endpoint_versions } - config_yaml = "config.yaml" - config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) - x_nodeset = toset(var.nodeset[*].nodeset_name) x_nodeset_dyn = toset(var.nodeset_dyn[*].nodeset_name) x_nodeset_tpu = toset(var.nodeset_tpu[*].nodeset.nodeset_name) @@ -128,10 +119,28 @@ locals { resource "google_storage_bucket_object" "config" { bucket = data.google_storage_bucket.this.name - name = local.config_yaml_bucket + name = "${local.bucket_dir}/config.yaml" content = yamlencode(local.config) } +resource "google_storage_bucket_object" "parition_config" { + for_each = { for p in var.partitions : p.partition_name => p } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/partition_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + +resource "google_storage_bucket_object" "nodeset_config" { + for_each = { for ns in var.nodeset : ns.nodeset_name => merge(ns, { + instance_properties = jsondecode(ns.instance_properties_json) + }) } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/nodeset_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + 
######### # DEVEL # ######### diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index b544d4220e..80e4be9397 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -213,8 +213,6 @@ def munge_mount_handler(): else "defaults,hard,intr,_netdev" ) - munge_key = Path(dirs.munge / "munge.key") - log.info(f"Mounting munge share to: {local_mount}") local_mount.mkdir() if fs_type.lower() == "gcsfuse".lower(): @@ -228,7 +226,7 @@ def munge_mount_handler(): ] else: if remote_mount is None: - remote_mount = Path("/etc/munge") + remote_mount = dirs.munge cmd = [ "mount", f"--types={fs_type}", @@ -252,6 +250,7 @@ def munge_mount_handler(): else: raise err + munge_key = Path(dirs.munge / "munge.key") log.info(f"Copy munge.key from: {local_mount}") shutil.copy2(Path(local_mount / "munge.key"), munge_key) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 81733dbb10..a7195db17f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -37,7 +37,6 @@ run, separate, to_hostlist_fast, - Lookup, NSDict, TPU, chunked, @@ -548,6 +547,8 @@ def main(): # log.exception("failed to sync slurm reservation for scheduled maintenance") try: + # TODO: it performs 1 to 4 GCS list requests, + # use cached version, combine with `_list_config_blobs` install_custom_scripts(check_hash=True) except Exception: log.exception("failed to sync custom scripts") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 98a280967d..9fac1f8c8a 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, List, Tuple, Optional +from typing import Iterable, List, Tuple, Optional, Any import argparse import base64 import collections @@ -38,7 +38,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import contextmanager from functools import lru_cache, reduce, wraps -from itertools import chain, compress, islice +from itertools import chain, islice from pathlib import Path from time import sleep, time @@ -266,17 +266,18 @@ def map_with_futures(func, seq): res = e yield res +def _get_bucket_and_common_prefix() -> Tuple[str, str]: + uri = instance_metadata("attributes/slurm_bucket_path") + return parse_bucket_uri(uri) def blob_get(file): - uri = instance_metadata("attributes/slurm_bucket_path") - bucket_name, path = parse_bucket_uri(uri) + bucket_name, path = _get_bucket_and_common_prefix() blob_name = f"{path}/{file}" return storage_client().get_bucket(bucket_name).blob(blob_name) def blob_list(prefix="", delimiter=None): - uri = instance_metadata("attributes/slurm_bucket_path") - bucket_name, path = parse_bucket_uri(uri) + bucket_name, path = _get_bucket_and_common_prefix() blob_prefix = f"{path}/{prefix}" # Note: The call returns a response only when the iterator is consumed. blobs = storage_client().list_blobs( @@ -402,8 +403,7 @@ def _fill_cfg_defaults(cfg: NSDict) -> NSDict: cfg.slurm_control_host = f"{cfg.slurm_cluster_name}-controller" if not cfg.slurm_control_host_port: cfg.slurm_control_host_port = "6820-6830" - if not cfg.munge_mount: - # NOTE: should only happen with cloud controller + if not cfg.munge_mount: # NOTE: should only happen with cloud controller cfg.munge_mount = NSDict( { "server_ip": cfg.slurm_control_addr or cfg.slurm_control_host, @@ -416,6 +416,7 @@ def _fill_cfg_defaults(cfg: NSDict) -> NSDict: network_storage_iter = filter( None, ( + cfg.munge_mount, *cfg.network_storage, *cfg.login_network_storage, *chain.from_iterable(ns.network_storage for ns in cfg.nodeset.values()), @@ -430,17 +431,69 @@ def _fill_cfg_defaults(cfg: NSDict) -> NSDict: netstore.server_ip = cfg.slurm_control_host return cfg +def _list_config_blobs() -> Tuple[Any, str]: + _, common_prefix = _get_bucket_and_common_prefix() + res = { # TODO: use a dataclass once we move to python 3.7 + "core": None, + "partition": [], + "nodeset": [], + } + hash = hashlib.md5() + blobs = list(blob_list(prefix="")) + # sort blobs so hash is consistent + for blob in sorted(blobs, key=lambda b: b.name): + if blob.name == f"{common_prefix}/config.yaml": + res["core"] = blob + hash.update(blob.md5_hash.encode("utf-8")) + for key in ("partition", "nodeset"): + if blob.name.startswith(f"{common_prefix}/{key}_configs/"): + res[key].append(blob) + hash.update(blob.md5_hash.encode("utf-8")) + assert res["core"] is not None, "config.yaml not found in bucket" + return res, hash.hexdigest() + def _fetch_config(old_hash: Optional[str]) -> Optional[Tuple[NSDict, str]]: """Fetch config from bucket, returns None if no changes are detected.""" - blob = blob_get("config.yaml") - blob.reload() # Populate blob with metadata - if old_hash == blob.md5_hash: + # TODO: fetch nodeset_dyn, nodeset_tpu, and login + blobs, hash = _list_config_blobs() + if old_hash == hash: return None - cfg = NSDict(yaml.safe_load(blob.download_as_text())) - return _fill_cfg_defaults(cfg), blob.md5_hash - + def _download(bs) -> List[Any]: + return [yaml.safe_load(b.download_as_text()) for b in bs] + + return _assemble_config( + core=_download([blobs["core"]])[0], + 
partitions=_download(blobs["partition"]), + nodesets=_download(blobs["nodeset"]), + ), hash + +def _assemble_config(core: Any, partitions: List[Any], nodesets: List[Any]) -> NSDict: + cfg = NSDict(core) + + # add partition configs + for p_yaml in partitions: + p_cfg = NSDict(p_yaml) + assert p_cfg.get("partition_name"), "partition_name is required" + p_name = p_cfg.partition_name + assert p_name not in cfg.partitions, f"partition {p_name} already defined" + cfg.partitions[p_name] = p_cfg + + # add nodeset configs + for ns_yaml in nodesets: + ns_cfg = NSDict(ns_yaml) + assert ns_cfg.get("nodeset_name"), "nodeset_name is required" + ns_name = ns_cfg.nodeset_name + assert ns_name not in cfg.nodeset, f"nodeset {ns_name} already defined" + cfg.nodeset[ns_name] = ns_cfg + + # validate that configs for all referenced nodesets are present + for p in cfg.partitions.values(): + for ns_name in p.partition_nodeset: + assert ns_name in cfg.nodeset, f"nodeset {ns_name} not defined in config" + + return _fill_cfg_defaults(cfg) def fetch_config() -> Tuple[bool, NSDict]: """ From 6e00e1e402c3482695aefdbf4a5a41c2ad4b0c7b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 16 Aug 2024 00:44:24 +0000 Subject: [PATCH 094/180] SlurmGCP. Fix `datetime` usage. --- .../modules/slurm_files/scripts/slurmsync.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index a7195db17f..d9cf579c6c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -25,8 +25,9 @@ from itertools import chain from pathlib import Path import yaml +import datetime as dt from datetime import datetime -from typing import List, Dict, Tuple +from typing import Dict, Tuple import util from util import ( @@ -229,8 +230,8 @@ def _seconds_since_timestamp(timestamp): """ if timestamp[-3] == ":": # python 36 datetime does not support the colon timestamp = timestamp[:-3] + timestamp[-2:] - creation_dt = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f%z") - return datetime.datetime.now().timestamp() - creation_dt.timestamp() + creation_dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f%z") + return datetime.now().timestamp() - creation_dt.timestamp() def do_node_update(status, nodes): @@ -501,7 +502,7 @@ def sync_maintenance_reservation(lkp: util.Lookup) -> None: for res_name, (node, start_time) in upc_maint_map.items(): if res_name in curr_reservation_map: diff = curr_reservation_map[res_name] - start_time - if abs(diff) <= datetime.timedelta(seconds=1): + if abs(diff) <= dt.timedelta(seconds=1): continue else: del_reservation.add(res_name) From c10c0de52b79cb934066a8874f1d49c4a974eae5 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 16 Aug 2024 01:05:46 +0000 Subject: [PATCH 095/180] SlurmGCP. 
Store TPU and dynamic nodesets configs in separate files --- .../modules/slurm_files/README.md | 2 ++ .../modules/slurm_files/main.tf | 19 ++++++++-- .../modules/slurm_files/scripts/util.py | 36 +++++++++++++------ 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 8789eab115..7a5d59e3e3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -47,7 +47,9 @@ No modules. | [google_storage_bucket_object.epilog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.login_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.nodeset_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.nodeset_dyn_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.nodeset_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.nodeset_tpu_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.parition_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.prolog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [random_uuid.cluster_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/uuid) | resource | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 0547b15f05..00338a3cca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -67,9 +67,6 @@ locals { epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] cloud_parameters = var.cloud_parameters - nodeset_dyn = { for n in var.nodeset_dyn : n.nodeset_name => n } - nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } - # hybrid hybrid = var.enable_hybrid google_app_cred_path = var.enable_hybrid ? 
local.google_app_cred_path : null @@ -141,6 +138,22 @@ resource "google_storage_bucket_object" "nodeset_config" { content = yamlencode(each.value) } +resource "google_storage_bucket_object" "nodeset_dyn_config" { + for_each = { for ns in var.nodeset_dyn : ns.nodeset_name => ns } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/nodeset_dyn_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + +resource "google_storage_bucket_object" "nodeset_tpu_config" { + for_each = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/nodeset_tpu_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + ######### # DEVEL # ######### diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 9fac1f8c8a..4a20c1b40a 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -437,6 +437,8 @@ def _list_config_blobs() -> Tuple[Any, str]: "core": None, "partition": [], "nodeset": [], + "nodeset_dyn": [], + "nodeset_tpu": [], } hash = hashlib.md5() blobs = list(blob_list(prefix="")) @@ -445,7 +447,7 @@ def _list_config_blobs() -> Tuple[Any, str]: if blob.name == f"{common_prefix}/config.yaml": res["core"] = blob hash.update(blob.md5_hash.encode("utf-8")) - for key in ("partition", "nodeset"): + for key in ("partition", "nodeset", "nodeset_dyn", "nodeset_tpu"): if blob.name.startswith(f"{common_prefix}/{key}_configs/"): res[key].append(blob) hash.update(blob.md5_hash.encode("utf-8")) @@ -455,7 +457,6 @@ def _list_config_blobs() -> Tuple[Any, str]: def _fetch_config(old_hash: Optional[str]) -> Optional[Tuple[NSDict, str]]: """Fetch config from bucket, returns None if no changes are detected.""" - # TODO: fetch nodeset_dyn, nodeset_tpu, and login blobs, hash = _list_config_blobs() if old_hash == hash: return None @@ -467,9 +468,17 @@ def _download(bs) -> List[Any]: core=_download([blobs["core"]])[0], partitions=_download(blobs["partition"]), nodesets=_download(blobs["nodeset"]), + nodesets_dyn=_download(blobs["nodeset_dyn"]), + nodesets_tpu=_download(blobs["nodeset_tpu"]), ), hash -def _assemble_config(core: Any, partitions: List[Any], nodesets: List[Any]) -> NSDict: +def _assemble_config( + core: Any, + partitions: List[Any], + nodesets: List[Any], + nodesets_dyn: List[Any], + nodesets_tpu: List[Any], + ) -> NSDict: cfg = NSDict(core) # add partition configs @@ -481,17 +490,24 @@ def _assemble_config(core: Any, partitions: List[Any], nodesets: List[Any]) -> N cfg.partitions[p_name] = p_cfg # add nodeset configs - for ns_yaml in nodesets: - ns_cfg = NSDict(ns_yaml) - assert ns_cfg.get("nodeset_name"), "nodeset_name is required" - ns_name = ns_cfg.nodeset_name - assert ns_name not in cfg.nodeset, f"nodeset {ns_name} already defined" - cfg.nodeset[ns_name] = ns_cfg + ns_names = set() + def _add_nodesets(yamls: List[Any], target: dict): + for ns_yaml in yamls: + ns_cfg = NSDict(ns_yaml) + assert ns_cfg.get("nodeset_name"), "nodeset_name is required" + ns_name = ns_cfg.nodeset_name + assert ns_name not in ns_names, f"nodeset {ns_name} already defined" + target[ns_name] = ns_cfg + ns_names.add(ns_name) + + _add_nodesets(nodesets, cfg.nodeset) + _add_nodesets(nodesets_dyn, 
cfg.nodeset_dyn) + _add_nodesets(nodesets_tpu, cfg.nodeset_tpu) # validate that configs for all referenced nodesets are present for p in cfg.partitions.values(): for ns_name in p.partition_nodeset: - assert ns_name in cfg.nodeset, f"nodeset {ns_name} not defined in config" + assert ns_name in ns_names, f"nodeset {ns_name} not defined in config" return _fill_cfg_defaults(cfg) From 94495a9d723c8c8974f123431ab089a89f685293 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 16 Aug 2024 12:17:27 +0000 Subject: [PATCH 096/180] An output enable_multi_networking added to multivpc module. --- modules/network/multivpc/README.md | 1 + modules/network/multivpc/outputs.tf | 5 +++++ modules/scheduler/gke-cluster/main.tf | 2 -- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/network/multivpc/README.md b/modules/network/multivpc/README.md index f0f55175c9..e254bc4888 100644 --- a/modules/network/multivpc/README.md +++ b/modules/network/multivpc/README.md @@ -126,6 +126,7 @@ limitations under the License. | Name | Description | |------|-------------| | [additional\_networks](#output\_additional\_networks) | Network interfaces for each subnetwork created by this module | +| [enable\_multi\_networking](#output\_enable\_multi\_networking) | Enables multi-networking by setting the corresponding variable to true on supported modules (e.g., gke\_cluster). | | [network\_ids](#output\_network\_ids) | IDs of the new VPC network | | [network\_names](#output\_network\_names) | Names of the new VPC networks | | [network\_self\_links](#output\_network\_self\_links) | Self link of the new VPC network | diff --git a/modules/network/multivpc/outputs.tf b/modules/network/multivpc/outputs.tf index c838faa67c..6596f543be 100644 --- a/modules/network/multivpc/outputs.tf +++ b/modules/network/multivpc/outputs.tf @@ -48,3 +48,8 @@ output "subnetwork_addresses" { description = "IP address range of the primary subnetwork" value = module.vpcs[*].subnetwork_address } + +output "enable_multi_networking" { + description = "Enables multi-networking by setting the corresponding variable to true on supported modules (e.g., gke_cluster)." + value = true +} diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 99f1dd9c9c..de20423b71 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -95,8 +95,6 @@ resource "google_container_cluster" "gke_cluster" { enable_multi_networking = local.derived_enable_multi_networking - networking_mode = "VPC_NATIVE" - network_policy { # Enabling NetworkPolicy for clusters with DatapathProvider=ADVANCED_DATAPATH # is not allowed. 
Dataplane V2 will take care of network policy enforcement From d157a44afa5390ae82a028e7aa3427cd3f3f1779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Fri, 16 Aug 2024 13:56:01 +0000 Subject: [PATCH 097/180] Upgrade to DAOS 2.6, skip installation if already installed --- .../parallelstore/scripts/install-daos-client.sh | 16 +++++++++++----- .../scripts/install-daos-client.sh | 16 +++++++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index e59f2cfeb9..d2f28c6bf8 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -22,16 +22,22 @@ for arg in "$@"; do fi done +if [ -x /bin/daos ]; then + echo "DAOS already installed" + daos version + exit 0 +fi + # Install the DAOS client library # The following commands should be executed on each client vm. ## For Rocky linux 8. if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then # 1) Add the Parallelstore package repository - tee /etc/yum.repos.d/parallelstore-v2-4-el8.repo < Date: Fri, 16 Aug 2024 14:17:58 +0000 Subject: [PATCH 098/180] Add RHEL support --- .../scripts/install-daos-client.sh | 52 +++++++++---------- .../scripts/install-daos-client.sh | 52 +++++++++---------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index d2f28c6bf8..4c5204859e 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -25,16 +25,15 @@ done if [ -x /bin/daos ]; then echo "DAOS already installed" daos version - exit 0 -fi +else -# Install the DAOS client library -# The following commands should be executed on each client vm. -## For Rocky linux 8. -if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then + # Install the DAOS client library + # The following commands should be executed on each client vm. + ## For Rocky linux 8. + if grep -q "ID=\"(rocky|rhel)\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then - # 1) Add the Parallelstore package repository - tee /etc/yum.repos.d/parallelstore-v2-6-el8.repo < Date: Fri, 16 Aug 2024 10:45:37 -0700 Subject: [PATCH 099/180] Change dependabot Golang package version update to monthly This PR changes the package update schedule for Golang packages to a monthly cadence. It does not change the security patch update schedule. See the [dependabot YAML configuration options](https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#configuration-options-for-the-dependabotyml-file) for more info. 
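
For reference, a minimal sketch of the Go-module entry as it reads after this change. The `package-ecosystem: gomod` key and `directory: /` value are assumptions (they sit above the visible hunk); only the `interval` and `ignore` settings are what this patch actually introduces.

```yaml
# Sketch of the Go entry in .github/dependabot.yml after this change.
# `package-ecosystem` and `directory` are assumed; the schedule interval and
# the ignore block are the settings changed/added by this patch.
- package-ecosystem: gomod   # assumed ecosystem key for Go modules
  directory: /               # assumed module root
  labels:
  - go
  - release-chore
  schedule:
    interval: monthly        # was: weekly
    day: monday
    time: "03:00"
    timezone: America/Los_Angeles
  target-branch: develop
  ignore:
  - dependency-name: "google.golang.org/api"   # excluded from version-update PRs
```

The `ignore` entry only suppresses version-update pull requests for that dependency; security updates are configured separately and are unaffected.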
--- .github/dependabot.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index bd25a3a71f..a4a10fee69 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,11 +23,14 @@ updates: - go - release-chore schedule: - interval: weekly + interval: monthly day: monday time: "03:00" timezone: America/Los_Angeles target-branch: develop + ignore: + - dependency-name: "google.golang.org/api" + - package-ecosystem: pip directory: /community/front-end/ofe/ labels: From ba7fe8345ac55d7a28b65cac4c048df5f52b04e2 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 16 Aug 2024 12:37:46 -0700 Subject: [PATCH 100/180] Update A3H instructions: Python dependencies are no longer needed for v6 --- .../machine-learning/a3-highgpu-8g/README.md | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/README.md index 38a967fbe5..5e42a0b075 100644 --- a/examples/machine-learning/a3-highgpu-8g/README.md +++ b/examples/machine-learning/a3-highgpu-8g/README.md @@ -45,23 +45,6 @@ Verify that your release of the Cluster Toolkit is 1.37.0 or later. gcluster --version ``` -The solution requires several Python packages to be available. We recommend -installing them in a Python virtual environment: - -```shell -python3 -m venv toolkit-a3 -source toolkit-a3/bin/activate -pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.5.13/scripts/requirements.txt -``` - -**Always** activate the environment before running any gcluster commands such as -deploy or destroy. - -```shell -source /absolute/path/to/toolkit-a3/bin/activate -``` - ## Top-Level Design of Solution The solution is split into 3 Cluster Toolkit blueprints: From ab8ff6ae65d76a0ec55ac09b6646af2bf4e4958d Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 16 Aug 2024 19:19:47 +0000 Subject: [PATCH 101/180] Update parallelstore documentation to add section for supported operating systems --- modules/file-system/parallelstore/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 37b4788b53..37e28ed2d2 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -4,6 +4,11 @@ This module creates [parallelstore](https://cloud.google.com/parallelstore) instance. Parallelstore is Google Cloud's first party parallel file system service based on [Intel DAOS](https://docs.daos.io/v2.2/) +### Supported Operating Systems + +A parallelstore instance can be used with Slurm cluster or compute +VM running Ubuntu 22.04, debian 12 or HPC Rocky Linux 8. + ### Parallelstore Quota To get access to a private preview of Parallelstore APIs, your project needs to @@ -31,9 +36,6 @@ issues. You can specify different mount options as follows, For parallelstore instance, Below snippet creates new VPC and configures private-service-access for this newly created network. -The parallelstore instance created here can be used with Slurm cluster or compute -VM running Ubuntu 22.04, debian 12 or HPC Rocky Linux 8. 
- ```yaml - id: network source: modules/network/vpc From ec4c10e005709a6ff8406d55111f42232c7778bd Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 16 Aug 2024 20:38:38 +0000 Subject: [PATCH 102/180] adding optional fields to redirect use of embedded modules to pull from github with versioned tags --- pkg/config/config.go | 2 ++ pkg/config/expand.go | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/pkg/config/config.go b/pkg/config/config.go index 521b68c103..7e3d7d99a8 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -272,6 +272,8 @@ type Blueprint struct { Groups []Group `yaml:"deployment_groups"` TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` TerraformProviders map[string]TerraformProvider `yaml:"terraform_providers,omitempty"` + ToolkitModulesURL string `yaml:"toolkit_modules_url,omitempty"` + ToolkitModulesVersion string `yaml:"toolkit_modules_version,omitempty"` // internal & non-serializable fields diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 97b7708852..59dda0cf25 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -17,6 +17,7 @@ package config import ( "errors" "fmt" + "strings" "hpc-toolkit/pkg/modulereader" @@ -143,9 +144,24 @@ func (bp Blueprint) expandGroup(gp groupPath, g *Group) error { func (bp Blueprint) expandModule(mp ModulePath, m *Module) error { bp.applyUseModules(m) bp.applyGlobalVarsInModule(m) + // Versioned Module Logic for Embedded Modules + if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion != "" { + if strings.HasPrefix(m.Source, "modules/") || strings.HasPrefix(m.Source, "community/") { + newSource, err := constructVersionedModuleSource(bp.ToolkitModulesURL, m.Source, bp.ToolkitModulesVersion) + if err != nil { + return fmt.Errorf("error constructing versioned module source: %w", err) + } + m.Source = newSource + } + } return validateModuleInputs(mp, *m, bp) } +// TODO: Add validation and error checks for baseURL and version +func constructVersionedModuleSource(baseURL, modulePath, version string) (string, error) { + return fmt.Sprintf("%s//%s?ref=%s&depth=1", baseURL, modulePath, version), nil +} + func (bp Blueprint) expandBackend(grp *Group) { // 1. DEFAULT: use TerraformBackend configuration (if supplied) // 2. If top-level TerraformBackendDefaults is defined, insert that From 4814ad724b3c2e2833003e79f0be669ea53a20ee Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 16 Aug 2024 21:12:53 +0000 Subject: [PATCH 103/180] SlurmGCP. 
Clean up `get_nodes_status` There is no other states that `startswith("DOWN")` other than "DOWN" See: https://github.com/SchedMD/slurm/blob/master/src/plugins/data_parser/v0.0.42/parsers.c#L7449 --- .../modules/slurm_files/scripts/slurmsync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index d9cf579c6c..002057a3b1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -147,7 +147,7 @@ def _find_tpu_node_status(nodename, state): ): if tpuobj.preemptible: return NodeStatus.preempted - if not state.base.startswith("DOWN"): + if state.base != "DOWN": return NodeStatus.terminated elif ( state is None or "POWERED_DOWN" in state.flags @@ -198,7 +198,7 @@ def find_node_status(nodename): ): if inst.scheduling.preemptible: return NodeStatus.preempted - if not state.base.startswith("DOWN"): + if state.base != "DOWN": return NodeStatus.terminated elif (state is None or "POWERED_DOWN" in state.flags) and inst.status == "RUNNING": log.info("%s is potential orphan node", nodename) From d548a67dc263d29f6f1606592a05e9476e8e088b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 24 Jul 2024 20:54:56 +0000 Subject: [PATCH 104/180] Add `TopologySummary` to track changes to topology --- .../modules/slurm_files/scripts/conf.py | 74 +++++++++++++++++-- .../scripts/tests/requirements.txt | 1 + .../scripts/tests/test_topology.py | 33 ++++++++- 3 files changed, 99 insertions(+), 9 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index eaa16abc23..57e3931b46 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional, Iterable, Dict +from typing import List, Optional, Iterable, Dict, Set from itertools import chain from collections import defaultdict import json @@ -440,10 +440,53 @@ def render_conf_lines(self) -> Iterable[str]: for s in sorted(self.switches.values(), key=lambda s: s.name): yield from s.render_conf_lines() +class TopologySummary: + """ + Represents a summary of the topology, to make judgements about changes. + To be stored in JSON file along side of topology.conf to simplify parsing. 
+ """ + def __init__( + self, + physical_host: Optional[Dict[str, str]] = None, + down_nodes: Optional[Iterable[str]] = None, + tpu_nodes: Optional[Iterable[str]] = None, + ) -> None: + self.physical_host = physical_host or {} + self.down_nodes = set(down_nodes or []) + self.tpu_nodes = set(tpu_nodes or []) + + + @classmethod + def loads(cls, s: str) -> "TopologySummary": + d = json.loads(s) + return cls( + physical_host=d.get("physical_host"), + down_nodes=d.get("down_nodes"), + tpu_nodes=d.get("tpu_nodes"), + ) + + def dumps(self) -> str: + return json.dumps( + { + "physical_host": self.physical_host, + "down_nodes": list(self.down_nodes), + "tpu_nodes": list(self.tpu_nodes), + }, + indent=2) + + def _nodenames(self) -> Set[str]: + return set(self.physical_host) | self.down_nodes | self.tpu_nodes + + def requires_reconfigure(self, prev: "TopologySummary") -> bool: + """ + Reconfigure IFF the nodes were added + """ + return len(self._nodenames() - prev._nodenames()) > 0 class TopologyBuilder: def __init__(self) -> None: self._r = Switch("") # fake root, not part of the tree + self.summary = TopologySummary() def add(self, path: List[str], nodes: Iterable[str]) -> None: n = self._r @@ -460,6 +503,7 @@ def render_conf_lines(self) -> Iterable[str]: def compress(self) -> "TopologyBuilder": compressed = TopologyBuilder() + compressed.summary = self.summary def _walk( u: Switch, c: Switch ): # u: uncompressed node, c: its counterpart in compressed tree @@ -479,7 +523,9 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.L pref = ["tpu-root", f"ns_{nodeset.nodeset_name}"] if tpuobj.vmcount == 1: # Put all nodes in one switch - bldr.add(pref, list(chain(static, dynamic))) + all_nodes = list(chain(static, dynamic)) + bldr.add(pref, all_nodes) + bldr.summary.tpu_nodes.update(all_nodes) return # Chunk nodes into sub-switches of size `vmcount` @@ -488,7 +534,8 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.L for nodeschunk in util.chunked(nodenames, n=tpuobj.vmcount): chunk_name = f"{nodeset.nodeset_name}-{chunk_num}" chunk_num += 1 - bldr.add([*pref, chunk_name], list(nodeschunk)) + bldr.add([*pref, chunk_name], nodeschunk) + bldr.summary.tpu_nodes.update(nodeschunk) def add_nodeset_topology( @@ -497,6 +544,8 @@ def add_nodeset_topology( path = ["slurm-root", f"ns_{nodeset.nodeset_name}"] nodes = list(chain(*lkp.nodenames(nodeset))) bldr.add(path, nodes) + # treat all nodes as down, since we don't make use of physical_host yet + bldr.summary.down_nodes.update(nodes) def gen_topology(lkp: util.Lookup) -> TopologyBuilder: @@ -513,26 +562,35 @@ def gen_topology_conf(lkp: util.Lookup) -> bool: Generates slurm topology.conf. Returns whether the topology.conf got updated. 
""" - bldr = gen_topology(lkp).compress() + topo = gen_topology(lkp).compress() conf_file = lkp.etc_dir / "cloud_topology.conf" - old_hash = util.hash_file(conf_file) if conf_file.exists() else "" + with open(conf_file, "w") as f: f.writelines(FILE_PREAMBLE + "\n") - for line in bldr.render_conf_lines(): + for line in topo.render_conf_lines(): f.write(line) f.write("\n") f.write("\n") - new_hash = util.hash_file(conf_file) - return old_hash != new_hash + summary_file = lkp.etc_dir / "cloud_topology.summary.json" + prev_summary = TopologySummary() + if summary_file.exists(): + prev_summary = TopologySummary.loads(summary_file.read_text()) + summary_file.write_text(topo.summary.dumps()) + + return topo.summary.requires_reconfigure(prev_summary) def install_topology_conf(lkp: util.Lookup) -> None: conf_file = lkp.etc_dir / "cloud_topology.conf" + summary_file = lkp.etc_dir / "cloud_topology.summary.json" topo_conf = lkp.etc_dir / "topology.conf" + if not topo_conf.exists(): topo_conf.symlink_to(conf_file) + util.chown_slurm(conf_file, mode=0o600) + util.chown_slurm(summary_file, mode=0o600) def gen_controller_configs(lkp: util.Lookup) -> None: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt index 85c80e7f84..fc44e31a6c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt @@ -1,3 +1,4 @@ pytest pytest-mock +pytest_unordered mock diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index 62c33e52cc..dc60e59318 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -13,7 +13,9 @@ # limitations under the License. 
import pytest +import json import mock +from pytest_unordered import unordered from common import TstCfg, TstNodeset, TstTPU, make_to_hostnames_mock import sort_nodes @@ -88,10 +90,39 @@ def tpu_se(ns: TstNodeset) -> TstTPU: "SwitchName=s1_1 Nodes=m22-slim-[0-2]"] assert list(compressed.render_conf_lines()) == want_compressed - conf.gen_topology_conf(util.Lookup(cfg)) + assert conf.gen_topology_conf(lkp) == True want_written = PRELUDE + "\n".join(want_compressed) + "\n\n" assert open(cfg.output_dir + "/cloud_topology.conf").read() == want_written + summary_got = json.loads(open(cfg.output_dir + "/cloud_topology.summary.json").read()) + assert summary_got == { + "down_nodes": unordered( + [f"m22-blue-{i}" for i in range(7)] + + [f"m22-green-{i}" for i in range(5)] + + [f"m22-pink-{i}" for i in range(4)]), + "tpu_nodes": unordered( + [f"m22-bold-{i}" for i in range(9)] + + [f"m22-slim-{i}" for i in range(3)]), + "physical_host": {}, + } + + + +def test_gen_topology_conf_update(): + cfg = TstCfg( + nodeset={ + "c": TstNodeset("green", node_count_static=2), + }, + output_dir=tempfile.mkdtemp(), + ) + lkp = util.Lookup(cfg) + assert conf.gen_topology_conf(lkp) == True + + lkp.cfg.nodeset["c"].node_count_static = 3 + assert conf.gen_topology_conf(lkp) == True + + lkp.cfg.nodeset["c"].node_count_static = 1 + assert conf.gen_topology_conf(lkp) == False @pytest.mark.parametrize( From c24eca4e4b34eb2c8dc1aab67d1a8ae00cae6420 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 26 Jul 2024 04:58:43 +0000 Subject: [PATCH 105/180] Add physical topo. Update rules to trigger reconfiguration --- .../modules/slurm_files/scripts/conf.py | 52 ++++++++++-- .../slurm_files/scripts/tests/common.py | 12 +++ .../scripts/tests/test_topology.py | 84 +++++++++++++++---- .../modules/slurm_files/scripts/util.py | 21 ++--- 4 files changed, 134 insertions(+), 35 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 57e3931b46..f92bfdf3e7 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -479,9 +479,16 @@ def _nodenames(self) -> Set[str]: def requires_reconfigure(self, prev: "TopologySummary") -> bool: """ - Reconfigure IFF the nodes were added + Reconfigure IFF one of the following occurs: + * A node is added + * A node get a non-empty physicalHost """ - return len(self._nodenames() - prev._nodenames()) > 0 + if len(self._nodenames() - prev._nodenames()) > 0: + return True + for n, ph in self.physical_host.items(): + if ph and ph != prev.physical_host.get(n): + return True + return False class TopologyBuilder: def __init__(self) -> None: @@ -537,16 +544,45 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.L bldr.add([*pref, chunk_name], nodeschunk) bldr.summary.tpu_nodes.update(nodeschunk) +_SLURM_TOPO_ROOT = "slurm-root" + +def _make_physical_path(physical_host: str) -> List[str]: + assert physical_host.startswith("/"), f"Unexpected physicalHost: {physical_host}" + parts = physical_host[1:].split("/") + # Due to issues with Slurm's topology plugin, we can not use all components of `physicalHost`, + # trim it down to `cluster/rack`. 
+ short_path = parts[:2] + return [_SLURM_TOPO_ROOT, *short_path] def add_nodeset_topology( nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup ) -> None: - path = ["slurm-root", f"ns_{nodeset.nodeset_name}"] - nodes = list(chain(*lkp.nodenames(nodeset))) - bldr.add(path, nodes) - # treat all nodes as down, since we don't make use of physical_host yet - bldr.summary.down_nodes.update(nodes) - + up_nodes = set() + default_path = [_SLURM_TOPO_ROOT, f"ns_{nodeset.nodeset_name}"] + + for inst in lkp.instances().values(): + try: + if lkp.node_nodeset_name(inst.name) != nodeset.nodeset_name: + continue + except Exception: + continue + + phys_host = inst.resourceStatus.get("physicalHost", "") + bldr.summary.physical_host[inst.name] = phys_host + up_nodes.add(inst.name) + + if phys_host: + bldr.add(_make_physical_path(phys_host), [inst.name]) + else: + bldr.add(default_path, [inst.name]) + + down_nodes = [] + for node in chain(*lkp.nodenames(nodeset)): + if node not in up_nodes: + down_nodes.append(node) + if down_nodes: + bldr.add(default_path, down_nodes) + bldr.summary.down_nodes.update(down_nodes) def gen_topology(lkp: util.Lookup) -> TopologyBuilder: bldr = TopologyBuilder() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index e95e436b0c..8db9add6c3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -70,6 +70,18 @@ class TstMachineConf: class TstTemplateInfo: gpu_count: int = 0 +@dataclass +class TstInstance: + name: str + region: str = "gondor" + zone: str = "anorien" + placementPolicyId: Optional[str] = None + physicalHost: Optional[str] = None + + @property + def resourceStatus(self): + return {"physicalHost": self.physicalHost} + def make_to_hostnames_mock(tbl: Optional[dict[str, list[str]]]): tbl = tbl or {} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index dc60e59318..6d44338c81 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -16,7 +16,7 @@ import json import mock from pytest_unordered import unordered -from common import TstCfg, TstNodeset, TstTPU, make_to_hostnames_mock +from common import TstCfg, TstNodeset, TstTPU, TstInstance import sort_nodes import util @@ -60,12 +60,33 @@ def tpu_se(ns: TstNodeset) -> TstTPU: tpu_mock.side_effect = tpu_se lkp = util.Lookup(cfg) + lkp.instances = lambda: { n.name: n for n in [ + # nodeset blue + TstInstance("m22-blue-0"), # no physicalHost + TstInstance("m22-blue-0", physicalHost="/a/a/a"), + TstInstance("m22-blue-1", physicalHost="/a/a/b"), + TstInstance("m22-blue-2", physicalHost="/a/b/a"), + TstInstance("m22-blue-3", physicalHost="/b/a/a"), + # nodeset green + TstInstance("m22-green-3", physicalHost="/a/a/c"), + ]} + uncompressed = conf.gen_topology(lkp) - want_uncompressed = [ - "SwitchName=slurm-root Switches=ns_blue,ns_green,ns_pink", - "SwitchName=ns_blue Nodes=m22-blue-[0-6]", - "SwitchName=ns_green 
Nodes=m22-green-[0-4]", + want_uncompressed = [ + #NOTE: the switch names are not unique, it's not valid content for topology.conf + # The uniquefication and compression of names are done in the compress() method + "SwitchName=slurm-root Switches=a,b,ns_blue,ns_green,ns_pink", + # "physical" topology + 'SwitchName=a Switches=a,b', + 'SwitchName=a Nodes=m22-blue-[0-1],m22-green-3', + 'SwitchName=b Nodes=m22-blue-2', + 'SwitchName=b Switches=a', + 'SwitchName=a Nodes=m22-blue-3', + # topology "by nodeset" + "SwitchName=ns_blue Nodes=m22-blue-[4-6]", + "SwitchName=ns_green Nodes=m22-green-[0-2,4]", "SwitchName=ns_pink Nodes=m22-pink-[0-3]", + # TPU topology "SwitchName=tpu-root Switches=ns_bold,ns_slim", "SwitchName=ns_bold Switches=bold-[0-3]", "SwitchName=bold-0 Nodes=m22-bold-[0-2]", @@ -77,10 +98,18 @@ def tpu_se(ns: TstNodeset) -> TstTPU: compressed = uncompressed.compress() want_compressed = [ - "SwitchName=s0 Switches=s0_[0-2]", - "SwitchName=s0_0 Nodes=m22-blue-[0-6]", - "SwitchName=s0_1 Nodes=m22-green-[0-4]", - "SwitchName=s0_2 Nodes=m22-pink-[0-3]", + "SwitchName=s0 Switches=s0_[0-4]", # root + # "physical" topology + 'SwitchName=s0_0 Switches=s0_0_[0-1]', # /a + 'SwitchName=s0_0_0 Nodes=m22-blue-[0-1],m22-green-3', # /a/a + 'SwitchName=s0_0_1 Nodes=m22-blue-2', # /a/b + 'SwitchName=s0_1 Switches=s0_1_0', # /b + 'SwitchName=s0_1_0 Nodes=m22-blue-3', # /b/a + # topology "by nodeset" + "SwitchName=s0_2 Nodes=m22-blue-[4-6]", + "SwitchName=s0_3 Nodes=m22-green-[0-2,4]", + "SwitchName=s0_4 Nodes=m22-pink-[0-3]", + # TPU topology "SwitchName=s1 Switches=s1_[0-1]", "SwitchName=s1_0 Switches=s1_0_[0-3]", "SwitchName=s1_0_0 Nodes=m22-bold-[0-2]", @@ -95,15 +124,21 @@ def tpu_se(ns: TstNodeset) -> TstTPU: assert open(cfg.output_dir + "/cloud_topology.conf").read() == want_written summary_got = json.loads(open(cfg.output_dir + "/cloud_topology.summary.json").read()) - assert summary_got == { + + assert summary_got == { "down_nodes": unordered( - [f"m22-blue-{i}" for i in range(7)] + - [f"m22-green-{i}" for i in range(5)] + + [f"m22-blue-{i}" for i in (4,5,6)] + + [f"m22-green-{i}" for i in (0,1,2,4)] + [f"m22-pink-{i}" for i in range(4)]), "tpu_nodes": unordered( [f"m22-bold-{i}" for i in range(9)] + [f"m22-slim-{i}" for i in range(3)]), - "physical_host": {}, + 'physical_host': { + 'm22-blue-0': '/a/a/a', + 'm22-blue-1': '/a/a/b', + 'm22-blue-2': '/a/b/a', + 'm22-blue-3': '/b/a/a', + 'm22-green-3': '/a/a/c'}, } @@ -116,12 +151,33 @@ def test_gen_topology_conf_update(): output_dir=tempfile.mkdtemp(), ) lkp = util.Lookup(cfg) + lkp.instances = lambda: {} # no instances + + # initial generation - reconfigure assert conf.gen_topology_conf(lkp) == True + # add node: node_count_static 2 -> 3 - reconfigure lkp.cfg.nodeset["c"].node_count_static = 3 assert conf.gen_topology_conf(lkp) == True - lkp.cfg.nodeset["c"].node_count_static = 1 + # remove node: node_count_static 3 -> 2 - no reconfigure + lkp.cfg.nodeset["c"].node_count_static = 2 + assert conf.gen_topology_conf(lkp) == False + + # set empty physicalHost - no reconfigure + lkp.instances = lambda: { n.name: n for n in [TstInstance("m22-green-0", physicalHost="")]} + assert conf.gen_topology_conf(lkp) == False + + # set physicalHost - reconfigure + lkp.instances = lambda: { n.name: n for n in [TstInstance("m22-green-0", physicalHost="/a/b/c")]} + assert conf.gen_topology_conf(lkp) == True + + # change physicalHost - reconfigure + lkp.instances = lambda: { n.name: n for n in [TstInstance("m22-green-0", physicalHost="/a/b/z")]} + assert 
conf.gen_topology_conf(lkp) == True + + # shut down node - no reconfigure + lkp.instances = lambda: {} assert conf.gen_topology_conf(lkp) == False diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 4a20c1b40a..da62bf5c33 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, List, Tuple, Optional, Any +from typing import Iterable, List, Tuple, Optional, Any, Dict import argparse import base64 import collections @@ -1645,9 +1645,7 @@ def slurm_node(self, nodename): return self.slurm_nodes().get(nodename) @lru_cache(maxsize=1) - def instances(self, project=None, slurm_cluster_name=None): - slurm_cluster_name = slurm_cluster_name or self.cfg.slurm_cluster_name - project = project or self.project + def instances(self) -> Dict[str, object]: instance_information_fields = [ "advancedMachineFeatures", "cpuPlatform", @@ -1685,8 +1683,8 @@ def instances(self, project=None, slurm_cluster_name=None): if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.register_instance_information_fields( lkp=lookup(), - project=project, - slurm_cluster_name=slurm_cluster_name, + project=self.project, + slurm_cluster_name=self.cfg.slurm_cluster_name, instance_information_fields=instance_information_fields, ) @@ -1698,9 +1696,9 @@ def instances(self, project=None, slurm_cluster_name=None): instance_information_fields = sorted(set(instance_information_fields)) instance_fields = ",".join(instance_information_fields) fields = f"items.zones.instances({instance_fields}),nextPageToken" - flt = f"labels.slurm_cluster_name={slurm_cluster_name} AND name:{slurm_cluster_name}-*" + flt = f"labels.slurm_cluster_name={self.cfg.slurm_cluster_name} AND name:{self.cfg.slurm_cluster_name}-*" act = self.compute.instances() - op = act.aggregatedList(project=project, fields=fields, filter=flt) + op = act.aggregatedList(project=self.project, fields=fields, filter=flt) def properties(inst): """change instance properties to a preferred format""" @@ -1731,11 +1729,8 @@ def properties(inst): op = act.aggregatedList_next(op, result) return instances - def instance(self, instance_name, project=None, slurm_cluster_name=None): - instances = self.instances( - project=project, slurm_cluster_name=slurm_cluster_name - ) - return instances.get(instance_name) + def instance(self, instance_name: str) -> Optional[object]: + return self.instances().get(instance_name) @lru_cache() def reservation(self, name: str, zone: str) -> object: From c9c496cfd4eadb24292950b07b5b542c68d3fce0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 17 Aug 2024 01:27:14 +0000 Subject: [PATCH 106/180] Add `cloud_parameters.topology_param` that defaults to `SwitchAsNodeRank` --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/README.md | 4 ++-- .../modules/slurm_files/scripts/conf.py | 1 + .../modules/slurm_files/scripts/tests/test_conf.py | 11 ++++++++--- .../modules/slurm_files/variables.tf | 11 ++++++----- .../schedmd-slurm-gcp-v6-controller/variables.tf | 1 + 6 files changed, 19 insertions(+), 11 deletions(-) diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 2ff3621296..c8901201df 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -229,7 +229,7 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 7a5d59e3e3..a4ca8378e6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -66,7 +66,7 @@ No modules. | [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | | [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be ran on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | @@ -84,7 +84,7 @@ No modules. | [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | | [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | -| [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | +| [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
- server\_ip : Address of the storage server.
- remote\_mount : The location in the remote instance filesystem to mount from.
- local\_mount : The location on the instance filesystem to mount to.
- fs\_type : Filesystem type (e.g. "nfs").
- mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | | [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 57e3931b46..e28b7f075e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -125,6 +125,7 @@ def get(key, default): "TreeWidth": get("tree_width", default_tree_width), "JobSubmitPlugins": "lua" if any_tpu else None, "TopologyPlugin": topology_plugin(lkp), + "TopologyParam": get("topology_param", "SwitchAsNodeRank"), } return dict_to_conf(conf_options, delim="\n") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 56a94ba187..0b25b0df58 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -100,7 +100,8 @@ def test_dict_to_conf(value: dict, want: str): SuspendRate=0 SuspendTimeout=300 TreeWidth=128 -TopologyPlugin=topology/tree"""), +TopologyPlugin=topology/tree +TopologyParam=SwitchAsNodeRank"""), (TstCfg( install_dir="ukulele", cloud_parameters={ @@ -110,6 +111,7 @@ def test_dict_to_conf(value: dict, want: str): "suspend_rate": None, "suspend_timeout": None, "topology_plugin": None, + "topology_param": None, "tree_width": None, }, ), @@ -121,7 +123,8 @@ def test_dict_to_conf(value: dict, want: str): SuspendRate=0 SuspendTimeout=300 TreeWidth=128 -TopologyPlugin=topology/tree"""), +TopologyPlugin=topology/tree +TopologyParam=SwitchAsNodeRank"""), (TstCfg( install_dir="ukulele", cloud_parameters={ @@ -131,6 +134,7 @@ def test_dict_to_conf(value: dict, want: str): "suspend_rate": 3, "suspend_timeout": 4, "topology_plugin": "guess", + "topology_param": "yellow", "tree_width": 5, }, ), @@ -142,7 +146,8 @@ def test_dict_to_conf(value: dict, want: str): SuspendRate=3 SuspendTimeout=4 TreeWidth=5 -TopologyPlugin=guess"""), +TopologyPlugin=guess +TopologyParam=yellow"""), ]) def test_conflines(cfg, want): assert conf.conflines(util.Lookup(cfg)) == want diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index bc0a57a486..2c01b6b579 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -272,11 +272,11 @@ EOD variable "login_network_storage" { description = < Date: Mon, 19 Aug 2024 10:48:50 +0000 Subject: [PATCH 107/180] Bump github.com/hashicorp/go-getter from 1.7.5 to 1.7.6 Bumps [github.com/hashicorp/go-getter](https://github.com/hashicorp/go-getter) from 1.7.5 to 1.7.6. - [Release notes](https://github.com/hashicorp/go-getter/releases) - [Changelog](https://github.com/hashicorp/go-getter/blob/main/.goreleaser.yml) - [Commits](https://github.com/hashicorp/go-getter/compare/v1.7.5...v1.7.6) --- updated-dependencies: - dependency-name: github.com/hashicorp/go-getter dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 7a7435fb98..d12ca04b35 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.21 require ( cloud.google.com/go/storage v1.41.0 // indirect github.com/go-git/go-git/v5 v5.12.0 - github.com/hashicorp/go-getter v1.7.5 + github.com/hashicorp/go-getter v1.7.6 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.21.0 github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d diff --git a/go.sum b/go.sum index 0c9ffbf1dc..56278d6bb1 100644 --- a/go.sum +++ b/go.sum @@ -379,8 +379,8 @@ github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+ github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= -github.com/hashicorp/go-getter v1.7.5 h1:dT58k9hQ/vbxNMwoI5+xFYAJuv6152UNvdHokfI5wE4= -github.com/hashicorp/go-getter v1.7.5/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= +github.com/hashicorp/go-getter v1.7.6 h1:5jHuM+aH373XNtXl9TNTUH5Qd69Trve11tHIrB+6yj4= +github.com/hashicorp/go-getter v1.7.6/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek= From 1f3f879f00d05b66652768bb478d6e167652c92d Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 15 Aug 2024 06:50:00 +0000 Subject: [PATCH 108/180] Perform cleanup per nodeset --- .../schedmd-slurm-gcp-v6-controller/README.md | 8 +- .../controller.tf | 5 -- .../modules/cleanup_compute/README.md | 41 ++++++++++ .../cleanup_compute/cleanup_compute.sh | 79 +++++++++++++++++++ .../cleanup_compute/main.tf} | 22 +++--- .../modules/cleanup_compute/variables.tf | 66 ++++++++++++++++ .../modules/cleanup_compute/versions.tf | 27 +++++++ .../partition.tf | 31 ++++++++ .../scripts/cleanup_compute.sh | 61 -------------- .../variables.tf | 2 +- .../versions.tf | 6 -- 11 files changed, 257 insertions(+), 91 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh rename community/modules/scheduler/schedmd-slurm-gcp-v6-controller/{cleanup.tf => modules/cleanup_compute/main.tf} (61%) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/versions.tf delete mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 2ff3621296..3a6c28c29d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -181,14 +181,12 @@ limitations under the License. 
|------|---------| | [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | >= 4.84 | -| [null](#requirement\_null) | >= 3.0 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | >= 4.84 | -| [null](#provider\_null) | >= 3.0 | ## Modules @@ -196,6 +194,8 @@ limitations under the License. |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | +| [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | +| [nodeset\_tpu\_cleanup](#module\_nodeset\_tpu\_cleanup) | ./modules/cleanup_compute | n/a | | [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | @@ -213,8 +213,6 @@ limitations under the License. | [google_secret_manager_secret_version.cloudsql_version](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | | [google_storage_bucket_iam_binding.legacy_readers](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_iam_binding) | resource | | [google_storage_bucket_iam_binding.viewers](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_iam_binding) | resource | -| [null_resource.cleanup_compute](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.cleanup_compute_depenencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | @@ -245,7 +243,7 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes and controller will be destroyed. | `bool` | `true` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed. | `bool` | `true` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 4e9e74c500..93df4491b9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -117,11 +117,6 @@ module "slurm_controller_instance" { metadata = var.metadata labels = merge(local.labels, local.files_cs_labels) - - depends_on = [ - # Ensure that controller is destroyed BEFORE doing cleanup - null_resource.cleanup_compute[0], - ] } # SECRETS: CLOUDSQL diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md new file mode 100644 index 0000000000..0405c09f78 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md @@ -0,0 +1,41 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [null](#requirement\_null) | >= 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [null](#provider\_null) | >= 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [null_resource.dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [null_resource.script](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed. | `bool` | n/a | yes | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
| n/a | yes | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | n/a | yes | +| [nodeset](#input\_nodeset) | Nodeset to cleanup |
object({
nodeset_name = string
subnetwork_self_link = string
additional_networks = list(object({
subnetwork = string
}))
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Project ID | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster | `string` | n/a | yes | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh new file mode 100755 index 0000000000..52d50dfae0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e -o pipefail + +project="$1" +cluster_name="$2" +nodeset_name="$3" +universe_domain="$4" +compute_endpoint_version="$5" +gcloud_dir="$6" + +if [[ -z "${project}" || -z "${cluster_name}" || -z "${nodeset_name}" || -z "${universe_domain}" || -z "${compute_endpoint_version}" ]]; then + echo "Usage: $0 " + exit 1 +fi + +if [[ -n "${gcloud_dir}" ]]; then + export PATH="$gcloud_dir:$PATH" +fi + +export CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE="https://www.${universe_domain}/compute/${compute_endpoint_version}/" +export CLOUDSDK_CORE_PROJECT="${project}" + +if ! type -P gcloud 1>/dev/null; then + echo "gcloud is not available and your compute resources are not being cleaned up" + echo "https://console.cloud.google.com/compute/instances?project=${project}" + exit 1 +fi + +echo "Deleting compute nodes" +node_filter="name:${cluster_name}-${nodeset_name}-* labels.slurm_cluster_name=${cluster_name} AND labels.slurm_instance_role=compute" + +tmpfile=$(mktemp) +running_nodes_filter="${node_filter} AND (status!=STOPPING AND status!=TERMINATED)" +# List all currently running instances and attempt to delete them +gcloud compute instances list --format="value(selfLink)" --filter="${running_nodes_filter}" >"$tmpfile" +# Do 10 instances at a time +while batch="$(head -n 10)" && [[ ${#batch} -gt 0 ]]; do + nodes=$(echo "$batch" | paste -sd " " -) # concat into a single space-separated line + # The lack of quotes around ${nodes} is intentional and causes each new space-separated "word" to + # be treated as independent arguments. 
See PR#2523 + # shellcheck disable=SC2086 + gcloud compute instances delete --quiet ${nodes} || echo "Failed to delete some instances" +done <"$tmpfile" # have to use a temp file, since `< <(gcloud ...)` doesn't work nicely with `head` +rm -f "$tmpfile" + +# In case if controller tries to delete the nodes as well, +# wait until nodes in STOPPING state are deleted, before deleting the resource policies +stopping_nodes_filter="${node_filter} AND status=STOPPING" +while true; do + node=$(gcloud compute instances list --format="value(name)" --filter="${stopping_nodes_filter}" --limit=1) + if [[ -z "${node}" ]]; then + break + fi + echo "Waiting for instances to be deleted: ${node}" + sleep 5 +done + +echo "Deleting resource policies" +policies_filter="name:${cluster_name}-${nodeset_name}-slurmgcp-managed-*" +gcloud compute resource-policies list --format="value(selfLink)" --filter="${policies_filter}" | while read -r line; do + echo "Deleting resource policy: $line" + gcloud compute resource-policies delete --quiet "${line}" || { + echo "Failed to delete resource policy: $line" + } +done diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf similarity index 61% rename from community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf rename to community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf index e1fdf74611..e87a853e74 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf @@ -14,36 +14,32 @@ locals { cleanup_dependencies_agg = flatten([ - [ - for ns in var.nodeset : [ - ns.subnetwork_self_link, - [for an in ns.additional_networks : an.subnetwork] - ] - ], - [for ns in var.nodeset_tpu : ns.subnetwork], - ]) + var.nodeset.subnetwork_self_link, + var.nodeset.additional_networks[*].subnetwork]) } -resource "null_resource" "cleanup_compute_depenencies" { +# Can not use variadic list in `depends_on`, wrap it into a collection of `null_resource` +resource "null_resource" "dependencies" { count = length(local.cleanup_dependencies_agg) } -resource "null_resource" "cleanup_compute" { +resource "null_resource" "script" { count = var.enable_cleanup_compute ? 
1 : 0 triggers = { project_id = var.project_id - cluster_name = local.slurm_cluster_name + cluster_name = var.slurm_cluster_name + nodeset_name = var.nodeset.nodeset_name universe_domain = var.universe_domain compute_endpoint_version = var.endpoint_versions.compute gcloud_path_override = var.gcloud_path_override } provisioner "local-exec" { - command = "/bin/bash ${path.module}/scripts/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" + command = "/bin/bash ${path.module}/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.nodeset_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" when = destroy } # Ensure that clean up is done before attempt to delete the networks - depends_on = [null_resource.cleanup_compute_depenencies] + depends_on = [null_resource.dependencies] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf new file mode 100644 index 0000000000..a40aab0f26 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf @@ -0,0 +1,66 @@ +/** + * Copyright (C) SchedMD LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + type = string + description = "Project ID" +} + + +variable "slurm_cluster_name" { + type = string + description = "Name of the Slurm cluster" +} + +variable "enable_cleanup_compute" { + description = < " - exit 1 -fi - -if [[ -n "${gcloud_dir}" ]]; then - export PATH="$gcloud_dir:$PATH" -fi - -export CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE="https://www.${universe_domain}/compute/${compute_endpoint_version}/" - -if ! type -P gcloud 1>/dev/null; then - echo "gcloud is not available and your compute resources are not being cleaned up" - echo "https://console.cloud.google.com/compute/instances?project=${project}" - exit 1 -fi - -echo "Deleting compute nodes" -node_filter="labels.slurm_cluster_name=${cluster_name} AND labels.slurm_instance_role=compute" -while true; do - nodes=$(gcloud compute instances list --project "${project}" --format="value(selfLink)" --filter="${node_filter}" --limit=10 | paste -sd " " -) - if [[ -z "${nodes}" ]]; then - break - fi - # The lack of quotes is intentional and causes each new space-separated "word" to - # be treated as independent arguments. 
See PR#2523 - # shellcheck disable=SC2086 - gcloud compute instances delete --quiet ${nodes} -done - -echo "Deleting resource policies" -policies_filter="name:${cluster_name}-slurmgcp-managed-*" -gcloud compute resource-policies list --project "${project}" --format="value(selfLink)" --filter="${policies_filter}" | while read -r line; do - echo "Deleting resource policy: $line" - gcloud compute resource-policies delete --quiet "${line}" || { - echo "Failed to delete resource policy: $line" - } -done diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 144d5ff0d2..2ec109ae90 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -382,7 +382,7 @@ Enables automatic cleanup of compute nodes and resource policies (e.g. placement groups) managed by this module, when cluster is destroyed. *WARNING*: Toggling this off will impact the running workload. -Deployed compute nodes and controller will be destroyed. +Deployed compute nodes will be destroyed. EOD type = bool default = true diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 7ab1c46f14..1a0fdfa215 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -22,12 +22,6 @@ terraform { source = "hashicorp/google" version = ">= 4.84" } - - null = { - source = "hashicorp/null" - version = ">= 3.0" - } - } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.38.0" From 9ffad9d60c49bb51f829331048f232bd3b937aea Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 19 Aug 2024 17:09:03 +0000 Subject: [PATCH 109/180] Address comments --- .../modules/cleanup_compute/main.tf | 2 +- .../cleanup_compute/{ => scripts}/cleanup_compute.sh | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) rename community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/{ => scripts}/cleanup_compute.sh (91%) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf index e87a853e74..05d9b91cf1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf @@ -36,7 +36,7 @@ resource "null_resource" "script" { } provisioner "local-exec" { - command = "/bin/bash ${path.module}/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.nodeset_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" + command = "/bin/bash ${path.module}/scripts/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.nodeset_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" when = destroy } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh similarity index 91% rename from community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh rename to community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh index 52d50dfae0..51352f5989 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/cleanup_compute.sh +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh @@ -22,7 +22,7 @@ universe_domain="$4" compute_endpoint_version="$5" gcloud_dir="$6" -if [[ -z "${project}" || -z "${cluster_name}" || -z "${nodeset_name}" || -z "${universe_domain}" || -z "${compute_endpoint_version}" ]]; then +if [[ $# -ne 5 ]]; then echo "Usage: $0 " exit 1 fi @@ -43,7 +43,9 @@ fi echo "Deleting compute nodes" node_filter="name:${cluster_name}-${nodeset_name}-* labels.slurm_cluster_name=${cluster_name} AND labels.slurm_instance_role=compute" -tmpfile=$(mktemp) +tmpfile=$(mktemp) # have to use a temp file, since `< <(gcloud ...)` doesn't work nicely with `head` +trap 'rm -f "$tmpfile"' EXIT + running_nodes_filter="${node_filter} AND (status!=STOPPING AND status!=TERMINATED)" # List all currently running instances and attempt to delete them gcloud compute instances list --format="value(selfLink)" --filter="${running_nodes_filter}" >"$tmpfile" @@ -54,8 +56,7 @@ while batch="$(head -n 10)" && [[ ${#batch} -gt 0 ]]; do # be treated as independent arguments. See PR#2523 # shellcheck disable=SC2086 gcloud compute instances delete --quiet ${nodes} || echo "Failed to delete some instances" -done <"$tmpfile" # have to use a temp file, since `< <(gcloud ...)` doesn't work nicely with `head` -rm -f "$tmpfile" +done <"$tmpfile" # In case if controller tries to delete the nodes as well, # wait until nodes in STOPPING state are deleted, before deleting the resource policies From 43f634bbb92f731dfe91a46d383f594744d9be27 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 20 Aug 2024 00:32:18 +0000 Subject: [PATCH 110/180] job-template removed from a3 examples --- examples/gke-a3-highgpu.yaml | 10 ---------- examples/gke-a3-megagpu.yaml | 10 ---------- modules/compute/gke-node-pool/README.md | 4 ++-- modules/compute/gke-node-pool/variables.tf | 3 ++- modules/network/multivpc/README.md | 1 - modules/network/multivpc/outputs.tf | 5 ----- 6 files changed, 4 insertions(+), 29 deletions(-) diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 0fb28d9098..049b37f27f 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -72,13 +72,3 @@ deployment_groups: # 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload # 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl # 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests - - - id: job-template - source: modules/compute/gke-job-template - use: [a3-highgpu_pool] - settings: - image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 - command: - - nvidia-smi - node_count: 2 - outputs: [instructions] diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 0ad89334d9..eaa056c547 
100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -72,13 +72,3 @@ deployment_groups: # 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload # 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl # 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests - - - id: job-template - source: modules/compute/gke-job-template - use: [a3-megagpu_pool] - settings: - image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 - command: - - nvidia-smi - node_count: 2 - outputs: [instructions] diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 373e00c634..15781f7ffc 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -276,8 +276,8 @@ No modules. | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE chooses the default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE chooses the default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 4f8f33d330..8acc7cca94 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -93,6 +93,7 @@ variable "local_ssd_count_ephemeral_storage" { description = <<-EOT The number of local SSDs to attach to each node to back ephemeral storage. Uses NVMe interfaces. Must be supported by `machine_type`. + When set to null, GKE decides about default value. [See above](#local-ssd-storage) for more info. EOT type = number @@ -103,6 +104,7 @@ variable "local_ssd_count_nvme_block" { description = <<-EOT The number of local SSDs to attach to each node to back block storage. Uses NVMe interfaces. Must be supported by `machine_type`. + When set to null, GKE decides about default value. [See above](#local-ssd-storage) for more info. EOT @@ -110,7 +112,6 @@ variable "local_ssd_count_nvme_block" { default = null } - variable "autoscaling_total_min_nodes" { description = "Total minimum number of nodes in the NodePool." type = number diff --git a/modules/network/multivpc/README.md b/modules/network/multivpc/README.md index e254bc4888..f0f55175c9 100644 --- a/modules/network/multivpc/README.md +++ b/modules/network/multivpc/README.md @@ -126,7 +126,6 @@ limitations under the License. | Name | Description | |------|-------------| | [additional\_networks](#output\_additional\_networks) | Network interfaces for each subnetwork created by this module | -| [enable\_multi\_networking](#output\_enable\_multi\_networking) | Enables multi-networking by setting the corresponding variable to true on supported modules (e.g., gke\_cluster). | | [network\_ids](#output\_network\_ids) | IDs of the new VPC network | | [network\_names](#output\_network\_names) | Names of the new VPC networks | | [network\_self\_links](#output\_network\_self\_links) | Self link of the new VPC network | diff --git a/modules/network/multivpc/outputs.tf b/modules/network/multivpc/outputs.tf index 6596f543be..c838faa67c 100644 --- a/modules/network/multivpc/outputs.tf +++ b/modules/network/multivpc/outputs.tf @@ -48,8 +48,3 @@ output "subnetwork_addresses" { description = "IP address range of the primary subnetwork" value = module.vpcs[*].subnetwork_address } - -output "enable_multi_networking" { - description = "Enables multi-networking by setting the corresponding variable to true on supported modules (e.g., gke_cluster)." - value = true -} From 3aee18aa7f5998a2e2ff82720a55928c8292c963 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 20 Aug 2024 00:59:14 +0000 Subject: [PATCH 111/180] some descriptions updated --- examples/README.md | 7 ++++++- examples/gke-a3-highgpu.yaml | 2 +- examples/gke-a3-megagpu.yaml | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/README.md b/examples/README.md index b1816c1be8..ffa3cf83dc 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1542,10 +1542,15 @@ cleaned up when the job is deleted. This blueprint shows how to provision a GKE cluster with A3 Mega machines in the toolkit. 
After provisioning the cluster and the nodepool, we need to do the following: + 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl + 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector + 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload + 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl + 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests > [!Note] @@ -1563,7 +1568,7 @@ After provisioning the cluster and the nodepool, we need to do the following: This blueprint shows how to provision a GKE cluster with A3 High machines in the toolkit. -After provisioning the cluster and the nodepool, we need to do the following: +After provisioning the cluster and the nodepool, we need to do the [following](#gke-a3-high-setup-steps): 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#gpudirect-tcpx_2 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 049b37f27f..c370bb9982 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -66,7 +66,7 @@ deployment_groups: autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] -# We need to do the following here: +# We need to take the following steps as explained here: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml-- # 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl # 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector # 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index eaa056c547..bc05d57f22 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -66,7 +66,7 @@ deployment_groups: autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] -# We need to do the following here: +# We need to take the following steps as explained here: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml-- # 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl # 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector # 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload From ab408ea33621e7cb8819980c1b8de2e6fe057721 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Tue, 20 Aug 2024 08:19:06 +0000 Subject: [PATCH 112/180] Fix installation on Rocky and Redhat --- .../parallelstore/scripts/install-daos-client.sh | 6 ++++-- .../scripts/install-daos-client.sh | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index 4c5204859e..a98507e890 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -30,7 +30,7 @@ else # Install the DAOS client library # The following commands should be executed on each client vm. ## For Rocky linux 8. - if grep -q "ID=\"(rocky|rhel)\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then + if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then # 1) Add the Parallelstore package repository tee /etc/yum.repos.d/parallelstore-v2-6-el8.repo < Date: Tue, 20 Aug 2024 09:22:23 +0000 Subject: [PATCH 113/180] some docs and descriptions updated --- examples/README.md | 17 ++++++----------- examples/gke-a3-highgpu.yaml | 2 +- examples/gke-a3-megagpu.yaml | 2 +- modules/compute/gke-node-pool/README.md | 4 ++-- modules/scheduler/gke-cluster/README.md | 4 ++-- modules/scheduler/gke-cluster/variables.tf | 4 ++-- 6 files changed, 14 insertions(+), 19 deletions(-) diff --git a/examples/README.md b/examples/README.md index ffa3cf83dc..1459b4b432 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1542,16 +1542,11 @@ cleaned up when the job is deleted. This blueprint shows how to provision a GKE cluster with A3 Mega machines in the toolkit. After provisioning the cluster and the nodepool, we need to do the following: - -1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl - -2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector - -3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload - -4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl - -5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests +1. Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +2. Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +3. Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +4. Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +5. Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests > [!Note] > The Kubernetes API server will only allow requests from authorized networks. 
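As a rough sketch of steps 1 and 2 above: the commands below assume a cluster name, region, project, and local manifest file names that are placeholders rather than values taken from this patch; use the exact installer manifests from the GPUDirect documentation linked in each step.

```bash
# Minimal post-deployment sketch (assumptions marked inline).
CLUSTER_NAME="gke-a3-mega"                        # assumed cluster/deployment name
REGION="us-central1"                              # assumed region
PROJECT_ID="my-project"                           # assumed project
NCCL_INSTALLER_MANIFEST="nccl-installer.yaml"     # assumed: installer manifest from the step 1 docs
NRI_INJECTOR_MANIFEST="nri-device-injector.yaml"  # assumed: injector manifest from the step 2 docs

# Point kubectl at the newly created cluster.
gcloud container clusters get-credentials "${CLUSTER_NAME}" \
  --region "${REGION}" --project "${PROJECT_ID}"

# Step 1: install the GPUDirect binary and configure NCCL.
kubectl apply -f "${NCCL_INSTALLER_MANIFEST}"

# Step 2: deploy the NRI device injector plugin.
kubectl apply -f "${NRI_INJECTOR_MANIFEST}"

# Confirm the installer and injector pods come up before moving on to steps 3 to 5.
kubectl get pods -n kube-system
```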
@@ -1568,7 +1563,7 @@ After provisioning the cluster and the nodepool, we need to do the following: This blueprint shows how to provision a GKE cluster with A3 High machines in the toolkit. -After provisioning the cluster and the nodepool, we need to do the [following](#gke-a3-high-setup-steps): +After provisioning the cluster and the nodepool, we need to do the following: 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#gpudirect-tcpx_2 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index c370bb9982..abcc780df1 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -66,7 +66,7 @@ deployment_groups: autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] -# We need to take the following steps as explained here: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml-- +# We need to do the following here after deployment (https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml--): # 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl # 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector # 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index bc05d57f22..bbcedfeb83 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -66,7 +66,7 @@ deployment_groups: autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] -# We need to take the following steps as explained here: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml-- +# # We need to do the following here after deployment (https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml--): # 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl # 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector # 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 15781f7ffc..0f978987b3 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -75,8 +75,8 @@ kernel modules to be loaded. To maximize GPU network bandwidth, nodepools accept multiple VPCs. Pass a multivpc module to gke-node-pool module, and [take these steps] (https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl) to install GPUDirect, configure NCCL, use recommended settings, and add GPUDirect to your pods. 
-> **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. Passing the multivpc module to a gke-cluster module enables multi networking on the cluster creation. -> Passing the multivpc module to a gke-cluster or pre-existing-gke-cluster module [creates required network objects](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment) on the cluster for multi networking. +> **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. When gke-cluster depends on multivpc (with the use keyword), multi networking will be automatically enabled on the cluster creation. +> When gke-cluster or pre-existing-gke-cluster depends on multivpc (with the use keyword), the [network objects](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment) required for multi networking will be created on the cluster. ### GPUs Examples diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 71bce049c7..e6e7f6f6de 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -151,11 +151,11 @@ limitations under the License. | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. Used in the GKE cluster name by default and can be configured with `prefix_with_deployment_name`. | `string` | n/a | yes | -| [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. | `bool` | `null` | no | +| [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. If null, will default to false unless using multi-networking, in which case it will default to true | `bool` | `null` | no | | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. | `bool` | `false` | no | | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. 
| `bool` | `false` | no | -| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). | `bool` | `null` | no | +| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). If null, will determine state based on if additional\_networks are passed in. | `bool` | `null` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index ab1f465392..e91be6b297 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -260,7 +260,7 @@ variable "authenticator_security_group" { } variable "enable_dataplane_v2" { - description = "Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters." + description = "Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. If null, will default to false unless using multi-networking, in which case it will default to true" type = bool default = null } @@ -298,7 +298,7 @@ variable "service_account" { } variable "enable_multi_networking" { - description = "Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en)." + description = "Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). If null, will determine state based on if additional_networks are passed in." 
type = bool default = null } From 72e9908086b00956bf7f071cf4b604d1a43ebe25 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 20 Aug 2024 16:33:26 +0000 Subject: [PATCH 114/180] enable_multi_networking and enable_dataplane_v2 logic updated in gke-cluster module --- examples/README.md | 10 +++++----- examples/gke-a3-highgpu.yaml | 7 +------ examples/gke-a3-megagpu.yaml | 7 +------ modules/scheduler/gke-cluster/main.tf | 4 ++-- 4 files changed, 9 insertions(+), 19 deletions(-) diff --git a/examples/README.md b/examples/README.md index 1459b4b432..efccaed9f5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1564,11 +1564,11 @@ After provisioning the cluster and the nodepool, we need to do the following: This blueprint shows how to provision a GKE cluster with A3 High machines in the toolkit. After provisioning the cluster and the nodepool, we need to do the following: -1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#gpudirect-tcpx_2 -2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector -3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload -4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl -5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests +1. Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#gpudirect-tcpx_2 +2. Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +3. Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +4. Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +5. Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests > [!Note] > The Kubernetes API server will only allow requests from authorized networks. 
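The note about authorized networks above usually translates into a single gcloud call. A minimal sketch, assuming an example cluster name, region, and client CIDR (all three are assumptions; substitute your own values):

```bash
# Hedged example: allow the assumed workstation range 203.0.113.0/29 to reach
# the cluster's Kubernetes API server so kubectl requests are accepted.
gcloud container clusters update gke-a3-mega \
  --region us-central1 \
  --enable-master-authorized-networks \
  --master-authorized-networks 203.0.113.0/29
```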
diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index abcc780df1..de25631fa4 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -66,9 +66,4 @@ deployment_groups: autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] -# We need to do the following here after deployment (https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml--): -# 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl -# 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector -# 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload -# 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl -# 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests +# We need to do the following here after deployment: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml-- diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index bbcedfeb83..51dbcdb0fc 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -66,9 +66,4 @@ deployment_groups: autoscaling_total_min_nodes: 2 zones: [$(vars.zone)] -# # We need to do the following here after deployment (https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml--): -# 1- Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl -# 2- Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector -# 3- Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload -# 4- Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl -# 5- Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests +# We need to do the following here after deployment: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml-- diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index de20423b71..804b07e929 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -32,10 +32,10 @@ locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email # additional VPCs enable multi networking - derived_enable_multi_networking = length(var.additional_networks) > 0 ? true : coalesce(var.enable_multi_networking, false) + derived_enable_multi_networking = coalesce(var.enable_multi_networking, length(var.additional_networks) > 0 ? true : false) # multi networking needs enabled Dataplane v2 - derived_enable_dataplane_v2 = local.derived_enable_multi_networking ? 
true : coalesce(var.enable_dataplane_v2, false) + derived_enable_dataplane_v2 = coalesce(var.enable_dataplane_v2, local.derived_enable_multi_networking) } data "google_compute_default_service_account" "default_sa" { From 9672dd7d4487ea28130add0299f475f3fd75c3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Tue, 20 Aug 2024 16:53:34 +0000 Subject: [PATCH 115/180] Refactor to use variables, fix installation on RHEL --- .../scripts/install-daos-client.sh | 42 ++++++++++++------- .../scripts/install-daos-client.sh | 42 ++++++++++++------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index a98507e890..6363562e6b 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -22,18 +22,21 @@ for arg in "$@"; do fi done +OS_ID=$(awk -F '=' '/^ID=/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_VERSION=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_VERSION_MAJOR=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g' -e 's/\..*$//') + if [ -x /bin/daos ]; then echo "DAOS already installed" daos version else - # Install the DAOS client library # The following commands should be executed on each client vm. - ## For Rocky linux 8. - if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then - - # 1) Add the Parallelstore package repository - tee /etc/yum.repos.d/parallelstore-v2-6-el8.repo < Date: Tue, 20 Aug 2024 11:43:14 -0700 Subject: [PATCH 116/180] minor doc update in ps script --- .../file-system/parallelstore/scripts/install-daos-client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index 6363562e6b..7095b8228f 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -79,7 +79,7 @@ EOF apt install -y daos-client else - echo "Unsupported operating system ${OS_ID} ${OS_VERSION_MAJOR}. This script only supports Rocky Linux 8, Ubuntu 22.04, and Debian 12." + echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 fi fi From bb6f3191d0c67a8fec95df652ffdc4c510daaddc Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 20 Aug 2024 11:43:50 -0700 Subject: [PATCH 117/180] minor doc update in pre-existing ps script --- .../pre-existing-network-storage/scripts/install-daos-client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index 6363562e6b..7095b8228f 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -79,7 +79,7 @@ EOF apt install -y daos-client else - echo "Unsupported operating system ${OS_ID} ${OS_VERSION_MAJOR}. This script only supports Rocky Linux 8, Ubuntu 22.04, and Debian 12." + echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. 
This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 fi fi From 4c337697e284a46889ba8ba860c153198265eb13 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 17 Aug 2024 01:42:05 +0000 Subject: [PATCH 118/180] SurmGCP. Bump version `6.6.1 -> 6.6.2` --- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 12 ++++++------ .../schedmd-slurm-gcp-v6-controller/controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 864b8933ad..e3b9a353ff 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index f064171b67..5f692db27f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -61,7 +61,7 @@ data "google_compute_default_service_account" "default" { module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 3c47ea0281..30ee38d084 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -196,13 +196,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_tpu\_cleanup](#module\_nodeset\_tpu\_cleanup) | ./modules/cleanup_compute | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 93df4491b9..5f0e681ee1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -44,7 +44,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" project_id = var.project_id region = var.region @@ -100,7 +100,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.1" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.2" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index d9ce81db36..e7747b72ff 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.2" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 41282816e8..c3448df181 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" for_each = local.nodeset_map project_id = var.project_id @@ -100,7 +100,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.6.2" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 2706b8eb60..04d78aa9d3 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml From 105962d963a2de9dd4e0da84780f3bbb72e5127b Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 20 Aug 2024 19:53:14 +0000 Subject: [PATCH 119/180] preconditions in gke-cluster updated. 
--- modules/scheduler/gke-cluster/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 804b07e929..3faf514d10 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -32,7 +32,7 @@ locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email # additional VPCs enable multi networking - derived_enable_multi_networking = coalesce(var.enable_multi_networking, length(var.additional_networks) > 0 ? true : false) + derived_enable_multi_networking = coalesce(var.enable_multi_networking, length(var.additional_networks) > 0) # multi networking needs enabled Dataplane v2 derived_enable_dataplane_v2 = coalesce(var.enable_dataplane_v2, local.derived_enable_multi_networking) @@ -177,12 +177,12 @@ resource "google_container_cluster" "gke_cluster" { node_config ] precondition { - condition = !(!coalesce(var.enable_dataplane_v2, true) && (coalesce(var.enable_multi_networking, false) || length(var.additional_networks) > 0)) + condition = !(!coalesce(var.enable_dataplane_v2, true) && local.derived_enable_multi_networking) error_message = "'enable_dataplane_v2' cannot be false when enabling multi networking." } precondition { condition = !(!coalesce(var.enable_multi_networking, true) && length(var.additional_networks) > 0) - error_message = "'enable_multi_networking' cannot be false when passing multivpc module." + error_message = "'enable_multi_networking' cannot be false when using multivpc module, which passes additional_networks." } } From a2b45f6276c71f392c21790a18b766f38b6735bf Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 20 Aug 2024 21:12:15 +0000 Subject: [PATCH 120/180] positioning of validation and module source substituition steps modified. --- pkg/config/config.go | 14 ++++++++++++++ pkg/config/expand.go | 33 ++++++++++++++++----------------- pkg/inspect/list.go | 4 ++++ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 7e3d7d99a8..afd64f04b7 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -349,6 +349,7 @@ func (bp *Blueprint) Expand() error { errs := (&Errors{}). Add(checkStringLiterals(bp)). Add(bp.checkBlueprintName()). + Add(bp.checkToolkitModulesUrlAndVersion()). Add(checkProviders(Root.Provider, bp.TerraformProviders)) if errs.Any() { return *errs @@ -679,6 +680,19 @@ func (bp *Blueprint) checkBlueprintName() error { return nil } +// checkToolkitModulesUrlAndVersion returns an error if either +// toolkit_modules_url or toolkit_modules_version is +// exclsuively supplied (i.e., one is present, but the other is missing). 
+func (bp *Blueprint) checkToolkitModulesUrlAndVersion() error { + if bp.ToolkitModulesURL == "" && bp.ToolkitModulesVersion != "" { + return fmt.Errorf("toolkit_modules_url must be provided when toolkit_modules_version is specified") + } + if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion == "" { + return fmt.Errorf("toolkit_modules_version must be provided when toolkit_modules_url is specified") + } + return nil +} + // Check that all references in expressions are valid func (bp *Blueprint) checkReferences() error { errs := Errors{} diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 59dda0cf25..d5570a8ac0 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -17,8 +17,8 @@ package config import ( "errors" "fmt" - "strings" + "hpc-toolkit/pkg/inspect" "hpc-toolkit/pkg/modulereader" "github.com/zclconf/go-cty/cty" @@ -104,9 +104,23 @@ func (bp *Blueprint) expandVars() error { return nil } +func (bp *Blueprint) substituteModuleSources() { + for ig := range bp.Groups { + g := &bp.Groups[ig] + for im := range g.Modules { + m := &g.Modules[im] + if inspect.IsLocalModule(m.Source) { + m.Source = fmt.Sprintf("%s//%s?ref=%s&depth=1", bp.ToolkitModulesURL, m.Source, bp.ToolkitModulesVersion) + } + } + } +} + func (bp *Blueprint) expandGroups() error { bp.addKindToModules() - + if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion != "" { + bp.substituteModuleSources() + } if err := checkModulesAndGroups(*bp); err != nil { return err } @@ -144,24 +158,9 @@ func (bp Blueprint) expandGroup(gp groupPath, g *Group) error { func (bp Blueprint) expandModule(mp ModulePath, m *Module) error { bp.applyUseModules(m) bp.applyGlobalVarsInModule(m) - // Versioned Module Logic for Embedded Modules - if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion != "" { - if strings.HasPrefix(m.Source, "modules/") || strings.HasPrefix(m.Source, "community/") { - newSource, err := constructVersionedModuleSource(bp.ToolkitModulesURL, m.Source, bp.ToolkitModulesVersion) - if err != nil { - return fmt.Errorf("error constructing versioned module source: %w", err) - } - m.Source = newSource - } - } return validateModuleInputs(mp, *m, bp) } -// TODO: Add validation and error checks for baseURL and version -func constructVersionedModuleSource(baseURL, modulePath, version string) (string, error) { - return fmt.Sprintf("%s//%s?ref=%s&depth=1", baseURL, modulePath, version), nil -} - func (bp Blueprint) expandBackend(grp *Group) { // 1. DEFAULT: use TerraformBackend configuration (if supplied) // 2. 
If top-level TerraformBackendDefaults is defined, insert that diff --git a/pkg/inspect/list.go b/pkg/inspect/list.go index 1bb2f9fc73..7a5d5dc3b5 100644 --- a/pkg/inspect/list.go +++ b/pkg/inspect/list.go @@ -67,3 +67,7 @@ func LocalModules() ([]SourceAndKind, error) { } return ret, nil } + +func IsLocalModule(source string) bool { + return strings.HasPrefix(source, "modules/") || strings.HasPrefix(source, "community/modules") +} From fc837f054a44244fccd6270d01e6bb4db69f4c7d Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 20 Aug 2024 21:13:38 +0000 Subject: [PATCH 121/180] Change Parallelstore installation script to not move log folder for debian/ubuntu --- .../parallelstore/scripts/install-daos-client.sh | 10 +++++----- .../scripts/install-daos-client.sh | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index 7095b8228f..fd796bd5fd 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -90,13 +90,13 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config -# Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ -mkdir -p /var/log/daos_agent -chown daos_agent:daos_agent /var/log/daos_agent -sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config - # Start service if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION_MAJOR}" = "8" ] || [ "${OS_VERSION_MAJOR}" = "9" ]; }; then + # TODO: Update script to change default log destination folder, after daos_agent user is supported in debian and ubuntu. + # Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ + mkdir -p /var/log/daos_agent + chown daos_agent:daos_agent /var/log/daos_agent + sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config systemctl start daos_agent.service elif { [ "${OS_ID}" = "ubuntu" ] && [ "${OS_VERSION}" = "22.04" ]; } || { [ "${OS_ID}" = "debian" ] && [ "${OS_VERSION_MAJOR}" = "12" ]; }; then mkdir -p /var/run/daos_agent diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index 7095b8228f..fd796bd5fd 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -90,13 +90,13 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config -# Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ -mkdir -p /var/log/daos_agent -chown daos_agent:daos_agent /var/log/daos_agent -sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config - # Start service if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION_MAJOR}" = "8" ] || [ "${OS_VERSION_MAJOR}" = "9" ]; }; then + # TODO: Update script to change default log destination folder, after daos_agent user is supported in debian and ubuntu. 
+ # Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ + mkdir -p /var/log/daos_agent + chown daos_agent:daos_agent /var/log/daos_agent + sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config systemctl start daos_agent.service elif { [ "${OS_ID}" = "ubuntu" ] && [ "${OS_VERSION}" = "22.04" ]; } || { [ "${OS_ID}" = "debian" ] && [ "${OS_VERSION_MAJOR}" = "12" ]; }; then mkdir -p /var/run/daos_agent From 3aaee211742b3f6cefd1a808db8266a180efcce9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Aug 2024 22:14:30 +0000 Subject: [PATCH 122/180] SlurmGCP. Remove dependency of controller on config * Reduce coupling * Improve deploy time --- .../schedmd-slurm-gcp-v6-controller/controller.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/login.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/main.tf | 1 - .../modules/slurm_files/README.md | 1 - .../modules/slurm_files/main.tf | 11 ----------- .../modules/slurm_files/outputs.tf | 5 ----- 6 files changed, 2 insertions(+), 20 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 5f0e681ee1..ac1535a92c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -116,7 +116,7 @@ module "slurm_controller_instance" { zone = var.zone metadata = var.metadata - labels = merge(local.labels, local.files_cs_labels) + labels = local.labels } # SECRETS: CLOUDSQL diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index e7747b72ff..00945db930 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -68,7 +68,7 @@ module "slurm_login_instance" { slurm_cluster_name = local.slurm_cluster_name instance_template = module.slurm_login_template[each.key].self_link - labels = merge(each.value.labels, local.files_cs_labels) + labels = each.value.labels num_instances = each.value.num_instances additional_networks = each.value.additional_networks diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf index 095d4efdbb..2e8cca6728 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf @@ -25,7 +25,6 @@ locals { tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) slurm_cluster_name = coalesce(var.slurm_cluster_name, local.tmp_cluster_name) - files_cs_labels = { slurm_files_checksum = module.slurm_files.checksum } universe_domain = { "universe_domain" = var.universe_domain } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index a4ca8378e6..5f26492443 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -110,7 +110,6 @@ No modules. 
| Name | Description | |------|-------------| -| [checksum](#output\_checksum) | Checksum of all files written to the bucket. | | [config](#output\_config) | Cluster configuration. | | [nodeset](#output\_nodeset) | Cluster nodesets. | | [nodeset\_dyn](#output\_nodeset\_dyn) | Cluster nodesets (dynamic). | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 00338a3cca..22a8ccba44 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -277,17 +277,6 @@ data "local_file" "setup_external" { } locals { - checksum = md5(join("", flatten([ - google_storage_bucket_object.config.md5hash, - google_storage_bucket_object.devel.md5hash, - [for k, f in google_storage_bucket_object.controller_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.compute_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.nodeset_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.login_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.prolog_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.epilog_scripts : f.md5hash] - ]))) - external_epilog = [{ filename = "z_external_epilog.sh" content = data.local_file.external_epilog.content diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf index 3b680b50a7..36cf0b646c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf @@ -53,8 +53,3 @@ output "nodeset_tpu" { description = "Cluster nodesets (TPU)." value = lookup(local.config, "nodeset_tpu", null) } - -output "checksum" { - description = "Checksum of all files written to the bucket." - value = local.checksum -} From a4f4579bc1e8f262e6fc6ad85ca206130f0518d6 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Aug 2024 22:20:15 +0000 Subject: [PATCH 123/180] Remove `test_tf_version_for_slurm` validator --- pkg/validators/adhoc.go | 72 ------------------- pkg/validators/validators.go | 5 +- pkg/validators/validators_test.go | 11 ++- .../tasks/create_deployment_directory.yml | 1 - tools/validate_configs/validate_configs.sh | 2 +- 5 files changed, 7 insertions(+), 84 deletions(-) delete mode 100644 pkg/validators/adhoc.go diff --git a/pkg/validators/adhoc.go b/pkg/validators/adhoc.go deleted file mode 100644 index d7503593ca..0000000000 --- a/pkg/validators/adhoc.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2023 "Google LLC" -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package validators - -import ( - "encoding/json" - "fmt" - "hpc-toolkit/pkg/config" - "os/exec" - "strings" -) - -func testTfVersionForSlurm(bp config.Blueprint, _ config.Dict) error { - slurm := false - bp.WalkModulesSafe(func(_ config.ModulePath, m *config.Module) { - if strings.HasSuffix(m.Source, "slurm-gcp-v6-controller") { - slurm = true - } - }) - - if !slurm { - return nil - } - - ver, err := tfVersion() - if err != nil { - return nil - } - - if ver <= "1.4.0" { - return nil - } - - return fmt.Errorf("using a newer version of Terraform can lead to controller replacement on reconfigure for Slurm GCP v6\n\n" + - "Please be advised of this known issue: https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2774\n" + - "Until resolved it is advised to use Terraform 1.4.0 with Slurm deployments.\n\n" + - "To silence this warning, add flag: --skip-validators=test_tf_version_for_slurm") - -} - -func tfVersion() (string, error) { - path, err := exec.LookPath("terraform") - if err != nil { - return "", err - } - - out, err := exec.Command(path, "version", "--json").Output() - if err != nil { - return "", err - } - - var version struct { - TerraformVersion string `json:"terraform_version"` - } - if err := json.Unmarshal(out, &version); err != nil { - return "", err - } - - return version.TerraformVersion, nil -} diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index 18397bfaa6..21cf622144 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -54,7 +54,6 @@ const ( testZoneInRegionName = "test_zone_in_region" testModuleNotUsedName = "test_module_not_used" testDeploymentVariableNotUsedName = "test_deployment_variable_not_used" - testTfVersionForSlurmName = "test_tf_version_for_slurm" ) func implementations() map[string]func(config.Blueprint, config.Dict) error { @@ -66,7 +65,6 @@ func implementations() map[string]func(config.Blueprint, config.Dict) error { testZoneInRegionName: testZoneInRegion, testModuleNotUsedName: testModuleNotUsed, testDeploymentVariableNotUsedName: testDeploymentVariableNotUsed, - testTfVersionForSlurmName: testTfVersionForSlurm, } } @@ -167,8 +165,7 @@ func defaults(bp config.Blueprint) []config.Validator { defaults := []config.Validator{ {Validator: testModuleNotUsedName}, - {Validator: testDeploymentVariableNotUsedName}, - {Validator: testTfVersionForSlurmName}} + {Validator: testDeploymentVariableNotUsedName}} // always add the project ID validator before subsequent validators that can // only succeed if credentials can access the project. If the project ID diff --git a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go index 4f8fc578ad..3f690e5821 100644 --- a/pkg/validators/validators_test.go +++ b/pkg/validators/validators_test.go @@ -73,7 +73,6 @@ func (s *MySuite) TestCheckInputs(c *C) { func (s *MySuite) TestDefaultValidators(c *C) { unusedMods := config.Validator{Validator: "test_module_not_used"} unusedVars := config.Validator{Validator: "test_deployment_variable_not_used"} - slurmTf := config.Validator{Validator: "test_tf_version_for_slurm"} prjInp := config.Dict{}.With("project_id", config.GlobalRef("project_id").AsValue()) regInp := prjInp.With("region", config.GlobalRef("region").AsValue()) @@ -94,14 +93,14 @@ func (s *MySuite) TestDefaultValidators(c *C) { { bp := config.Blueprint{} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, slurmTf}) + unusedMods, unusedVars}) } { bp := config.Blueprint{Vars: config.Dict{}. 
With("project_id", cty.StringVal("f00b"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, slurmTf, projectExists, apisEnabled}) + unusedMods, unusedVars, projectExists, apisEnabled}) } { @@ -110,7 +109,7 @@ func (s *MySuite) TestDefaultValidators(c *C) { With("region", cty.StringVal("narnia"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, slurmTf, projectExists, apisEnabled, regionExists}) + unusedMods, unusedVars, projectExists, apisEnabled, regionExists}) } { @@ -119,7 +118,7 @@ func (s *MySuite) TestDefaultValidators(c *C) { With("zone", cty.StringVal("danger"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, slurmTf, projectExists, apisEnabled, zoneExists}) + unusedMods, unusedVars, projectExists, apisEnabled, zoneExists}) } { @@ -129,6 +128,6 @@ func (s *MySuite) TestDefaultValidators(c *C) { With("zone", cty.StringVal("danger"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, slurmTf, projectExists, apisEnabled, regionExists, zoneExists, zoneInRegion}) + unusedMods, unusedVars, projectExists, apisEnabled, regionExists, zoneExists, zoneInRegion}) } } diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml index ecb9513291..034c99c54f 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml @@ -34,7 +34,6 @@ ansible.builtin.command: | ./gcluster create -l ERROR "{{ blueprint_yaml }}" \ --backend-config bucket={{ state_bucket }} \ - --skip-validators=test_tf_version_for_slurm \ --vars project_id={{ project }} \ --vars deployment_name={{ deployment_name }} \ {{ deployment_vars_str if deployment_vars_str is defined else '' }} diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index 511028beaa..996b25006f 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -26,7 +26,7 @@ run_test() { exampleFile=$(basename "$example") DEPLOYMENT=$(echo "${exampleFile%.yaml}-$(basename "${tmpdir##*.}")" | sed -e 's/\(.*\)/\L\1/') PROJECT="invalid-project" - VALIDATORS_TO_SKIP="test_project_exists,test_apis_enabled,test_region_exists,test_zone_exists,test_zone_in_region,test_tf_version_for_slurm" + VALIDATORS_TO_SKIP="test_project_exists,test_apis_enabled,test_region_exists,test_zone_exists,test_zone_in_region" GHPC_PATH="${cwd}/ghpc" BP_PATH="${cwd}/${example}" # Cover the three possible starting sequences for local sources: ./ ../ / From 99e64ac87bcabdea2e6a0aef7b6c7b5861e249c0 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 20 Aug 2024 23:04:29 +0000 Subject: [PATCH 124/180] improved error msg --- .../compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 62cddfeb48..38cc0739b3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -29,6 +29,14 @@ variable "node_conf" { description = "Map of Slurm node line configuration." 
type = map(any) default = {} + validation { + condition = lookup(var.node_conf, "Sockets", null) == null + error_message = <<-EOD + `Sockets` settings conflicts with `SocketsPerBoard` that is automatically computed by SlurmGCP. + Instead you can override `Boards`, `SocketsPerBoard`, `CoresPerSocket`, and `ThreadsPerCore`. + See: https://slurm.schedmd.com/slurm.conf.html#OPT_Boards and https://slurm.schedmd.com/slurm.conf.html#OPT_Sockets_1 + EOD + } } variable "node_count_static" { From a0d2fab835b001478859287f04090807fae2de68 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 20 Aug 2024 23:40:11 +0000 Subject: [PATCH 125/180] additional error msg improvements --- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 38cc0739b3..0a53ef95e2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -32,8 +32,8 @@ variable "node_conf" { validation { condition = lookup(var.node_conf, "Sockets", null) == null error_message = <<-EOD - `Sockets` settings conflicts with `SocketsPerBoard` that is automatically computed by SlurmGCP. - Instead you can override `Boards`, `SocketsPerBoard`, `CoresPerSocket`, and `ThreadsPerCore`. + `Sockets` field is in conflict with `SocketsPerBoard` which is automatically generated by SlurmGCP. + Instead, you can override the following fields: `Boards`, `SocketsPerBoard`, `CoresPerSocket`, and `ThreadsPerCore`. See: https://slurm.schedmd.com/slurm.conf.html#OPT_Boards and https://slurm.schedmd.com/slurm.conf.html#OPT_Sockets_1 EOD } From c7b6b9f41e30321390f9344a2064ffb0c5917523 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 22 Aug 2024 10:16:12 +0000 Subject: [PATCH 126/180] Support named compact placement in GKE node pools --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/main.tf | 3 ++- modules/compute/gke-node-pool/variables.tf | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 673e1686d9..b5892da77c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -226,6 +226,7 @@ No modules. | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | | [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | +| [compact\_placement\_policy](#input\_compact\_placement\_policy) | Name of the placement policy to use when compact\_placement is enabled.
It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies.
Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard | `string` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 9045bb3f5a..8daba2d7b3 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -69,7 +69,8 @@ resource "google_container_node_pool" "node_pool" { dynamic "placement_policy" { for_each = var.compact_placement ? [1] : [] content { - type = "COMPACT" + type = "COMPACT" + policy_name = var.compact_placement_policy } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index bf804c5bce..6a62042ff5 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -293,3 +293,13 @@ variable "specific_reservation" { values = null } } + +variable "compact_placement_policy" { + description = <<-EOT + Name of the placement policy to use when compact_placement is enabled. + It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies. + Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard + EOT + type = string + default = null +} From 9810224c4d73f348ab2681ed1e151b923e98e6c7 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 22 Aug 2024 11:47:09 +0000 Subject: [PATCH 127/180] disk_definitions implemented --- modules/compute/gke-node-pool/README.md | 4 +-- .../compute/gke-node-pool/disk_definitions.tf | 36 +++++++++++++++++++ modules/compute/gke-node-pool/main.tf | 10 +++--- modules/compute/gke-node-pool/variables.tf | 4 +-- 4 files changed, 45 insertions(+), 9 deletions(-) create mode 100644 modules/compute/gke-node-pool/disk_definitions.tf diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 81aff33a62..612ab7d1fc 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -276,8 +276,8 @@ No modules. | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf new file mode 100644 index 0000000000..f7dbebea0a --- /dev/null +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -0,0 +1,36 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +## Required variables: +# local_ssd_count_ephemeral_storage +# local_ssd_count_nvme_block +# machine_type + +locals { + + local_ssd_machines = { + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + } + + generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) + + # Select in priority order: + # (1) var.local_ssd_count_ephemeral_storage and var.local_ssd_count_nvme_block if any is not null + # (2) local.local_ssd_machines if not empty + # (3) default to null value for both local_ssd_count_ephemeral_storage and local_ssd_count_nvme_block + local_ssd_config = (var.local_ssd_count_ephemeral_storage == null && var.local_ssd_count_nvme_block == null) ? local.generated_local_ssd_config : { local_ssd_count_ephemeral_storage = var.local_ssd_count_ephemeral_storage, local_ssd_count_nvme_block = var.local_ssd_count_nvme_block } +} diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 14736ddc00..ed59839516 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -105,16 +105,16 @@ resource "google_container_node_pool" "node_pool" { } dynamic "ephemeral_storage_local_ssd_config" { - for_each = var.local_ssd_count_ephemeral_storage != null ? [1] : [] + for_each = local.local_ssd_config.local_ssd_count_ephemeral_storage != null ? [1] : [] content { - local_ssd_count = var.local_ssd_count_ephemeral_storage + local_ssd_count = local.local_ssd_config.local_ssd_count_ephemeral_storage } } dynamic "local_nvme_ssd_block_config" { - for_each = var.local_ssd_count_nvme_block != null ? [1] : [] + for_each = local.local_ssd_config.local_ssd_count_nvme_block != null ? [1] : [] content { - local_ssd_count = var.local_ssd_count_nvme_block + local_ssd_count = local.local_ssd_config.local_ssd_count_nvme_block } } @@ -189,7 +189,7 @@ resource "google_container_node_pool" "node_pool" { error_message = "static_node_count cannot be set with either autoscaling_total_min_nodes or autoscaling_total_max_nodes." 
} precondition { - condition = !(coalesce(var.local_ssd_count_ephemeral_storage, 0) > 0 && coalesce(var.local_ssd_count_nvme_block, 0) > 0) + condition = !(coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) > 0 && coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0) > 0) error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value." } precondition { diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 1352db37f5..b3bfcc3091 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -93,7 +93,7 @@ variable "local_ssd_count_ephemeral_storage" { description = <<-EOT The number of local SSDs to attach to each node to back ephemeral storage. Uses NVMe interfaces. Must be supported by `machine_type`. - When set to null, GKE decides about default value. + When set to null, default value either is [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value. [See above](#local-ssd-storage) for more info. EOT type = number @@ -104,7 +104,7 @@ variable "local_ssd_count_nvme_block" { description = <<-EOT The number of local SSDs to attach to each node to back block storage. Uses NVMe interfaces. Must be supported by `machine_type`. - When set to null, GKE decides about default value. + When set to null, default value either is [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value. [See above](#local-ssd-storage) for more info. EOT From 986ea90bcfe6c5fe0b4b63108c1dbf86f0fd39d7 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Thu, 22 Aug 2024 16:10:47 +0000 Subject: [PATCH 128/180] using helper functions and adding unit tests --- pkg/config/config.go | 8 ++++++-- pkg/config/config_test.go | 25 +++++++++++++++++++++++++ pkg/config/expand.go | 23 +++++++++++------------ pkg/config/expand_test.go | 35 +++++++++++++++++++++++++++++++++++ pkg/config/path.go | 18 ++++++++++-------- pkg/config/path_test.go | 2 ++ pkg/inspect/list.go | 4 ---- 7 files changed, 89 insertions(+), 26 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index afd64f04b7..0199fcb8d0 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -685,10 +685,14 @@ func (bp *Blueprint) checkBlueprintName() error { // exclsuively supplied (i.e., one is present, but the other is missing). 
func (bp *Blueprint) checkToolkitModulesUrlAndVersion() error { if bp.ToolkitModulesURL == "" && bp.ToolkitModulesVersion != "" { - return fmt.Errorf("toolkit_modules_url must be provided when toolkit_modules_version is specified") + return BpError{Root.ToolkitModulesURL, HintError{ + Err: errors.New("toolkit_modules_url must be provided when toolkit_modules_version is specified"), + Hint: "Specify toolkit_modules_url"}} } if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion == "" { - return fmt.Errorf("toolkit_modules_version must be provided when toolkit_modules_url is specified") + return BpError{Root.ToolkitModulesVersion, HintError{ + Err: errors.New("toolkit_modules_version must be provided when toolkit_modules_url is specified"), + Hint: "Specify toolkit_modules_version"}} } return nil } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 8d25f8752f..8c239a7971 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -315,6 +315,31 @@ func (s *zeroSuite) TestCheckBlueprintName(c *C) { c.Check(errors.As(bp.checkBlueprintName(), &e), Equals, true) } +func (s *zeroSuite) TestCheckToolkitModulesUrlAndVersion(c *C) { + bp := Blueprint{} + var e HintError + + // Are toolkit_modules_url and toolkit_modules_version both provided? + bp.ToolkitModulesURL = "github.com/GoogleCloudPlatform/cluster-toolkit" + bp.ToolkitModulesVersion = "v1.15.0" + c.Check(bp.checkToolkitModulesUrlAndVersion(), IsNil) + + // Are toolkit_modules_url and toolkit_modules_version both empty? + bp.ToolkitModulesURL = "" + bp.ToolkitModulesVersion = "" + c.Check(bp.checkToolkitModulesUrlAndVersion(), IsNil) + + // Is toolkit_modules_url provided and toolkit_modules_version empty? + bp.ToolkitModulesURL = "github.com/GoogleCloudPlatform/cluster-toolkit" + bp.ToolkitModulesVersion = "" + c.Check(errors.As(bp.checkToolkitModulesUrlAndVersion(), &e), Equals, true) + + // Is toolkit_modules_version provided and toolkit_modules_url empty? 
+ bp.ToolkitModulesURL = "" + bp.ToolkitModulesVersion = "v1.15.0" + c.Check(errors.As(bp.checkToolkitModulesUrlAndVersion(), &e), Equals, true) +} + func (s *zeroSuite) TestNewBlueprint(c *C) { bp := Blueprint{ Vars: NewDict(map[string]cty.Value{ diff --git a/pkg/config/expand.go b/pkg/config/expand.go index d5570a8ac0..4833f9cccd 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -18,8 +18,8 @@ import ( "errors" "fmt" - "hpc-toolkit/pkg/inspect" "hpc-toolkit/pkg/modulereader" + "hpc-toolkit/pkg/sourcereader" "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/convert" @@ -105,22 +105,21 @@ func (bp *Blueprint) expandVars() error { } func (bp *Blueprint) substituteModuleSources() { - for ig := range bp.Groups { - g := &bp.Groups[ig] - for im := range g.Modules { - m := &g.Modules[im] - if inspect.IsLocalModule(m.Source) { - m.Source = fmt.Sprintf("%s//%s?ref=%s&depth=1", bp.ToolkitModulesURL, m.Source, bp.ToolkitModulesVersion) - } - } + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { + m.Source = bp.transformSource(m.Source) + }) +} + +func (bp Blueprint) transformSource(s string) string { + if sourcereader.IsEmbeddedPath(s) && bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion != "" { + return fmt.Sprintf("%s//%s?ref=%s&depth=1", bp.ToolkitModulesURL, s, bp.ToolkitModulesVersion) } + return s } func (bp *Blueprint) expandGroups() error { bp.addKindToModules() - if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion != "" { - bp.substituteModuleSources() - } + bp.substituteModuleSources() if err := checkModulesAndGroups(*bp); err != nil { return err } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 34025ece06..6c347aceb9 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -356,6 +356,41 @@ func (s *zeroSuite) TestValidateModuleReference(c *C) { } +func (s *zeroSuite) TestSubstituteModuleSources(c *C) { + a := Module{ID: "moduleA", Source: "modules/network/pre-existing-vpc"} + b := Module{ID: "moduleB", Source: "community/modules/file-system/DDN-EXAScaler"} + y := Module{ID: "moduleY", Source: "./modules/network/pre-existing-vpc"} + + dg := []Group{ + {Name: "zero", Modules: []Module{a, b}}, + {Name: "one", Modules: []Module{y}}, + } + + // toolkit_modules_url and toolkit_modules_version not provided + bp := Blueprint{ + Groups: dg, + } + bp.substituteModuleSources() + // Check that sources remain unchanged + c.Assert(bp.Groups[0].Modules[0].Source, Equals, "modules/network/pre-existing-vpc") + c.Assert(bp.Groups[0].Modules[1].Source, Equals, "community/modules/file-system/DDN-EXAScaler") + c.Assert(bp.Groups[1].Modules[0].Source, Equals, "./modules/network/pre-existing-vpc") + + // toolkit_modules_url and toolkit_modules_version provided + bp = Blueprint{ + Groups: dg, ToolkitModulesURL: "github.com/GoogleCloudPlatform/cluster-toolkit", ToolkitModulesVersion: "v1.15.0", + } + bp.substituteModuleSources() + // Check that embedded sources (a and b) are transformed correctly + expectedSourceA := "github.com/GoogleCloudPlatform/cluster-toolkit//modules/network/pre-existing-vpc?ref=v1.15.0&depth=1" + expectedSourceB := "github.com/GoogleCloudPlatform/cluster-toolkit//community/modules/file-system/DDN-EXAScaler?ref=v1.15.0&depth=1" + c.Assert(bp.Groups[0].Modules[0].Source, Equals, expectedSourceA) + c.Assert(bp.Groups[0].Modules[1].Source, Equals, expectedSourceB) + + // Check that the non-embedded source (y) remains unchanged + c.Assert(bp.Groups[1].Modules[0].Source, Equals, 
"./modules/network/pre-existing-vpc") +} + func (s *zeroSuite) TestIntersection(c *C) { is := intersection([]string{"A", "B", "C"}, []string{"A", "B", "C"}) c.Assert(is, DeepEquals, []string{"A", "B", "C"}) diff --git a/pkg/config/path.go b/pkg/config/path.go index 1c5926adc0..7d84f449d4 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -128,14 +128,16 @@ func initPath(p any, prev any, piece string) { type rootPath struct { basePath - BlueprintName basePath `path:"blueprint_name"` - GhpcVersion basePath `path:"ghpc_version"` - Validators arrayPath[validatorCfgPath] `path:"validators"` - ValidationLevel basePath `path:"validation_level"` - Vars dictPath `path:"vars"` - Groups arrayPath[groupPath] `path:"deployment_groups"` - Backend backendPath `path:"terraform_backend_defaults"` - Provider mapPath[providerPath] `path:"terraform_providers"` + BlueprintName basePath `path:"blueprint_name"` + GhpcVersion basePath `path:"ghpc_version"` + Validators arrayPath[validatorCfgPath] `path:"validators"` + ValidationLevel basePath `path:"validation_level"` + Vars dictPath `path:"vars"` + Groups arrayPath[groupPath] `path:"deployment_groups"` + Backend backendPath `path:"terraform_backend_defaults"` + Provider mapPath[providerPath] `path:"terraform_providers"` + ToolkitModulesURL basePath `path:"toolkit_modules_url"` + ToolkitModulesVersion basePath `path:"toolkit_modules_version"` } type validatorCfgPath struct { diff --git a/pkg/config/path_test.go b/pkg/config/path_test.go index 4e1964854b..83b5e71874 100644 --- a/pkg/config/path_test.go +++ b/pkg/config/path_test.go @@ -37,6 +37,8 @@ func TestPath(t *testing.T) { {r.Groups, "deployment_groups"}, {r.Backend, "terraform_backend_defaults"}, {r.Provider, "terraform_providers"}, + {r.ToolkitModulesURL, "toolkit_modules_url"}, + {r.ToolkitModulesVersion, "toolkit_modules_version"}, {r.Validators.At(2), "validators[2]"}, {r.Validators.At(2).Validator, "validators[2].validator"}, diff --git a/pkg/inspect/list.go b/pkg/inspect/list.go index 7a5d5dc3b5..1bb2f9fc73 100644 --- a/pkg/inspect/list.go +++ b/pkg/inspect/list.go @@ -67,7 +67,3 @@ func LocalModules() ([]SourceAndKind, error) { } return ret, nil } - -func IsLocalModule(source string) bool { - return strings.HasPrefix(source, "modules/") || strings.HasPrefix(source, "community/modules") -} From 6c3584ef5b6d994a9e6bbbf6ce1445244053182e Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Thu, 22 Aug 2024 18:46:37 +0000 Subject: [PATCH 129/180] fixing path error --- pkg/config/config.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 0199fcb8d0..ef8e8e2290 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -685,12 +685,12 @@ func (bp *Blueprint) checkBlueprintName() error { // exclsuively supplied (i.e., one is present, but the other is missing). 
func (bp *Blueprint) checkToolkitModulesUrlAndVersion() error { if bp.ToolkitModulesURL == "" && bp.ToolkitModulesVersion != "" { - return BpError{Root.ToolkitModulesURL, HintError{ + return BpError{Root.ToolkitModulesVersion, HintError{ Err: errors.New("toolkit_modules_url must be provided when toolkit_modules_version is specified"), Hint: "Specify toolkit_modules_url"}} } if bp.ToolkitModulesURL != "" && bp.ToolkitModulesVersion == "" { - return BpError{Root.ToolkitModulesVersion, HintError{ + return BpError{Root.ToolkitModulesURL, HintError{ Err: errors.New("toolkit_modules_version must be provided when toolkit_modules_url is specified"), Hint: "Specify toolkit_modules_version"}} } From 427429a9c3cebf74c0d5f67e2612898cb82e4502 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 22 Aug 2024 10:16:12 +0000 Subject: [PATCH 130/180] Support named compact placement in GKE node pools --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/main.tf | 3 ++- modules/compute/gke-node-pool/variables.tf | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 81aff33a62..ee09fad3a5 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -268,6 +268,7 @@ No modules. | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | | [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | +| [compact\_placement\_policy](#input\_compact\_placement\_policy) | Name of the placement policy to use when compact\_placement is enabled.
It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies.
Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard | `string` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 14736ddc00..3ecca0c0d3 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -69,7 +69,8 @@ resource "google_container_node_pool" "node_pool" { dynamic "placement_policy" { for_each = var.compact_placement ? [1] : [] content { - type = "COMPACT" + type = "COMPACT" + policy_name = var.compact_placement_policy } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 1352db37f5..af596d98f7 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -319,3 +319,13 @@ variable "specific_reservation" { values = null } } + +variable "compact_placement_policy" { + description = <<-EOT + Name of the placement policy to use when compact_placement is enabled. + It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies. + Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard + EOT + type = string + default = null +} From 54642ed95fe1b6b67f70d4966ebff52a14635020 Mon Sep 17 00:00:00 2001 From: annuay Date: Fri, 23 Aug 2024 11:19:27 +0000 Subject: [PATCH 131/180] Add maintenance interval to module and vars --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/main.tf | 7 +++++++ modules/compute/gke-node-pool/variables.tf | 13 +++++++++++++ 3 files changed, 21 insertions(+) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 81aff33a62..267045ba99 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -273,6 +273,7 @@ No modules. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `"PERIODIC"` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 14736ddc00..0ddb25c274 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -162,6 +162,13 @@ resource "google_container_node_pool" "node_pool" { key = var.specific_reservation.key values = var.specific_reservation.values } + + dynamic "host_maintenance_policy" { + for_each = true ? [1] : [] + content { + maintenance_interval = var.host_maintenance_interval + } + } } network_config { diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 1352db37f5..a7be3dcae5 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -319,3 +319,16 @@ variable "specific_reservation" { values = null } } + +variable "host_maintenance_interval" { + description = "Specifies the frequency of planned maintenance events." + type = string + default = "PERIODIC" + validation { + condition = var.host_maintenance_interval != null ? contains( + ["PERIODIC", "AS_NEEDED"], + var.host_maintenance_interval, + ) : true + error_message = "Invalid host_maintenance_interval value." + } +} From 33712429a6cc642965d4154f5e3f563803b4b643 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 23 Aug 2024 12:56:16 +0000 Subject: [PATCH 132/180] allow generic placement --- modules/compute/gke-node-pool/README.md | 5 +-- modules/compute/gke-node-pool/main.tf | 24 ++++++++++++-- modules/compute/gke-node-pool/variables.tf | 37 +++++++++++++++------- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index ee09fad3a5..972c6279f4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,8 +267,7 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | -| [compact\_placement\_policy](#input\_compact\_placement\_policy) | Name of the placement policy to use when compact\_placement is enabled.
It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies.
Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard | `string` | `null` | no | +| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes.
Use `placement_type` and `placement_policy` if you want to apply a placement policy you have already created.
Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | @@ -281,6 +280,8 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE chooses the default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Name of the placement policy to use when `placement_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | +| [placement\_type](#input\_placement\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`.
`COMPACT` is the only supported value currently.
Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 3ecca0c0d3..4f5b5ff490 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -66,11 +66,22 @@ resource "google_container_node_pool" "node_pool" { max_unavailable = 1 } + # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. + # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. + # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. + # Those blueprints will stop working if we remove this block. dynamic "placement_policy" { for_each = var.compact_placement ? [1] : [] content { - type = "COMPACT" - policy_name = var.compact_placement_policy + type = "COMPACT" + } + } + + dynamic "placement_policy" { + for_each = (!var.compact_placement && try(contains(["COMPACT"], var.placement_type), false)) ? [1] : [] + content { + type = var.placement_type + policy_name = var.placement_policy } } @@ -203,6 +214,15 @@ resource "google_container_node_pool" "node_pool" { On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. EOT } + precondition { + condition = var.compact_placement || var.placement_type == null || try(contains(["COMPACT"], var.placement_type), false) + error_message = "`COMPACT` is the only supported value for `placement_type`." + } + + precondition { + condition = var.compact_placement || var.placement_type != null || (var.placement_type == null && var.placement_policy == null) + error_message = "`placement_type` needs to be set when specifying `placement_policy`" + } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index af596d98f7..7499296388 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -172,11 +172,36 @@ variable "spot" { } variable "compact_placement" { - description = "Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes." + description = <<-EOT + Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. + Use `placement_type` and `placement_policy` if you want to use the placement policy you created. + Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` + EOT type = bool default = false } +variable "placement_type" { + description = <<-EOT + Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`. + `COMPACT` is the only supported value currently. + Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. + EOT + type = string + default = null + +} + +variable "placement_policy" { + description = <<-EOT + Name of the placement policy to use when `placement_type` is set. + It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. 
+ Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies + EOT + type = string + default = null +} + variable "service_account_email" { description = "Service account e-mail address to use with the node pool" type = string @@ -319,13 +344,3 @@ variable "specific_reservation" { values = null } } - -variable "compact_placement_policy" { - description = <<-EOT - Name of the placement policy to use when compact_placement is enabled. - It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies. - Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard - EOT - type = string - default = null -} From 77cd7ab0cdf967fa21d962ff664ca522073ccfea Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 12 Aug 2024 23:38:16 +0000 Subject: [PATCH 133/180] SlurmGCP. Store "extra files" hash in config DRAFT --- .../modules/slurm_files/main.tf | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 22a8ccba44..0cf9981f5a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -40,6 +40,8 @@ resource "random_uuid" "cluster_id" { ################## locals { + tp = "${local.bucket_dir}/" # prefix to trim from the bucket path to get a "file name" + config = { enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins enable_bigquery_load = var.enable_bigquery_load @@ -86,6 +88,29 @@ locals { # Providers endpoint_versions = var.endpoint_versions + + # Extra-files MD5 hashes + # Makes config file creation depend on the files + # Allows for informed updates & checks on slurmsync side + slurm_gcp_scripts_md5 = google_storage_bucket_object.devel.md5hash, + controller_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.controller_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + compute_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.compute_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + nodeset_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.nodeset_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + login_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.login_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + prolog_scripts_md5 = { + for o in values(google_storage_bucket_object.prolog_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + epilog_scripts_md5 = { + for o in values(google_storage_bucket_object.epilog_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } } x_nodeset = toset(var.nodeset[*].nodeset_name) From 77a79bb8d7c08ca0af64d96f47c1aa3c427ecd3a Mon Sep 17 00:00:00 2001 From: annuay Date: Mon, 26 Aug 2024 04:07:45 +0000 Subject: [PATCH 134/180] use empty string as default --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md 
b/modules/compute/gke-node-pool/README.md index 267045ba99..0b1330f8f9 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -273,7 +273,7 @@ No modules. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | -| [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `"PERIODIC"` | no | +| [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index a7be3dcae5..c092340caf 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -323,12 +323,10 @@ variable "specific_reservation" { variable "host_maintenance_interval" { description = "Specifies the frequency of planned maintenance events." type = string - default = "PERIODIC" + default = "" + nullable = false validation { - condition = var.host_maintenance_interval != null ? contains( - ["PERIODIC", "AS_NEEDED"], - var.host_maintenance_interval, - ) : true - error_message = "Invalid host_maintenance_interval value." + condition = contains(["", "PERIODIC", "AS_NEEDED"], var.host_maintenance_interval) + error_message = "Invalid host_maintenance_interval value. Must be PERIODIC, AS_NEEDED or the empty string" } } From 7a808c7a4754f06358b0499d98d427608884ed62 Mon Sep 17 00:00:00 2001 From: annuay Date: Mon, 26 Aug 2024 14:54:40 +0000 Subject: [PATCH 135/180] Fix condition --- modules/compute/gke-node-pool/main.tf | 2 +- terraform-provider-google-beta | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 160000 terraform-provider-google-beta diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 0ddb25c274..977375778f 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -164,7 +164,7 @@ resource "google_container_node_pool" "node_pool" { } dynamic "host_maintenance_policy" { - for_each = true ? [1] : [] + for_each = var.host_maintenance_interval == "" ? [] : [1] content { maintenance_interval = var.host_maintenance_interval } diff --git a/terraform-provider-google-beta b/terraform-provider-google-beta new file mode 160000 index 0000000000..81fd24e15b --- /dev/null +++ b/terraform-provider-google-beta @@ -0,0 +1 @@ +Subproject commit 81fd24e15b417042966711cea240de914448c605 From a96dff7f2c46c3639765eddfdabcbb80fcfa33d3 Mon Sep 17 00:00:00 2001 From: annuay Date: Tue, 27 Aug 2024 04:15:39 +0000 Subject: [PATCH 136/180] swap conditional --- modules/compute/gke-node-pool/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 977375778f..da69b5e5f0 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -164,7 +164,7 @@ resource "google_container_node_pool" "node_pool" { } dynamic "host_maintenance_policy" { - for_each = var.host_maintenance_interval == "" ? [] : [1] + for_each = var.host_maintenance_interval != "" ? 
[1] : [] content { maintenance_interval = var.host_maintenance_interval } From c9ade47efb9e31e3d975dde57f8254d1b5f024cb Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 23 Aug 2024 12:56:16 +0000 Subject: [PATCH 137/180] allow generic placement --- modules/compute/gke-node-pool/README.md | 5 +-- modules/compute/gke-node-pool/main.tf | 24 ++++++++++++-- modules/compute/gke-node-pool/variables.tf | 37 +++++++++++++++------- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index ee09fad3a5..972c6279f4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,8 +267,7 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | -| [compact\_placement\_policy](#input\_compact\_placement\_policy) | Name of the placement policy to use when compact\_placement is enabled.
It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies.
Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard | `string` | `null` | no | +| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes.
Use `placement_type` and `placement_policy` if you want to apply a placement policy you have already created.
Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | @@ -281,6 +280,8 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE chooses the default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Name of the placement policy to use when `placement_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | +| [placement\_type](#input\_placement\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`.
`COMPACT` is the only supported value currently.
Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 3ecca0c0d3..4f5b5ff490 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -66,11 +66,22 @@ resource "google_container_node_pool" "node_pool" { max_unavailable = 1 } + # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. + # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. + # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. + # Those blueprints will stop working if we remove this block. dynamic "placement_policy" { for_each = var.compact_placement ? [1] : [] content { - type = "COMPACT" - policy_name = var.compact_placement_policy + type = "COMPACT" + } + } + + dynamic "placement_policy" { + for_each = (!var.compact_placement && try(contains(["COMPACT"], var.placement_type), false)) ? [1] : [] + content { + type = var.placement_type + policy_name = var.placement_policy } } @@ -203,6 +214,15 @@ resource "google_container_node_pool" "node_pool" { On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. EOT } + precondition { + condition = var.compact_placement || var.placement_type == null || try(contains(["COMPACT"], var.placement_type), false) + error_message = "`COMPACT` is the only supported value for `placement_type`." + } + + precondition { + condition = var.compact_placement || var.placement_type != null || (var.placement_type == null && var.placement_policy == null) + error_message = "`placement_type` needs to be set when specifying `placement_policy`" + } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index af596d98f7..7499296388 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -172,11 +172,36 @@ variable "spot" { } variable "compact_placement" { - description = "Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes." + description = <<-EOT + Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. + Use `placement_type` and `placement_policy` if you want to use the placement policy you created. + Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` + EOT type = bool default = false } +variable "placement_type" { + description = <<-EOT + Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`. + `COMPACT` is the only supported value currently. + Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. + EOT + type = string + default = null + +} + +variable "placement_policy" { + description = <<-EOT + Name of the placement policy to use when `placement_type` is set. + It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. 
+ Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies + EOT + type = string + default = null +} + variable "service_account_email" { description = "Service account e-mail address to use with the node pool" type = string @@ -319,13 +344,3 @@ variable "specific_reservation" { values = null } } - -variable "compact_placement_policy" { - description = <<-EOT - Name of the placement policy to use when compact_placement is enabled. - It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies. - Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard - EOT - type = string - default = null -} From d9e1c252ca0ab60fda04244a4a66690e38ad87a4 Mon Sep 17 00:00:00 2001 From: annuay Date: Tue, 27 Aug 2024 09:47:42 +0000 Subject: [PATCH 138/180] add maintenance interval to integration test --- tools/cloud-build/daily-tests/builds/ml-gke.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index 4336929f16..c9ae96850f 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -49,7 +49,7 @@ steps: echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} echo ' source: modules/compute/gke-node-pool' >> $${SG_EXAMPLE} echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} - echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} + echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD, host_maintenance_interval: AS_NEEDED}' >> $${SG_EXAMPLE} # avoids conflict with other tests sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} From 7429620789c5ac558151709c37501177f646770c Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Tue, 27 Aug 2024 09:54:03 +0000 Subject: [PATCH 139/180] deprecate compact_placement, better names --- modules/compute/gke-node-pool/README.md | 6 +++--- modules/compute/gke-node-pool/main.tf | 25 ++++++---------------- modules/compute/gke-node-pool/variables.tf | 22 +++++++++---------- 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 972c6279f4..95898248e4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,7 +267,7 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes.
Use `placement_type` and `placement_policy` if you want to apply a placement policy you have already created.
Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` | `bool` | `false` | no | +| [compact\_placement](#input\_compact\_placement) | DEPRECATED: Use `placement_policy_type` | `bool` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | @@ -280,8 +280,8 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE chooses the default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Name of the placement policy to use when `placement_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | -| [placement\_type](#input\_placement\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`.
`COMPACT` is the only supported value currently.
Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. | `string` | `null` | no | +| [placement\_policy\_name](#input\_placement\_policy\_name) | Name of the placement policy to use when `placement_policy_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | +| [placement\_policy\_type](#input\_placement\_policy\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy_name`.
`COMPACT` is the only supported value currently. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 4f5b5ff490..81f48fc45a 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -66,22 +66,11 @@ resource "google_container_node_pool" "node_pool" { max_unavailable = 1 } - # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. - # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. - # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. - # Those blueprints will stop working if we remove this block. dynamic "placement_policy" { - for_each = var.compact_placement ? [1] : [] + for_each = var.placement_policy_type != null ? [1] : [] content { - type = "COMPACT" - } - } - - dynamic "placement_policy" { - for_each = (!var.compact_placement && try(contains(["COMPACT"], var.placement_type), false)) ? [1] : [] - content { - type = var.placement_type - policy_name = var.placement_policy + type = var.placement_policy_type + policy_name = var.placement_policy_name } } @@ -215,13 +204,13 @@ resource "google_container_node_pool" "node_pool" { EOT } precondition { - condition = var.compact_placement || var.placement_type == null || try(contains(["COMPACT"], var.placement_type), false) - error_message = "`COMPACT` is the only supported value for `placement_type`." + condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) + error_message = "`COMPACT` is the only supported value for `placement_policy_type`." } precondition { - condition = var.compact_placement || var.placement_type != null || (var.placement_type == null && var.placement_policy == null) - error_message = "`placement_type` needs to be set when specifying `placement_policy`" + condition = var.placement_policy_type != null || (var.placement_policy_type == null && var.placement_policy_name == null) + error_message = "`placement_policy_type` needs to be set when specifying `placement_policy_name`" } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 7499296388..0bf4835cd4 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -171,30 +171,30 @@ variable "spot" { default = false } +# tflint-ignore: terraform_unused_declarations variable "compact_placement" { - description = <<-EOT - Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. - Use `placement_type` and `placement_policy` if you want to use the placement policy you created. - Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` - EOT + description = "DEPRECATED: Use `placement_policy_type`" type = bool - default = false + default = null + validation { + condition = var.compact_placement == null + error_message = "`compact_placement` is deprecated. Use `placement_policy_type`" + } } -variable "placement_type" { +variable "placement_policy_type" { description = <<-EOT - Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`. + Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy_name`. `COMPACT` is the only supported value currently. 
- Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. EOT type = string default = null } -variable "placement_policy" { +variable "placement_policy_name" { description = <<-EOT - Name of the placement policy to use when `placement_type` is set. + Name of the placement policy to use when `placement_policy_type` is set. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies EOT From 40614849a572ee73b853a6ae8d4e5f3dc203ba86 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Tue, 27 Aug 2024 10:15:49 +0000 Subject: [PATCH 140/180] No cross reference anymore. So variable validation --- examples/ml-gke.yaml | 2 -- modules/compute/gke-node-pool/main.tf | 4 ---- modules/compute/gke-node-pool/variables.tf | 5 ++++- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/ml-gke.yaml b/examples/ml-gke.yaml index 7969023906..5aedd354fb 100644 --- a/examples/ml-gke.yaml +++ b/examples/ml-gke.yaml @@ -54,8 +54,6 @@ deployment_groups: source: modules/compute/gke-node-pool use: [gke_cluster] settings: - placement_policy_type: COMPACT2 - # placement_policy_name: abcd disk_type: pd-balanced machine_type: g2-standard-4 diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 81f48fc45a..14399dd391 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -203,10 +203,6 @@ resource "google_container_node_pool" "node_pool" { On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. EOT } - precondition { - condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) - error_message = "`COMPACT` is the only supported value for `placement_policy_type`." - } precondition { condition = var.placement_policy_type != null || (var.placement_policy_type == null && var.placement_policy_name == null) diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 0bf4835cd4..fb548a9501 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -189,7 +189,10 @@ variable "placement_policy_type" { EOT type = string default = null - + validation { + condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) + error_message = "`COMPACT` is the only supported value for `placement_policy_type`." 
+ } } variable "placement_policy_name" { From 1b6b419b562968f78a41785a13b2451c911afe7c Mon Sep 17 00:00:00 2001 From: annuay Date: Tue, 27 Aug 2024 14:47:05 +0000 Subject: [PATCH 141/180] Remove repo cloned by mistake --- terraform-provider-google-beta | 1 - 1 file changed, 1 deletion(-) delete mode 160000 terraform-provider-google-beta diff --git a/terraform-provider-google-beta b/terraform-provider-google-beta deleted file mode 160000 index 81fd24e15b..0000000000 --- a/terraform-provider-google-beta +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 81fd24e15b417042966711cea240de914448c605 From c83050d2eaf14b846d8f54611a61d71dc7329cf3 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 28 Aug 2024 05:52:25 +0000 Subject: [PATCH 142/180] Make placement_policy setting an object --- modules/compute/gke-node-pool/README.md | 5 ++- modules/compute/gke-node-pool/main.tf | 11 ++----- modules/compute/gke-node-pool/variables.tf | 36 ++++++++++------------ 3 files changed, 22 insertions(+), 30 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 95898248e4..7dde55eee7 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,7 +267,7 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | DEPRECATED: Use `placement_policy_type` | `bool` | `null` | no | +| [compact\_placement](#input\_compact\_placement) | DEPRECATED: Use `placement_policy` | `bool` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | @@ -280,8 +280,7 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE chooses the default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy\_name](#input\_placement\_policy\_name) | Name of the placement policy to use when `placement_policy_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | -| [placement\_policy\_type](#input\_placement\_policy\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy_name`.
`COMPACT` is the only supported value currently. | `string` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 14399dd391..5482ce0277 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -67,10 +67,10 @@ resource "google_container_node_pool" "node_pool" { } dynamic "placement_policy" { - for_each = var.placement_policy_type != null ? [1] : [] + for_each = var.placement_policy.type != null ? [1] : [] content { - type = var.placement_policy_type - policy_name = var.placement_policy_name + type = var.placement_policy.type + policy_name = var.placement_policy.name } } @@ -203,11 +203,6 @@ resource "google_container_node_pool" "node_pool" { On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. EOT } - - precondition { - condition = var.placement_policy_type != null || (var.placement_policy_type == null && var.placement_policy_name == null) - error_message = "`placement_policy_type` needs to be set when specifying `placement_policy_name`" - } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index fb548a9501..d1d9f9667e 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -173,36 +173,34 @@ variable "spot" { # tflint-ignore: terraform_unused_declarations variable "compact_placement" { - description = "DEPRECATED: Use `placement_policy_type`" + description = "DEPRECATED: Use `placement_policy`" type = bool default = null validation { condition = var.compact_placement == null - error_message = "`compact_placement` is deprecated. Use `placement_policy_type`" + error_message = "`compact_placement` is deprecated. Use `placement_policy` instead" } } -variable "placement_policy_type" { +variable "placement_policy" { description = <<-EOT - Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy_name`. - `COMPACT` is the only supported value currently. - EOT - type = string - default = null - validation { - condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) - error_message = "`COMPACT` is the only supported value for `placement_policy_type`." - } -} - -variable "placement_policy_name" { - description = <<-EOT - Name of the placement policy to use when `placement_policy_type` is set. + Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies EOT - type = string - default = null + + type = object({ + type = string + name = optional(string) + }) + default = { + type = null + name = null + } + validation { + condition = var.placement_policy.type == null || try(contains(["COMPACT"], var.placement_policy.type), false) + error_message = "`COMPACT` is the only supported value for `placement_policy.type`." 
+ } } variable "service_account_email" { From 09ae27255f81c5e935ff56a497d4683fbeeaa4bf Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 28 Aug 2024 07:09:39 -0700 Subject: [PATCH 143/180] Prevent use of google provider 6.0 in vm-instance The google_compute_instance has a breaking change in TPG 6.0 that we must first address. --- modules/compute/vm-instance/README.md | 8 ++++---- modules/compute/vm-instance/versions.tf | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 61b78e088f..5e6c75181b 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -169,16 +169,16 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3.0 | -| [google](#requirement\_google) | >= 4.73.0 | -| [google-beta](#requirement\_google-beta) | >= 4.73.0 | +| [google](#requirement\_google) | >= 4.73.0, <6.0 | +| [google-beta](#requirement\_google-beta) | >= 4.73.0, <6.0 | | [null](#requirement\_null) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.73.0 | -| [google-beta](#provider\_google-beta) | >= 4.73.0 | +| [google](#provider\_google) | >= 4.73.0, <6.0 | +| [google-beta](#provider\_google-beta) | >= 4.73.0, <6.0 | | [null](#provider\_null) | >= 3.0 | ## Modules diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 1b46a4e5e1..2f42a5a83e 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -18,12 +18,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.73.0" + version = ">= 4.73.0, <6.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.73.0" + version = ">= 4.73.0, <6.0" } null = { source = "hashicorp/null" From 297b0d8cad8091610a18c1cec7bbb5942a0dbf24 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 28 Aug 2024 14:36:49 +0000 Subject: [PATCH 144/180] Prevent use of google provider 6.0 in gke-node-pool The google_container_node_pool resource has a breaking change in TPG 6.0 that we must first address. --- modules/compute/gke-node-pool/README.md | 8 ++++---- modules/compute/gke-node-pool/versions.tf | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 81aff33a62..0f24abc816 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -231,15 +231,15 @@ limitations under the License. 
| Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.2 | -| [google](#requirement\_google) | > 5.0 | -| [google-beta](#requirement\_google-beta) | > 5.0 | +| [google](#requirement\_google) | ~> 5.0 | +| [google-beta](#requirement\_google-beta) | ~> 5.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | > 5.0 | -| [google-beta](#provider\_google-beta) | > 5.0 | +| [google](#provider\_google) | ~> 5.0 | +| [google-beta](#provider\_google-beta) | ~> 5.0 | ## Modules diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 8905a94eea..fd0c4e2044 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -18,11 +18,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "> 5.0" + version = "~> 5.0" } google-beta = { source = "hashicorp/google-beta" - version = "> 5.0" + version = "~> 5.0" } } provider_meta "google" { From 780de5e5b352d8a8b83db2efc7db97081607dae3 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 28 Aug 2024 14:46:46 +0000 Subject: [PATCH 145/180] Adopt vm-instance module that restricts use of TPG 6.0 --- community/modules/compute/pbspro-execution/README.md | 2 +- community/modules/compute/pbspro-execution/main.tf | 2 +- .../modules/remote-desktop/chrome-remote-desktop/README.md | 2 +- community/modules/remote-desktop/chrome-remote-desktop/main.tf | 2 +- community/modules/scheduler/pbspro-client/README.md | 2 +- community/modules/scheduler/pbspro-client/main.tf | 2 +- community/modules/scheduler/pbspro-server/README.md | 2 +- community/modules/scheduler/pbspro-server/main.tf | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 0dd9b64775..dd68300561 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -75,7 +75,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| | [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index b3e46cc2a8..9df32916a1 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -68,7 +68,7 @@ module "execution_startup_script" { } module "pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 19ab5361f7..bdd44aa18f 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -64,7 +64,7 @@ No providers. | Name | Source | Version | |------|--------|---------| | [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 936cc75d3c..023fc81ffa 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -71,7 +71,7 @@ module "client_startup_script" { } module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count name_prefix = var.name_prefix diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 43b71423fc..6f7a7b938c 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -75,7 +75,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| | [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index e427c3945e..c1fa0e211f 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index f27384fc3c..e432e0cbbf 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -71,7 +71,7 @@ No providers. |------|--------|---------| | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | | [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.36.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | | [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index bfbf635aa4..b5e924d969 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count spot = var.spot From 6e734c2112bd86ff92b52f3c9e911ffd97ec26c3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 27 Aug 2024 01:04:50 +0000 Subject: [PATCH 146/180] Increate `ps-slurm.yaml` startup scripts timeout --- examples/ps-slurm.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml index ceecdd472b..f139aa7b3c 100644 --- a/examples/ps-slurm.yaml +++ b/examples/ps-slurm.yaml @@ -26,6 +26,13 @@ vars: zone: us-east4-b compute_node_machine_type: c2-standard-60 + # The Parallelstore drivers installation takes a long time. + # Increase the timeout to 20 minutes (default is 5 minutes). 
+ compute_startup_scripts_timeout: $(20*60) + login_startup_scripts_timeout: $(20*60) + controller_startup_scripts_timeout: $(20*60) + resume_timeout: $(20*60) + deployment_groups: - group: primary modules: From 676ff42cd7e83d848ce5644144a4ac3e275cdd8c Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 29 Aug 2024 03:39:33 +0000 Subject: [PATCH 147/180] Don't set `automaticRestart: false` --- .../modules/slurm_files/scripts/resume.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index e669dd1dca..e20a7ed195 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -75,7 +75,6 @@ def instance_properties(nodeset, model, placement_group, labels=None): if placement_group: props.scheduling = { "onHostMaintenance": "TERMINATE", - "automaticRestart": False, } props.resourcePolicies = [placement_group] @@ -97,7 +96,6 @@ def instance_properties(nodeset, model, placement_group, labels=None): if policies: props.scheduling = { "onHostMaintenance": "TERMINATE", - "automaticRestart": False, } props.resourcePolicies = policies log.info( From 3a4a92c3d287bcbedb325d2bd5ddfee042dfe17c Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 29 Aug 2024 06:39:56 +0000 Subject: [PATCH 148/180] links in desciptions fixed --- modules/compute/gke-node-pool/README.md | 4 ++-- modules/compute/gke-node-pool/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 612ab7d1fc..23aad80f3a 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -276,8 +276,8 @@ No modules. | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index b3bfcc3091..9636be209b 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -93,7 +93,7 @@ variable "local_ssd_count_ephemeral_storage" { description = <<-EOT The number of local SSDs to attach to each node to back ephemeral storage. Uses NVMe interfaces. Must be supported by `machine_type`. - When set to null, default value either is [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value. + When set to null, default value either is [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value. [See above](#local-ssd-storage) for more info. EOT type = number @@ -104,7 +104,7 @@ variable "local_ssd_count_nvme_block" { description = <<-EOT The number of local SSDs to attach to each node to back block storage. Uses NVMe interfaces. Must be supported by `machine_type`. - When set to null, default value either is [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd?_gl=1*103lsfs*_ga*MTczMTAzMDM4My4xNzIyMzc1ODU5*_ga_WH2QY8WWF5*MTcyMzYzODIzOS4xOC4xLjE3MjM2MzgzNTEuNTAuMC4w#choose_number_local_ssds) or GKE decides about default value. + When set to null, default value either is [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value. [See above](#local-ssd-storage) for more info. EOT From 9d425a88c06db7a106bf3c4e21afb92b96cb200b Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 22 Aug 2024 10:16:12 +0000 Subject: [PATCH 149/180] Support named compact placement in GKE node pools --- modules/compute/gke-node-pool/README.md | 3 ++- modules/compute/gke-node-pool/main.tf | 4 ++-- modules/compute/gke-node-pool/variables.tf | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 95898248e4..f7a3589933 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,7 +267,8 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. 
| `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | DEPRECATED: Use `placement_policy_type` | `bool` | `null` | no | +| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | +| [compact\_placement\_policy](#input\_compact\_placement\_policy) | Name of the placement policy to use when compact\_placement is enabled.
It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies.
Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard | `string` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 81f48fc45a..916fc163a6 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -69,8 +69,8 @@ resource "google_container_node_pool" "node_pool" { dynamic "placement_policy" { for_each = var.placement_policy_type != null ? [1] : [] content { - type = var.placement_policy_type - policy_name = var.placement_policy_name + type = "COMPACT" + policy_name = var.compact_placement_policy } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 0bf4835cd4..ea4a6ca9d2 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -344,3 +344,13 @@ variable "specific_reservation" { values = null } } + +variable "compact_placement_policy" { + description = <<-EOT + Name of the placement policy to use when compact_placement is enabled. + It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies. + Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard + EOT + type = string + default = null +} From f605edca4b03f9f4021931053363de937f55edba Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 23 Aug 2024 12:56:16 +0000 Subject: [PATCH 150/180] allow generic placement --- modules/compute/gke-node-pool/README.md | 7 ++--- modules/compute/gke-node-pool/main.tf | 34 ++++++++++++++++++---- modules/compute/gke-node-pool/variables.tf | 31 +++++++------------- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index f7a3589933..972c6279f4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,8 +267,7 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | -| [compact\_placement\_policy](#input\_compact\_placement\_policy) | Name of the placement policy to use when compact\_placement is enabled.
It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies.
Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard | `string` | `null` | no | +| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes.
Use `placement_type` and `placement_policy` if you want to use the placement policy you created.
Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | @@ -281,8 +280,8 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy\_name](#input\_placement\_policy\_name) | Name of the placement policy to use when `placement_policy_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | -| [placement\_policy\_type](#input\_placement\_policy\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy_name`.
`COMPACT` is the only supported value currently. | `string` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Name of the placement policy to use when `placement_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | +| [placement\_type](#input\_placement\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`.
`COMPACT` is the only supported value currently.
Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 916fc163a6..ab5d6a6e8c 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -66,11 +66,33 @@ resource "google_container_node_pool" "node_pool" { max_unavailable = 1 } + # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. + # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. + # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. + # Those blueprints will stop working if we remove this block. dynamic "placement_policy" { for_each = var.placement_policy_type != null ? [1] : [] content { - type = "COMPACT" - policy_name = var.compact_placement_policy + type = "COMPACT" + } + } + + # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. + # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. + # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. + # Those blueprints will stop working if we remove this block. + dynamic "placement_policy" { + for_each = var.compact_placement ? [1] : [] + content { + type = "COMPACT" + } + } + + dynamic "placement_policy" { + for_each = (!var.compact_placement && try(contains(["COMPACT"], var.placement_type), false)) ? [1] : [] + content { + type = var.placement_type + policy_name = var.placement_policy } } @@ -204,13 +226,13 @@ resource "google_container_node_pool" "node_pool" { EOT } precondition { - condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) - error_message = "`COMPACT` is the only supported value for `placement_policy_type`." + condition = var.compact_placement || var.placement_type == null || try(contains(["COMPACT"], var.placement_type), false) + error_message = "`COMPACT` is the only supported value for `placement_type`." } precondition { - condition = var.placement_policy_type != null || (var.placement_policy_type == null && var.placement_policy_name == null) - error_message = "`placement_policy_type` needs to be set when specifying `placement_policy_name`" + condition = var.compact_placement || var.placement_type != null || (var.placement_type == null && var.placement_policy == null) + error_message = "`placement_type` needs to be set when specifying `placement_policy`" } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index ea4a6ca9d2..e614b4050c 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -173,28 +173,29 @@ variable "spot" { # tflint-ignore: terraform_unused_declarations variable "compact_placement" { - description = "DEPRECATED: Use `placement_policy_type`" + description = <<-EOT + Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. + Use `placement_type` and `placement_policy` if you want to use the placement policy you created. + Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` + EOT type = bool - default = null - validation { - condition = var.compact_placement == null - error_message = "`compact_placement` is deprecated. 
Use `placement_policy_type`" - } + default = false } -variable "placement_policy_type" { +variable "placement_type" { description = <<-EOT - Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy_name`. + Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`. `COMPACT` is the only supported value currently. + Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. EOT type = string default = null } -variable "placement_policy_name" { +variable "placement_policy" { description = <<-EOT - Name of the placement policy to use when `placement_policy_type` is set. + Name of the placement policy to use when `placement_type` is set. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies EOT @@ -344,13 +345,3 @@ variable "specific_reservation" { values = null } } - -variable "compact_placement_policy" { - description = <<-EOT - Name of the placement policy to use when compact_placement is enabled. - It is assumed that the specified policy exists. To create a compact placement policy refer to https://cloud.google.com/compute/docs/instances/use-compact-placement-policies. - Beware of the limitations of using compact placement with GKE https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#limitations-standard - EOT - type = string - default = null -} From f90c9c02bd1139fa7179118cb134a4895a3d3353 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Tue, 27 Aug 2024 10:15:49 +0000 Subject: [PATCH 151/180] No cross reference anymore. So variable validation --- modules/compute/gke-node-pool/main.tf | 4 ---- modules/compute/gke-node-pool/variables.tf | 5 ++++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index ab5d6a6e8c..0764ef8f5e 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -225,10 +225,6 @@ resource "google_container_node_pool" "node_pool" { On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. EOT } - precondition { - condition = var.compact_placement || var.placement_type == null || try(contains(["COMPACT"], var.placement_type), false) - error_message = "`COMPACT` is the only supported value for `placement_type`." - } precondition { condition = var.compact_placement || var.placement_type != null || (var.placement_type == null && var.placement_policy == null) diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index e614b4050c..907dc278f6 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -190,7 +190,10 @@ variable "placement_type" { EOT type = string default = null - + validation { + condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) + error_message = "`COMPACT` is the only supported value for `placement_policy_type`." 
+ } } variable "placement_policy" { From 08f54986c6f05f5a63220a728e19a5cfe174623d Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 28 Aug 2024 05:52:25 +0000 Subject: [PATCH 152/180] Make placement_policy setting an object --- modules/compute/gke-node-pool/README.md | 5 ++- modules/compute/gke-node-pool/main.tf | 29 ++--------------- modules/compute/gke-node-pool/variables.tf | 37 ++++++++++------------ 3 files changed, 22 insertions(+), 49 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 972c6279f4..7dde55eee7 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -267,7 +267,7 @@ No modules. | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes.
Use `placement_type` and `placement_policy` if you want to use the placement policy you created.
Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` | `bool` | `false` | no | +| [compact\_placement](#input\_compact\_placement) | DEPRECATED: Use `placement_policy` | `bool` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | @@ -280,8 +280,7 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Name of the placement policy to use when `placement_type` is set.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies | `string` | `null` | no | -| [placement\_type](#input\_placement\_type) | Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`.
`COMPACT` is the only supported value currently.
Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. | `string` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 0764ef8f5e..91b949d02f 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -71,28 +71,10 @@ resource "google_container_node_pool" "node_pool" { # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. # Those blueprints will stop working if we remove this block. dynamic "placement_policy" { - for_each = var.placement_policy_type != null ? [1] : [] + for_each = var.placement_policy.type != null ? [1] : [] content { - type = "COMPACT" - } - } - - # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. - # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. - # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. - # Those blueprints will stop working if we remove this block. - dynamic "placement_policy" { - for_each = var.compact_placement ? [1] : [] - content { - type = "COMPACT" - } - } - - dynamic "placement_policy" { - for_each = (!var.compact_placement && try(contains(["COMPACT"], var.placement_type), false)) ? [1] : [] - content { - type = var.placement_type - policy_name = var.placement_policy + type = var.placement_policy.type + policy_name = var.placement_policy.name } } @@ -225,11 +207,6 @@ resource "google_container_node_pool" "node_pool" { On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. EOT } - - precondition { - condition = var.compact_placement || var.placement_type != null || (var.placement_type == null && var.placement_policy == null) - error_message = "`placement_type` needs to be set when specifying `placement_policy`" - } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 907dc278f6..d1d9f9667e 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -173,37 +173,34 @@ variable "spot" { # tflint-ignore: terraform_unused_declarations variable "compact_placement" { - description = <<-EOT - Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. - Use `placement_type` and `placement_policy` if you want to use the placement policy you created. - Note that `compact_placement` might be deprecated in future in favour of `placement_type` and `placement_policy` - EOT + description = "DEPRECATED: Use `placement_policy`" type = bool - default = false -} - -variable "placement_type" { - description = <<-EOT - Type of the group placement to use for the node pool's nodes. This is used together with `placement_policy`. - `COMPACT` is the only supported value currently. - Note that `placement_type` and `placement_policy` take effect only when `compact_placement` is `false`. - EOT - type = string default = null validation { - condition = var.placement_policy_type == null || try(contains(["COMPACT"], var.placement_policy_type), false) - error_message = "`COMPACT` is the only supported value for `placement_policy_type`." + condition = var.compact_placement == null + error_message = "`compact_placement` is deprecated. 
Use `placement_policy` instead" } } variable "placement_policy" { description = <<-EOT - Name of the placement policy to use when `placement_type` is set. + Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies EOT - type = string - default = null + + type = object({ + type = string + name = optional(string) + }) + default = { + type = null + name = null + } + validation { + condition = var.placement_policy.type == null || try(contains(["COMPACT"], var.placement_policy.type), false) + error_message = "`COMPACT` is the only supported value for `placement_policy.type`." + } } variable "service_account_email" { From 5eed964b38384b4b8de952445d05f0a589e7a7b2 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 29 Aug 2024 09:24:16 +0000 Subject: [PATCH 153/180] Fix the merge --- modules/compute/gke-node-pool/main.tf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 91b949d02f..5482ce0277 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -66,10 +66,6 @@ resource "google_container_node_pool" "node_pool" { max_unavailable = 1 } - # NOTE: `placement_type` and `placement_policy` together can support compact placement of nodes in a node pool. - # So, one might wonder if we can remove this block. But to support backward compatibility this needs to be kept. - # By backward compatibility we mean that there maybe blueprints that might be using `compact_placement`. - # Those blueprints will stop working if we remove this block. dynamic "placement_policy" { for_each = var.placement_policy.type != null ? [1] : [] content { From fab2a0eb5ac4c7423fcd623520632ff1c613aa01 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 30 Aug 2024 06:19:14 +0000 Subject: [PATCH 154/180] Tone down the note --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 52e500d6ca..d53d402e4b 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -281,7 +281,7 @@ No modules. | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 3ce5b77a42..d33575f6c8 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -186,7 +186,7 @@ variable "placement_policy" { description = <<-EOT Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. - Beware of the restrictions for placement policies https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies + Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. EOT type = object({ From c713cfd7b50ccadd4afb0fb17b075a2593d3694f Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 30 Aug 2024 11:24:17 +0000 Subject: [PATCH 155/180] kubernetes provider added to gke-cluster module --- modules/scheduler/gke-cluster/README.md | 1 + modules/scheduler/gke-cluster/main.tf | 15 +++++++++++++-- modules/scheduler/gke-cluster/versions.tf | 4 ++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index e6e7f6f6de..d446cd8667 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -110,6 +110,7 @@ limitations under the License. | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | | [kubectl](#requirement\_kubectl) | >= 1.7.0 | +| [kubernetes](#requirement\_kubernetes) | ~> 2.23 | ## Providers diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 3faf514d10..bd74b9d812 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -306,6 +306,14 @@ resource "google_project_iam_member" "node_service_account_artifact_registry" { member = "serviceAccount:${local.sa_email}" } +data "google_client_config" "default" {} + +provider "kubernetes" { + host = "https://${google_container_cluster.gke_cluster.endpoint}" + cluster_ca_certificate = base64decode(google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) + token = data.google_client_config.default.access_token +} + module "workload_identity" { count = var.configure_workload_identity_sa ? 1 : 0 source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" @@ -317,6 +325,11 @@ module "workload_identity" { project_id = var.project_id roles = var.enable_gcsfuse_csi ? 
["roles/storage.admin"] : [] + providers = { + google = google + kubernetes = kubernetes + } + # https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/issues/1059 depends_on = [ data.google_compute_default_service_account.default_sa, @@ -324,8 +337,6 @@ module "workload_identity" { ] } -data "google_client_config" "default" {} - provider "kubectl" { host = "https://${google_container_cluster.gke_cluster.endpoint}" cluster_ca_certificate = base64decode(google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 23616de820..0f813deed4 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -24,6 +24,10 @@ terraform { source = "hashicorp/google-beta" version = "> 5.0" } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.23" + } kubectl = { source = "gavinbunney/kubectl" version = ">= 1.7.0" From 8a4724c60c69cfbcbdc15917df372c68b20f9efa Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 30 Aug 2024 11:47:44 +0000 Subject: [PATCH 156/180] explicit providers removed --- modules/scheduler/gke-cluster/main.tf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index bd74b9d812..efd2a30dde 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -325,11 +325,6 @@ module "workload_identity" { project_id = var.project_id roles = var.enable_gcsfuse_csi ? ["roles/storage.admin"] : [] - providers = { - google = google - kubernetes = kubernetes - } - # https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/issues/1059 depends_on = [ data.google_compute_default_service_account.default_sa, From dcfd1ce7c10a2e2886e445c762fc9802ba3f8171 Mon Sep 17 00:00:00 2001 From: annuay Date: Fri, 30 Aug 2024 13:23:54 +0000 Subject: [PATCH 157/180] validate machine type availability by zones --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 5 +++- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 27 ++++++++++++++++++- .../schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index cea9e8e862..40a286ea28 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -130,7 +130,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.3 | +| [terraform](#requirement\_terraform) | >= 1.4 | | [google](#requirement\_google) | >= 5.11 | ## Providers @@ -138,6 +138,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Version | |------|---------| | [google](#provider\_google) | >= 5.11 | +| [terraform](#provider\_terraform) | n/a | ## Modules @@ -147,8 +148,10 @@ No modules. 
| Name | Type | |------|------| +| [terraform_data.machine_type_zone_validation](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | +| [google_compute_machine_types.machine_types_by_zone](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_machine_types) | data source | | [google_compute_reservation.reservation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | | [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 491ea64419..699c6633e8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -77,7 +77,7 @@ locals { gpu = one(local.guest_accelerator) labels = local.labels - machine_type = var.machine_type + machine_type = terraform_data.machine_type_zone_validation.output metadata = local.metadata min_cpu_platform = var.min_cpu_platform @@ -175,3 +175,28 @@ data "google_compute_reservation" "reservation" { # Add a validation that if reservation.project != var.project_id it should be a shared reservation } } + +data "google_compute_machine_types" "machine_types_by_zone" { + for_each = local.zones + filter = format("name = \"%s\"", var.machine_type) + zone = each.value +} + +locals { + machine_types_by_zone = data.google_compute_machine_types.machine_types_by_zone + zones_with_machine_type = [for k, v in local.machine_types_by_zone : k if length(v.machine_types) > 0] +} + +resource "terraform_data" "machine_type_zone_validation" { + input = var.machine_type + lifecycle { + precondition { + condition = length(local.zones_with_machine_type) > 0 + error_message = <<-EOT + machine type ${var.machine_type} is not available in any of the zones ${jsonencode(local.zones)}". 
To list zones in which it is available, run: + + gcloud compute machine-types list --filter="name=${var.machine_type}" + EOT + } + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 38330af5d0..0197157037 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -15,7 +15,7 @@ */ terraform { - required_version = ">= 1.3" + required_version = ">= 1.4" required_providers { google = { From 6815db3d7bb945cb333efed33c7e775a45f1abe6 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 29 Aug 2024 14:50:39 +0000 Subject: [PATCH 158/180] Improved serial port collection tool --- tools/serial_port_collector.py | 69 ++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tools/serial_port_collector.py diff --git a/tools/serial_port_collector.py b/tools/serial_port_collector.py new file mode 100644 index 0000000000..7e1f78210e --- /dev/null +++ b/tools/serial_port_collector.py @@ -0,0 +1,69 @@ +#!/bin/python3 +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from google.cloud import compute_v1 +from argparse import ArgumentParser, RawTextHelpFormatter + +"""This tool collects serial port output and prints it to the terminal until +the VM is deleted or it hits the timeout (300s). It takes in, project, vm_name +and zone as arguments. The script should only print each line once, using the +line number of the previous serial port retrieval as the starting point of the +next request. 
+ +usage: serial_port_collector.py [-h] -p PROJECT -v VM_NAME -z ZONE [-t TIMEOUT] +""" + +def get_serial_port_output(host_name: str, project: str, zone: str, + start: int = 0) -> str: + # Create a client + client = compute_v1.InstancesClient() + # Initialize request argument(s) + request = compute_v1.GetSerialPortOutputInstanceRequest( + instance=host_name, + project=project, + zone=zone, + start=start, + ) + # Make the request + res = client.get_serial_port_output(request=request) + return res.contents, res.next_ + +if __name__ == "__main__": + parser = ArgumentParser(prog='serial_port_collector.py', + formatter_class=RawTextHelpFormatter) + parser.add_argument("-p", "--project", required=True, type=str, + help="Project where the vm is located") + parser.add_argument("-v", "--vm_name", required=True, type=str, + help="VM name to collect serial port output from") + parser.add_argument("-z", "--zone", required=True, type=str, + help="The zone the vm is located in") + parser.add_argument("-t", "--timeout", type=int, default = 0, + help="Timeout in seconds waiting for the next output "\ + "(values <= 0 are no timeout)") + + args = parser.parse_args() + to = args.timeout + + next=0 + sleep_timer = 2 + ts = time.time() + while to <= 0 or time.time()-ts < to: + out, next = get_serial_port_output(args.vm_name, args.project, + args.zone, next) + if len(out) > 0: + print(out) + ts = time.time() + time.sleep(sleep_timer) From a42fc9f04c275a14abbaa7832e4809831363d510 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 30 Aug 2024 18:16:57 +0000 Subject: [PATCH 159/180] Disable automatic updates in daos installation script --- .../file-system/parallelstore/scripts/install-daos-client.sh | 3 +++ .../scripts/install-daos-client.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index fd796bd5fd..9a027b7fb5 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -57,6 +57,9 @@ EOF echo "Unsupported RedHat / Rocky Linux system version ${OS_VERSION_MAJOR}. This script only supports version 8 and 9." exit 1 fi + + ## TODO: Remove disable automatic update script after issue is fixed. + /usr/bin/google_disable_automatic_updates dnf makecache # 2) Install daos-client diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index fd796bd5fd..9a027b7fb5 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -57,6 +57,9 @@ EOF echo "Unsupported RedHat / Rocky Linux system version ${OS_VERSION_MAJOR}. This script only supports version 8 and 9." exit 1 fi + + ## TODO: Remove disable automatic update script after issue is fixed. + /usr/bin/google_disable_automatic_updates dnf makecache # 2) Install daos-client From 142eee0737acc71968419ee36f665f523882e19c Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 21 Aug 2024 20:47:24 +0000 Subject: [PATCH 160/180] SlurmGCP. 
Do not add empty startup scripts --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 2 +- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 6 ++-- .../slurm_files.tf | 35 +++++++++---------- .../variables.tf | 6 ++-- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 40a286ea28..4871de7034 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -202,7 +202,7 @@ No modules. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | -| [startup\_script](#input\_startup\_script) | Startup script used by VMs in this nodeset.
NOTE: will be executed after `compute_startup_script` defined on controller module. | `string` | `"# no-op"` | no | +| [startup\_script](#input\_startup\_script) | Startup script used by VMs in this nodeset.
NOTE: will be executed after `compute_startup_script` defined on controller module. | `string` | `""` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 699c6633e8..bd0dbad454 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -49,7 +49,7 @@ locals { scopes = var.service_account_scopes } - ghpc_startup_script = [{ + ghpc_startup_script = length(var.startup_script) == 0 ? [] : [{ filename = "ghpc_nodeset_startup.sh" content = var.startup_script }] diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 0a53ef95e2..b67ed7cc9a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -478,7 +478,7 @@ variable "startup_script" { NOTE: will be executed after `compute_startup_script` defined on controller module. EOD type = string - default = "# no-op" + default = "" } variable "network_storage" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 30ee38d084..d8b495425d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -229,9 +229,9 @@ limitations under the License. | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | -| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | +| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `""` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | +| [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `""` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | @@ -265,7 +265,7 @@ limitations under the License. | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | | [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | -| [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | +| [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `""` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index c8a8eb8a1c..9452eb1bdd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -90,31 +90,30 @@ locals { local.daos_mount_runners, ) - daos_install_mount_script = { + daos_ss = length(local.daos_ns) == 0 ? [] : [{ filename = "ghpc_daos_mount.sh" - content = length(local.daos_ns) > 0 ? module.daos_network_storage_scripts[0].startup_script : "" - } + content = module.daos_network_storage_scripts[0].startup_script + }] + + additional_ss = concat([], local.daos_ss) } # SLURM FILES locals { - ghpc_startup_controller = { + controller_ghpc_ss = length(var.controller_startup_script) == 0 ? [] : [{ filename = "ghpc_startup.sh" - content = var.controller_startup_script - } - ghpc_startup_script_controller = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_controller] : [local.ghpc_startup_controller] + content = var.controller_startup_script }] + controller_ss = concat(local.additional_ss, local.controller_ghpc_ss) - ghpc_startup_login = { + login_ghpc_ss = length(var.login_startup_script) == 0 ? [] : [{ filename = "ghpc_startup.sh" - content = var.login_startup_script - } - ghpc_startup_script_login = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_login] : [local.ghpc_startup_login] + content = var.login_startup_script }] + login_ss = concat(local.additional_ss, local.login_ghpc_ss) - ghpc_startup_compute = { + compute_ghpc_ss = length(var.compute_startup_script) == 0 ? [] : [{ filename = "ghpc_startup.sh" - content = var.compute_startup_script - } - ghpc_startup_script_compute = length(local.daos_ns) > 0 ? 
[local.daos_install_mount_script, local.ghpc_startup_compute] : [local.ghpc_startup_compute] + content = var.compute_startup_script }] + compute_ss = concat(local.additional_ss, local.compute_ghpc_ss) nodeset_startup_scripts = { for k, v in local.nodeset_map : k => v.startup_script } } @@ -146,12 +145,12 @@ module "slurm_files" { one(google_secret_manager_secret_version.cloudsql_version[*].id), null) - controller_startup_scripts = local.ghpc_startup_script_controller + controller_startup_scripts = local.controller_ss controller_startup_scripts_timeout = var.controller_startup_scripts_timeout nodeset_startup_scripts = local.nodeset_startup_scripts - compute_startup_scripts = local.ghpc_startup_script_compute + compute_startup_scripts = local.compute_ss compute_startup_scripts_timeout = var.compute_startup_scripts_timeout - login_startup_scripts = local.ghpc_startup_script_login + login_startup_scripts = local.login_ss login_startup_scripts_timeout = var.login_startup_scripts_timeout enable_debug_logging = var.enable_debug_logging diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 26dfd21d49..e6bdc7e5a4 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -475,7 +475,7 @@ variable "cgroup_conf_tpl" { variable "controller_startup_script" { description = "Startup script used by the controller VM." type = string - default = "# no-op" + default = "" } variable "controller_startup_scripts_timeout" { @@ -493,7 +493,7 @@ EOD variable "login_startup_script" { description = "Startup script used by the login VMs." type = string - default = "# no-op" + default = "" } variable "login_startup_scripts_timeout" { @@ -511,7 +511,7 @@ EOD variable "compute_startup_script" { description = "Startup script used by the compute VMs." type = string - default = "# no-op" + default = "" } variable "compute_startup_scripts_timeout" { From c7a2095c34c68a958b57be04a18ea97ce5f8520d Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 24 Aug 2024 18:52:28 +0000 Subject: [PATCH 161/180] SlurmGCP. 
Fixes & improvements around config fetching --- .../modules/slurm_files/scripts/setup.py | 37 +++++++++---------- .../modules/slurm_files/scripts/util.py | 16 ++++++-- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 4bf86f176e..589cfeadef 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -278,27 +278,22 @@ def configure_mysql(): def configure_dirs(): for p in dirs.values(): util.mkdirp(p) - util.chown_slurm(dirs.slurm) - util.chown_slurm(dirs.scripts) - + + for p in (dirs.slurm, dirs.scripts, dirs.custom_scripts): + util.chown_slurm(p) + for p in slurmdirs.values(): util.mkdirp(p) util.chown_slurm(p) - etc_slurm = Path("/etc/slurm") - if etc_slurm.exists() and etc_slurm.is_symlink(): - etc_slurm.unlink() - etc_slurm.symlink_to(slurmdirs.etc) - - scripts_etc = dirs.scripts / "etc" - if scripts_etc.exists() and scripts_etc.is_symlink(): - scripts_etc.unlink() - scripts_etc.symlink_to(slurmdirs.etc) - - scripts_log = dirs.scripts / "log" - if scripts_log.exists() and scripts_log.is_symlink(): - scripts_log.unlink() - scripts_log.symlink_to(dirs.log) + for sl, tgt in ( # create symlinks + (Path("/etc/slurm"), slurmdirs.etc), + (dirs.scripts / "etc", slurmdirs.etc), + (dirs.scripts / "log", dirs.log), + ): + if sl.exists() and sl.is_symlink(): + sl.unlink() + sl.symlink_to(tgt) for f in ("sort_nodes.py",): # copy auxiliary scripts dst = Path(lookup().cfg.slurm_bin_dir) / f @@ -446,15 +441,19 @@ def setup_compute(): def main(): start_motd() + log.info("Starting setup, fetching config") sleep_seconds = 5 while True: try: _, cfg = util.fetch_config() util.update_config(cfg) break + except util.DeffetiveStoredConfigError as e: + log.warning(f"config is not ready yet: {e}, sleeping for {sleep_seconds}s") except Exception as e: - log.exception(f"could not fetch config, sleeping for {sleep_seconds}s") - time.sleep(sleep_seconds) + log.exception(f"unexpected error while fetching config, sleeping for {sleep_seconds}s") + time.sleep(sleep_seconds) + log.info("Config fetched") configure_dirs() # call the setup function for the instance type diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index da62bf5c33..cb17500d90 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -394,6 +394,13 @@ def storage_client() -> storage.Client: return storage.Client(client_options=ClientOptions(**co)) +class DeffetiveStoredConfigError(Exception): + """ + Raised when config can not be loaded and assembled from bucket + """ + pass + + def _fill_cfg_defaults(cfg: NSDict) -> NSDict: if not cfg.slurm_log_dir: cfg.slurm_log_dir = dirs.log @@ -451,7 +458,9 @@ def _list_config_blobs() -> Tuple[Any, str]: if blob.name.startswith(f"{common_prefix}/{key}_configs/"): res[key].append(blob) hash.update(blob.md5_hash.encode("utf-8")) - assert res["core"] is not None, "config.yaml not found in bucket" + + if res["core"] is None: + raise 
DeffetiveStoredConfigError("config.yaml not found in bucket") return res, hash.hexdigest() @@ -506,8 +515,9 @@ def _add_nodesets(yamls: List[Any], target: dict): # validate that configs for all referenced nodesets are present for p in cfg.partitions.values(): - for ns_name in p.partition_nodeset: - assert ns_name in ns_names, f"nodeset {ns_name} not defined in config" + for ns_name in chain(p.partition_nodeset, p.partition_nodeset_dyn, p.partition_nodeset_tpu): + if ns_name not in ns_names: + raise DeffetiveStoredConfigError(f"nodeset {ns_name} not defined in config") return _fill_cfg_defaults(cfg) From af5edec142cd434ed84c11184c552edc4f6d50ce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 30 Aug 2024 21:57:44 +0000 Subject: [PATCH 162/180] Bump golang.org/x/sys from 0.21.0 to 0.24.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.21.0 to 0.24.0. - [Commits](https://github.com/golang/sys/compare/v0.21.0...v0.24.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index d12ca04b35..516afbc8c8 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.24.0 // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.21.0 + golang.org/x/sys v0.24.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index 56278d6bb1..56067a6d0c 100644 --- a/go.sum +++ b/go.sum @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From 3e5c73ff45423bd9ab2fb82993d1264b50eab20e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 30 Aug 2024 22:11:06 +0000 Subject: [PATCH 163/180] Bump github.com/zclconf/go-cty from 1.14.4 to 1.15.0 Bumps [github.com/zclconf/go-cty](https://github.com/zclconf/go-cty) from 1.14.4 to 1.15.0. - [Release notes](https://github.com/zclconf/go-cty/releases) - [Changelog](https://github.com/zclconf/go-cty/blob/main/CHANGELOG.md) - [Commits](https://github.com/zclconf/go-cty/compare/v1.14.4...v1.15.0) --- updated-dependencies: - dependency-name: github.com/zclconf/go-cty dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 516afbc8c8..e9b3bf9234 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.1 - github.com/zclconf/go-cty v1.14.4 + github.com/zclconf/go-cty v1.15.0 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c diff --git a/go.sum b/go.sum index 56067a6d0c..75f26a9f0a 100644 --- a/go.sum +++ b/go.sum @@ -496,8 +496,8 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zclconf/go-cty v1.14.4 h1:uXXczd9QDGsgu0i/QFR/hzI5NYCHLf6NQw/atrbnhq8= -github.com/zclconf/go-cty v1.14.4/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zclconf/go-cty v1.15.0 h1:tTCRWxsexYUmtt/wVxgDClUe+uQusuI443uL6e+5sXQ= +github.com/zclconf/go-cty v1.15.0/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo= github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= From c376be825a3e06b371e49db0ed111f348384ebec Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Wed, 21 Aug 2024 23:03:32 +0000 Subject: [PATCH 164/180] add integration test for parallelstore in debian --- examples/pfs-parallelstore.yaml | 2 + .../scripts/install-daos-client.sh | 2 +- .../scripts/install-daos-client.sh | 2 +- .../daily-tests/builds/ps-vm-debian.yaml | 41 +++++++++++++++++++ .../daily-tests/tests/ps-vm-debian.yml | 29 +++++++++++++ 5 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml create mode 100644 tools/cloud-build/daily-tests/tests/ps-vm-debian.yml diff --git a/examples/pfs-parallelstore.yaml b/examples/pfs-parallelstore.yaml index eac758f660..1858556212 100644 --- a/examples/pfs-parallelstore.yaml +++ b/examples/pfs-parallelstore.yaml @@ -47,6 +47,7 @@ deployment_groups: use: [network, parallelstore] settings: name_prefix: debian + add_deployment_name_before_prefix: true instance_count: 1 instance_image: family: debian-12 @@ -59,6 +60,7 @@ deployment_groups: use: [network, parallelstore] settings: name_prefix: ubuntu + add_deployment_name_before_prefix: true instance_count: 1 instance_image: family: ubuntu-2204-lts diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index fd796bd5fd..5b12d4d4e5 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -100,7 +100,7 @@ if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION systemctl start daos_agent.service elif { [ "${OS_ID}" = "ubuntu" ] && [ "${OS_VERSION}" = "22.04" ]; } || { [ "${OS_ID}" = "debian" ] && [ "${OS_VERSION_MAJOR}" = "12" ]; }; then mkdir -p /var/run/daos_agent - 
daos_agent -o /etc/daos/daos_agent.yml & + daos_agent -o /etc/daos/daos_agent.yml >/dev/null 2>&1 & else echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index fd796bd5fd..5b12d4d4e5 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -100,7 +100,7 @@ if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION systemctl start daos_agent.service elif { [ "${OS_ID}" = "ubuntu" ] && [ "${OS_VERSION}" = "22.04" ]; } || { [ "${OS_ID}" = "debian" ] && [ "${OS_VERSION_MAJOR}" = "12" ]; }; then mkdir -p /var/run/daos_agent - daos_agent -o /etc/daos/daos_agent.yml & + daos_agent -o /etc/daos/daos_agent.yml >/dev/null 2>&1 & else echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 diff --git a/tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml b/tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml new file mode 100644 index 0000000000..97fd3b2a30 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.vpc +- m.private-service-access +- m.parallelstore +- m.vm-instance +- vm + +timeout: 14400s # 4hr +steps: +- id: parallelstore-vm + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/ps-vm-debian.yml" diff --git a/tools/cloud-build/daily-tests/tests/ps-vm-debian.yml b/tools/cloud-build/daily-tests/tests/ps-vm-debian.yml new file mode 100644 index 0000000000..91494b3820 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/ps-vm-debian.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +test_name: test-parallelstore-vm-debian +deployment_name: "parallelstore-vm-{{ build }}" +region: us-central1 +zone: us-central1-a +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/pfs-parallelstore.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-debian-0" +post_deploy_tests: +- test-validation/test-mounts.yml +custom_vars: + mounts: + - /parallelstore From e301d05eea798b9bbcb636ddaa6455701ab402bf Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 30 Aug 2024 17:05:49 -0700 Subject: [PATCH 165/180] Revert "Add machine type availability checks to slurm-gcp-v6-nodeset" --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 5 +--- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 27 +------------------ .../schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- 3 files changed, 3 insertions(+), 31 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 4871de7034..d0685567f0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -130,7 +130,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.4 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | >= 5.11 | ## Providers @@ -138,7 +138,6 @@ modules. For support with the underlying modules, see the instructions in the | Name | Version | |------|---------| | [google](#provider\_google) | >= 5.11 | -| [terraform](#provider\_terraform) | n/a | ## Modules @@ -148,10 +147,8 @@ No modules. 
| Name | Type | |------|------| -| [terraform_data.machine_type_zone_validation](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | -| [google_compute_machine_types.machine_types_by_zone](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_machine_types) | data source | | [google_compute_reservation.reservation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | | [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index bd0dbad454..d09929924d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -77,7 +77,7 @@ locals { gpu = one(local.guest_accelerator) labels = local.labels - machine_type = terraform_data.machine_type_zone_validation.output + machine_type = var.machine_type metadata = local.metadata min_cpu_platform = var.min_cpu_platform @@ -175,28 +175,3 @@ data "google_compute_reservation" "reservation" { # Add a validation that if reservation.project != var.project_id it should be a shared reservation } } - -data "google_compute_machine_types" "machine_types_by_zone" { - for_each = local.zones - filter = format("name = \"%s\"", var.machine_type) - zone = each.value -} - -locals { - machine_types_by_zone = data.google_compute_machine_types.machine_types_by_zone - zones_with_machine_type = [for k, v in local.machine_types_by_zone : k if length(v.machine_types) > 0] -} - -resource "terraform_data" "machine_type_zone_validation" { - input = var.machine_type - lifecycle { - precondition { - condition = length(local.zones_with_machine_type) > 0 - error_message = <<-EOT - machine type ${var.machine_type} is not available in any of the zones ${jsonencode(local.zones)}". To list zones in which it is available, run: - - gcloud compute machine-types list --filter="name=${var.machine_type}" - EOT - } - } -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 0197157037..38330af5d0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -15,7 +15,7 @@ */ terraform { - required_version = ">= 1.4" + required_version = ">= 1.3" required_providers { google = { From 6e5623d80e6c227731b3c53f5045eca0e92d2ae3 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Sat, 31 Aug 2024 00:23:33 +0000 Subject: [PATCH 166/180] Catch None fields in slurm job data. 
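
Slurm job records can contain fields whose value is the literal string
"None", which the per-field numeric and timestamp converters in
load_bq.py cannot parse. The guard added here skips such fields before
conversion so the row can still be built. A minimal, self-contained
sketch of the pattern follows; the converter table, schema, and job
record are illustrative stand-ins for this note, not the real load_bq
schema:

    # Sketch only: drop fields whose value is the literal string "None"
    # before applying per-field type converters.
    converters = {"INTEGER": int, "STRING": str}  # hypothetical converter table
    job_schema = {"job_id": "INTEGER", "exit_code": "INTEGER", "partition": "STRING"}
    job = {"job_id": "42", "exit_code": "None", "partition": "debug"}

    job_row = {
        name: converters[ftype](job[name])
        for name, ftype in job_schema.items()
        if name in job and job[name] != "None"  # guard added by this patch
    }

    assert "exit_code" not in job_row   # unparsable field is omitted
    print(job_row)                      # {'job_id': 42, 'partition': 'debug'}
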
--- .../modules/slurm_files/scripts/load_bq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index d48f1346a2..7c91e019d3 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -195,6 +195,7 @@ def make_job_row(job): field_name: dict.get(converters, field.field_type)(job[field_name]) for field_name, field in job_schema.items() if field_name in job + and job[field_name] != "None" } job_row["entry_uuid"] = uuid.uuid4().hex job_row["cluster_id"] = lookup().cfg.cluster_id From 9d405bbdd7a4bb513ca7d767044a9032afb6c560 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Sat, 31 Aug 2024 10:08:20 +0000 Subject: [PATCH 167/180] Fix for cleanup script. The last input is optional --- .../modules/cleanup_compute/scripts/cleanup_compute.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh index 51352f5989..671e7a0d27 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh @@ -22,8 +22,8 @@ universe_domain="$4" compute_endpoint_version="$5" gcloud_dir="$6" -if [[ $# -ne 5 ]]; then - echo "Usage: $0 " +if [[ $# -ne 5 ]] && [[ $# -ne 6 ]]; then + echo "Usage: $0 []" exit 1 fi From 3d8a272db23d916f3bae60e39cd8d38867b01142 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 10:40:38 +0000 Subject: [PATCH 168/180] Bump github.com/hashicorp/hcl/v2 from 2.21.0 to 2.22.0 Bumps [github.com/hashicorp/hcl/v2](https://github.com/hashicorp/hcl) from 2.21.0 to 2.22.0. - [Release notes](https://github.com/hashicorp/hcl/releases) - [Changelog](https://github.com/hashicorp/hcl/blob/main/CHANGELOG.md) - [Commits](https://github.com/hashicorp/hcl/compare/v2.21.0...v2.22.0) --- updated-dependencies: - dependency-name: github.com/hashicorp/hcl/v2 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e9b3bf9234..417a94e2b7 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/go-git/go-git/v5 v5.12.0 github.com/hashicorp/go-getter v1.7.6 github.com/hashicorp/hcl v1.0.0 // indirect - github.com/hashicorp/hcl/v2 v2.21.0 + github.com/hashicorp/hcl/v2 v2.22.0 github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d github.com/otiai10/copy v1.14.0 github.com/pkg/errors v0.9.1 diff --git a/go.sum b/go.sum index 75f26a9f0a..2cff77d322 100644 --- a/go.sum +++ b/go.sum @@ -391,8 +391,8 @@ github.com/hashicorp/hc-install v0.6.4 h1:QLqlM56/+SIIGvGcfFiwMY3z5WGXT066suo/v9 github.com/hashicorp/hc-install v0.6.4/go.mod h1:05LWLy8TD842OtgcfBbOT0WMoInBMUSHjmDx10zuBIA= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= -github.com/hashicorp/hcl/v2 v2.21.0 h1:lve4q/o/2rqwYOgUg3y3V2YPyD1/zkCLGjIV74Jit14= -github.com/hashicorp/hcl/v2 v2.21.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= +github.com/hashicorp/hcl/v2 v2.22.0 h1:hkZ3nCtqeJsDhPRFz5EA9iwcG1hNWGePOTw6oyul12M= +github.com/hashicorp/hcl/v2 v2.22.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d h1:g6kHlvZrFPFKeWRj5q/zyJA5gu7rlJGPf17h8hX7LHY= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d/go.mod h1:l8HcFPm9cQh6Q0KSWoYPiePqMvRFenybP1CH2MjKdlg= github.com/hashicorp/terraform-exec v0.21.0 h1:uNkLAe95ey5Uux6KJdua6+cv8asgILFVWkd/RG0D2XQ= From a0375ecd7b2921d82210124bcd69848db8a4b4a5 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Tue, 3 Sep 2024 07:42:16 +0000 Subject: [PATCH 169/180] revisit reservations interface --- modules/compute/gke-node-pool/README.md | 3 +- modules/compute/gke-node-pool/main.tf | 20 +++++++------ modules/compute/gke-node-pool/variables.tf | 33 ++++++++++------------ 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d53d402e4b..c5af96c390 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -283,11 +283,10 @@ No modules. | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resources to consume. When targeting SPECIFIC\_RESERVATION, the list of specific\_reservations needs be specified.
It is assumed that the specified reservations exist and they have available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | -| [specific\_reservation](#input\_specific\_reservation) | Reservation resources to consume when targeting SPECIFIC\_RESERVATION.
Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value.
It is assumed that the specified reservations exist and they have available capacity.
To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
key = string
values = list(string)
})
|
{
"key": null,
"values": null
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | | [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index df235442c7..460b640208 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -30,8 +30,10 @@ locals { effect = "NO_SCHEDULE" }] : [] - autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 - static_node_set = var.static_node_count != null + autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 + static_node_set = var.static_node_count != null + reservation_resource_api_label = "compute.googleapis.com/reservation-name" + specific_reservations_count = try(length(var.reservation_affinity.specific_reservations), 0) } data "google_compute_default_service_account" "default_sa" { @@ -159,9 +161,9 @@ resource "google_container_node_pool" "node_pool" { } reservation_affinity { - consume_reservation_type = var.reservation_type - key = var.specific_reservation.key - values = var.specific_reservation.values + consume_reservation_type = var.reservation_affinity.consume_reservation_type + key = local.specific_reservations_count != 1 ? null : local.reservation_resource_api_label + values = local.specific_reservations_count != 1 ? null : [for reservation in var.reservation_affinity.specific_reservations : reservation.name] } dynamic "host_maintenance_policy" { @@ -202,12 +204,12 @@ resource "google_container_node_pool" "node_pool" { } precondition { condition = ( - (var.reservation_type != "SPECIFIC_RESERVATION" && var.specific_reservation.key == null && var.specific_reservation.values == null) || - (var.reservation_type == "SPECIFIC_RESERVATION" && var.specific_reservation.key == "compute.googleapis.com/reservation-name" && var.specific_reservation.values != null) + (var.reservation_affinity.consume_reservation_type != "SPECIFIC_RESERVATION" && local.specific_reservations_count == 0) || + (var.reservation_affinity.consume_reservation_type == "SPECIFIC_RESERVATION" && local.specific_reservations_count == 1) ) error_message = <<-EOT - When using NO_RESERVATION or ANY_RESERVATION as the reservation type, `specific_reservation` cannot be set. - On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. + When using NO_RESERVATION or ANY_RESERVATION as the `consume_reservation_type`, `specific_reservations` cannot be set. + On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservations`. EOT } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index d33575f6c8..439de2a45e 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -318,34 +318,31 @@ variable "additional_networks" { })) } -variable "reservation_type" { - description = "Type of reservation to consume" - type = string - default = "NO_RESERVATION" - - validation { - condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_type) - error_message = "Accepted values are: {NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION}" - } -} - -variable "specific_reservation" { +variable "reservation_affinity" { description = <<-EOT - Reservation resources to consume when targeting SPECIFIC_RESERVATION. - Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value. 
+ Reservation resources to consume. When targeting SPECIFIC_RESERVATION, the list of specific_reservations needs be specified. It is assumed that the specified reservations exist and they have available capacity. + For a shared reservation, specify the project_id as well in which it was created. To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared EOT type = object({ - key = string - values = list(string) + consume_reservation_type = string + specific_reservations = optional(list(object({ + name = string + project = optional(string) + }))) }) default = { - key = null - values = null + consume_reservation_type = "NO_RESERVATION" + specific_reservations = [] + } + validation { + condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_affinity.consume_reservation_type) + error_message = "Accepted values are: {NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION}" } } + variable "host_maintenance_interval" { description = "Specifies the frequency of planned maintenance events." type = string From 33dad19ee9e577f1b1dd52c660eb3f84e2f8ecde Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 3 Sep 2024 12:25:45 -0700 Subject: [PATCH 170/180] Revert "SlurmGCP. Do not add empty startup scripts" --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 2 +- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 6 ++-- .../slurm_files.tf | 35 ++++++++++--------- .../variables.tf | 6 ++-- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index d0685567f0..cea9e8e862 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -199,7 +199,7 @@ No modules. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | -| [startup\_script](#input\_startup\_script) | Startup script used by VMs in this nodeset.
NOTE: will be executed after `compute_startup_script` defined on controller module. | `string` | `""` | no | +| [startup\_script](#input\_startup\_script) | Startup script used by VMs in this nodeset.
NOTE: will be executed after `compute_startup_script` defined on controller module. | `string` | `"# no-op"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index d09929924d..491ea64419 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -49,7 +49,7 @@ locals { scopes = var.service_account_scopes } - ghpc_startup_script = length(var.startup_script) == 0 ? [] : [{ + ghpc_startup_script = [{ filename = "ghpc_nodeset_startup.sh" content = var.startup_script }] diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index b67ed7cc9a..0a53ef95e2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -478,7 +478,7 @@ variable "startup_script" { NOTE: will be executed after `compute_startup_script` defined on controller module. EOD type = string - default = "" + default = "# no-op" } variable "network_storage" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index d8b495425d..30ee38d084 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -229,9 +229,9 @@ limitations under the License. | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | -| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `""` | no | +| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `""` | no | +| [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | @@ -265,7 +265,7 @@ limitations under the License. | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | | [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | -| [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `""` | no | +| [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 9452eb1bdd..c8a8eb8a1c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -90,30 +90,31 @@ locals { local.daos_mount_runners, ) - daos_ss = length(local.daos_ns) == 0 ? [] : [{ + daos_install_mount_script = { filename = "ghpc_daos_mount.sh" - content = module.daos_network_storage_scripts[0].startup_script - }] - - additional_ss = concat([], local.daos_ss) + content = length(local.daos_ns) > 0 ? module.daos_network_storage_scripts[0].startup_script : "" + } } # SLURM FILES locals { - controller_ghpc_ss = length(var.controller_startup_script) == 0 ? [] : [{ + ghpc_startup_controller = { filename = "ghpc_startup.sh" - content = var.controller_startup_script }] - controller_ss = concat(local.additional_ss, local.controller_ghpc_ss) + content = var.controller_startup_script + } + ghpc_startup_script_controller = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_controller] : [local.ghpc_startup_controller] - login_ghpc_ss = length(var.login_startup_script) == 0 ? [] : [{ + ghpc_startup_login = { filename = "ghpc_startup.sh" - content = var.login_startup_script }] - login_ss = concat(local.additional_ss, local.login_ghpc_ss) + content = var.login_startup_script + } + ghpc_startup_script_login = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_login] : [local.ghpc_startup_login] - compute_ghpc_ss = length(var.compute_startup_script) == 0 ? [] : [{ + ghpc_startup_compute = { filename = "ghpc_startup.sh" - content = var.compute_startup_script }] - compute_ss = concat(local.additional_ss, local.compute_ghpc_ss) + content = var.compute_startup_script + } + ghpc_startup_script_compute = length(local.daos_ns) > 0 ? 
[local.daos_install_mount_script, local.ghpc_startup_compute] : [local.ghpc_startup_compute] nodeset_startup_scripts = { for k, v in local.nodeset_map : k => v.startup_script } } @@ -145,12 +146,12 @@ module "slurm_files" { one(google_secret_manager_secret_version.cloudsql_version[*].id), null) - controller_startup_scripts = local.controller_ss + controller_startup_scripts = local.ghpc_startup_script_controller controller_startup_scripts_timeout = var.controller_startup_scripts_timeout nodeset_startup_scripts = local.nodeset_startup_scripts - compute_startup_scripts = local.compute_ss + compute_startup_scripts = local.ghpc_startup_script_compute compute_startup_scripts_timeout = var.compute_startup_scripts_timeout - login_startup_scripts = local.login_ss + login_startup_scripts = local.ghpc_startup_script_login login_startup_scripts_timeout = var.login_startup_scripts_timeout enable_debug_logging = var.enable_debug_logging diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index e6bdc7e5a4..26dfd21d49 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -475,7 +475,7 @@ variable "cgroup_conf_tpl" { variable "controller_startup_script" { description = "Startup script used by the controller VM." type = string - default = "" + default = "# no-op" } variable "controller_startup_scripts_timeout" { @@ -493,7 +493,7 @@ EOD variable "login_startup_script" { description = "Startup script used by the login VMs." type = string - default = "" + default = "# no-op" } variable "login_startup_scripts_timeout" { @@ -511,7 +511,7 @@ EOD variable "compute_startup_script" { description = "Startup script used by the compute VMs." type = string - default = "" + default = "# no-op" } variable "compute_startup_scripts_timeout" { From e4d49654a29d5449632266449e88577fb82548b6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Sep 2024 00:05:47 +0000 Subject: [PATCH 171/180] Bump cryptography from 42.0.4 to 43.0.1 in /community/front-end/ofe Bumps [cryptography](https://github.com/pyca/cryptography) from 42.0.4 to 43.0.1. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/42.0.4...43.0.1) --- updated-dependencies: - dependency-name: cryptography dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index f2764643c5..26756d670c 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -12,7 +12,7 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cryptography==42.0.4 +cryptography==43.0.1 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.6 From d86632515b1cb020e72747e9a0fe30eb50ca7e86 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 4 Sep 2024 04:18:25 +0000 Subject: [PATCH 172/180] revisit reservations interface --- modules/compute/gke-node-pool/README.md | 3 +- modules/compute/gke-node-pool/main.tf | 20 +++++++------ modules/compute/gke-node-pool/variables.tf | 33 ++++++++++------------ 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d53d402e4b..c5af96c390 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -283,11 +283,10 @@ No modules. | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_type](#input\_reservation\_type) | Type of reservation to consume | `string` | `"NO_RESERVATION"` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resources to consume. When targeting SPECIFIC\_RESERVATION, the list of specific\_reservations needs be specified.
It is assumed that the specified reservations exist and they have available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | -| [specific\_reservation](#input\_specific\_reservation) | Reservation resources to consume when targeting SPECIFIC\_RESERVATION.
Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value.
It is assumed that the specified reservations exist and they have available capacity.
To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
key = string
values = list(string)
})
|
{
"key": null,
"values": null
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | | [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index df235442c7..460b640208 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -30,8 +30,10 @@ locals { effect = "NO_SCHEDULE" }] : [] - autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 - static_node_set = var.static_node_count != null + autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 + static_node_set = var.static_node_count != null + reservation_resource_api_label = "compute.googleapis.com/reservation-name" + specific_reservations_count = try(length(var.reservation_affinity.specific_reservations), 0) } data "google_compute_default_service_account" "default_sa" { @@ -159,9 +161,9 @@ resource "google_container_node_pool" "node_pool" { } reservation_affinity { - consume_reservation_type = var.reservation_type - key = var.specific_reservation.key - values = var.specific_reservation.values + consume_reservation_type = var.reservation_affinity.consume_reservation_type + key = local.specific_reservations_count != 1 ? null : local.reservation_resource_api_label + values = local.specific_reservations_count != 1 ? null : [for reservation in var.reservation_affinity.specific_reservations : reservation.name] } dynamic "host_maintenance_policy" { @@ -202,12 +204,12 @@ resource "google_container_node_pool" "node_pool" { } precondition { condition = ( - (var.reservation_type != "SPECIFIC_RESERVATION" && var.specific_reservation.key == null && var.specific_reservation.values == null) || - (var.reservation_type == "SPECIFIC_RESERVATION" && var.specific_reservation.key == "compute.googleapis.com/reservation-name" && var.specific_reservation.values != null) + (var.reservation_affinity.consume_reservation_type != "SPECIFIC_RESERVATION" && local.specific_reservations_count == 0) || + (var.reservation_affinity.consume_reservation_type == "SPECIFIC_RESERVATION" && local.specific_reservations_count == 1) ) error_message = <<-EOT - When using NO_RESERVATION or ANY_RESERVATION as the reservation type, `specific_reservation` cannot be set. - On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservation.key` and `specific_reservation.values` to `compute.googleapis.com/reservation-name` and a list of reservation names respectively. + When using NO_RESERVATION or ANY_RESERVATION as the `consume_reservation_type`, `specific_reservations` cannot be set. + On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservations`. EOT } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index d33575f6c8..439de2a45e 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -318,34 +318,31 @@ variable "additional_networks" { })) } -variable "reservation_type" { - description = "Type of reservation to consume" - type = string - default = "NO_RESERVATION" - - validation { - condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_type) - error_message = "Accepted values are: {NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION}" - } -} - -variable "specific_reservation" { +variable "reservation_affinity" { description = <<-EOT - Reservation resources to consume when targeting SPECIFIC_RESERVATION. - Specify `compute.googleapis.com/reservation-name` as the key and the list of reservation names as the value. 
+ Reservation resources to consume. When targeting SPECIFIC_RESERVATION, the list of specific_reservations needs be specified. It is assumed that the specified reservations exist and they have available capacity. + For a shared reservation, specify the project_id as well in which it was created. To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared EOT type = object({ - key = string - values = list(string) + consume_reservation_type = string + specific_reservations = optional(list(object({ + name = string + project = optional(string) + }))) }) default = { - key = null - values = null + consume_reservation_type = "NO_RESERVATION" + specific_reservations = [] + } + validation { + condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_affinity.consume_reservation_type) + error_message = "Accepted values are: {NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION}" } } + variable "host_maintenance_interval" { description = "Specifies the frequency of planned maintenance events." type = string From db95b224eaf3d14af42da2fd306fe5e44d7cbf5c Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 4 Sep 2024 04:18:25 +0000 Subject: [PATCH 173/180] fixup! revisit reservations interface --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index c5af96c390..31a4561457 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -283,7 +283,7 @@ No modules. | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resources to consume. When targeting SPECIFIC\_RESERVATION, the list of specific\_reservations needs be specified.
It is assumed that the specified reservations exist and they have available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 439de2a45e..251031f108 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -320,10 +320,11 @@ variable "additional_networks" { variable "reservation_affinity" { description = <<-EOT - Reservation resources to consume. When targeting SPECIFIC_RESERVATION, the list of specific_reservations needs be specified. - It is assumed that the specified reservations exist and they have available capacity. + Reservation resource to consume. When targeting SPECIFIC_RESERVATION, specific_reservations needs be specified. + Even though specific_reservations is a list, only one reservation is allowed by the NodePool API. + It is assumed that the specified reservation exists and has available capacity. For a shared reservation, specify the project_id as well in which it was created. - To create reservations refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared + To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared EOT type = object({ consume_reservation_type = string @@ -342,7 +343,6 @@ variable "reservation_affinity" { } } - variable "host_maintenance_interval" { description = "Specifies the frequency of planned maintenance events." type = string From 2b25abc5774d8887e23092b37c736dc890e67c7b Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Wed, 4 Sep 2024 05:05:05 +0000 Subject: [PATCH 174/180] Return correct null type. --- .../modules/slurm_files/scripts/load_bq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 7c91e019d3..800202d2ea 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -50,6 +50,8 @@ def make_datetime(time_string): + if time_string == "None": + return None return datetime.strptime(time_string, SLURM_TIME_FORMAT).replace( tzinfo=timezone.utc ) @@ -195,7 +197,6 @@ def make_job_row(job): field_name: dict.get(converters, field.field_type)(job[field_name]) for field_name, field in job_schema.items() if field_name in job - and job[field_name] != "None" } job_row["entry_uuid"] = uuid.uuid4().hex job_row["cluster_id"] = lookup().cfg.cluster_id From 03f23e0a1b13a69d729c5a27ad33f9b5e95c1b2c Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 3 Sep 2024 23:49:02 +0000 Subject: [PATCH 175/180] Update debian default image in chrome-remote-desktop module --- .../remote-desktop/chrome-remote-desktop/README.md | 4 ++-- .../remote-desktop/chrome-remote-desktop/variables.tf | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index bdd44aa18f..610cdab231 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -85,11 +85,11 @@ No resources. 
| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is
name="debian-12-bookworm-v20240312" and project="debian-cloud".
NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues.

An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "debian-12-bookworm-v20240312",
"project": "debian-cloud"
}
| no | +| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is
name="debian-12-bookworm-v20240815" and project="debian-cloud".
NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues.

An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "debian-12-bookworm-v20240815",
"project": "debian-cloud"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | -| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | +| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | | [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface
**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `enable_public_ips` also do not apply
to network interfaces defined in this variable.
Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. | `string` | `"default"` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | diff --git a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf index 41916e70ff..df31cf5f34 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf @@ -58,7 +58,7 @@ variable "network_storage" { variable "instance_image" { description = <<-EOD Image used to build chrome remote desktop node. The default image is - name="debian-12-bookworm-v20240312" and project="debian-cloud". + name="debian-12-bookworm-v20240815" and project="debian-cloud". NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues. An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud". @@ -71,7 +71,7 @@ variable "instance_image" { type = map(string) default = { project = "debian-cloud" - name = "debian-12-bookworm-v20240312" + name = "debian-12-bookworm-v20240815" } } @@ -95,9 +95,9 @@ variable "auto_delete_boot_disk" { variable "name_prefix" { description = <<-EOT - An optional name for all VM and disk resources. - If not supplied, `deployment_name` will be used. - When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set, + An optional name for all VM and disk resources. + If not supplied, `deployment_name` will be used. + When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set, then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". EOT type = string From b304cf35af826370e555f9ab39477e16175d621b Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Fri, 6 Sep 2024 21:24:15 +0000 Subject: [PATCH 176/180] Use local-ssd for enroot temp space. Large container images imported through enroot can take up significant space in /tmp, which can exhaust space on the device. By setting this variable, we will use the localssd space allocated for the user. The choice of /mnt/localssd/${UID}/enroot instead of enroot/tmp is because the ENROOT_TEMP_PATH is not created by enroot by default, whereas the other ENROOT paths are. 
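For reference, a minimal sketch of the resulting per-user enroot configuration after
this change, assuming these lines end up in enroot's configuration file (commonly
/etc/enroot/enroot.conf); the destination file is defined by the surrounding blueprint
step and is not shown in this excerpt:

    # enroot scratch, cache, data, and temp space relocated to per-user local SSD
    ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime
    ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache
    ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data
    ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot

With ENROOT_TEMP_PATH pointing at the per-user local SSD directory, large container
images imported through enroot are unpacked there instead of under /tmp on the boot disk.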
--- examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 1 + .../a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml | 1 + examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index e14540f0ed..22a4a49e68 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -144,6 +144,7 @@ deployment_groups: ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot - type: ansible-local destination: configure_gpu_monitoring.yml content: | diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 6063b2eea8..42a823bf8e 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -147,6 +147,7 @@ deployment_groups: ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot EOT ### Install Pyxis if [ ! -f "/usr/local/lib/slurm/spank_pyxis.so" ]; then diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 04d78aa9d3..899fcd6037 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -166,6 +166,7 @@ deployment_groups: ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot - type: ansible-local destination: configure_gpu_monitoring.yml content: | From 657c43e4ff2ab14a4d8131358134bb58fcc66df7 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 10 Sep 2024 09:20:20 -0500 Subject: [PATCH 177/180] Ensure that HCLS blueprint uses official release of Toolkit modules --- examples/hcls-blueprint.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/hcls-blueprint.yaml b/examples/hcls-blueprint.yaml index ee55925236..271a9f3ba5 100644 --- a/examples/hcls-blueprint.yaml +++ b/examples/hcls-blueprint.yaml @@ -329,11 +329,11 @@ deployment_groups: partition_name: gpu - id: slurm_login - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] - id: slurm_controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network - compute_partition From c98e23ea0098ed65a910c73e1bba861cfc0dc370 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 10 Sep 2024 09:21:58 -0500 Subject: [PATCH 178/180] Ensure that Slurm TPU/MaxText blueprint uses official release of Toolkit modules --- community/examples/hpc-slurm6-tpu-maxtext.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/community/examples/hpc-slurm6-tpu-maxtext.yaml b/community/examples/hpc-slurm6-tpu-maxtext.yaml index 5e172cd5c2..b8a8121a5d 100644 --- a/community/examples/hpc-slurm6-tpu-maxtext.yaml +++ b/community/examples/hpc-slurm6-tpu-maxtext.yaml @@ -72,7 +72,7 @@ deployment_groups: python3 MaxText/train.py MaxText/configs/base.yml run_name= base_output_directory=${PWD}/output/ dataset_path= async_checkpointing=False attention= steps= - id: tpu_nodeset - source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: node_type: v4-8 @@ -88,7 +88,7 @@ deployment_groups: node_count_dynamic_max: 1 - id: tpu_partition - source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [tpu_nodeset] settings: partition_name: tpu @@ -110,14 +110,14 @@ deployment_groups: is_default: true - id: slurm_login - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] settings: enable_login_public_ips: true machine_type: n2-standard-16 - id: slurm_controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - tpu_partition - compute_partition From 68490a289bfc34ef0f77e700b06b0d572120bbf2 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 10 Sep 2024 09:22:54 -0500 Subject: [PATCH 179/180] Ensure that Slurm TPU blueprint uses official release of Toolkit modules --- community/examples/hpc-slurm6-tpu.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/community/examples/hpc-slurm6-tpu.yaml b/community/examples/hpc-slurm6-tpu.yaml index 0f6455884f..606dae3246 100644 --- a/community/examples/hpc-slurm6-tpu.yaml +++ b/community/examples/hpc-slurm6-tpu.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/vpc - id: tpu_nodeset - source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: node_type: v3-8 @@ -45,20 +45,20 @@ deployment_groups: node_count_dynamic_max: 1 - id: tpu_partition - source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [tpu_nodeset] settings: partition_name: tpu - id: slurm_login - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] settings: machine_type: n2-standard-4 enable_login_public_ips: true - id: slurm_controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - tpu_partition - slurm_login From 602b9892901b4c5d09af3bd5539b7a17c2256d5f Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 11 Sep 2024 15:00:05 -0700 Subject: [PATCH 180/180] Update version from v1.38.0 to v1.39.0 --- cmd/root.go | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- community/modules/compute/mig/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- 
.../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/network/private-service-access/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/gke-node-pool/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/file-system/gke-persistent-volume/versions.tf | 2 +- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/firewall-rules/versions.tf | 2 +- modules/network/pre-existing-subnetwork/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scheduler/gke-cluster/versions.tf | 2 +- modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 38 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 106be3bdf8..6ce5c00e5c 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.38.0", + Version: "v1.39.0", Annotations: annotation, } ) diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 4e4e500f30..9353a60ede 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.39.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index 2690b53cb0..0dd9be722b 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.39.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index e35c55bf3b..661022f0d9 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } 
provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.39.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index e4947c1420..b5c8bfd98d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.39.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index cc616cd258..b692ea8b17 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.39.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 783ba8e39a..d3669809df 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.39.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 38330af5d0..7963852a10 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.39.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index f0ea4295ce..51a4cedf2a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.39.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 37480f7cb9..c3e2e17f34 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ 
b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.39.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index d649bf0ea0..1dc92b754e 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.39.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 0d08aa7deb..52b3087016 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.39.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index d3e1124ef4..fb590f7a6f 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.39.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index 3569a93f37..635a858afd 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.39.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index c32a7e9ca6..a40251b1ea 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.38.0" + module_name = 
"blueprints/terraform/hpc-toolkit:service-enablement/v1.39.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index d59dc83874..af06c5ff3f 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.39.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index e9c1a1d319..2189a1688c 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.39.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 4473fa1c46..073d8a161a 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.39.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 60bdd4f8ac..f4bf842159 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.39.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 254362717d..604297e0b5 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.39.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index e3513e58be..c747ec3e35 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.39.0" } required_version = ">= 1.1" } diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 3c5bb6bf5d..ab68579d98 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.39.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 1a0fdfa215..f9ad93d88b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.39.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index 59e73842cf..4b18af8439 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.39.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index be6f5e82c9..5f7ffc9614 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.39.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index e4e02e4151..678429f568 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index fd0c4e2044..604ae8f58a 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.39.0" } } diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 2f42a5a83e..7aeba60707 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { 
- module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.39.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index c85733d7e1..dc34a97b6e 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index adb28ea217..8933fc8dde 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.39.0" } } diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 5f97cdab1b..a26bc82b5b 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index 485548fdc3..3518992c1a 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.39.0" } required_version = ">= 1.3" diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index d3524d92f0..55fd273890 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 9d9a57638f..ff1c892c78 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index 
d7a9b6cb1b..9eb2c44c89 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 0f813deed4..068fafd188 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -34,6 +34,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.39.0" } } diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 44e306d718..024682d0be 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -27,7 +27,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.39.0" } required_version = ">= 1.3" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 2c31eb6231..8d969b7d4f 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.39.0" } required_version = ">= 1.3"