Skip to content

Commit

Permalink
PLAT-6183: Set up containerd certs.d support on GPU nodes (#128)
Browse files Browse the repository at this point in the history
* Set up containerd certs.d support on GPU nodes

Note: This is already set up in the regular EKS 1.24 AMI.

* Pin all test artifacts to 0.0.0
* Added a 'no-deploy-needed' label to skip the deploy test when it is not needed
  • Loading branch information
Secretions authored Jan 26, 2023
1 parent 6c1a19c commit a528d9a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 5 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ jobs:
env:
AWS_ACCESS_KEY_ID: ${{ secrets.DOMINO_ARTIFACTS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DOMINO_ARTIFACTS_ACCESS_KEY }}
DOMINO_CDK_VERSION: ${{ github.sha }}
DOMINO_CDK_VERSION: "0.0.0+${{ github.sha }}"
DATEDIR: "date +%Y%m%d"
run: |
cd ..
Expand Down Expand Up @@ -167,7 +167,7 @@ jobs:
sleep 120 # Immediate destroy after cdk deploy causes race conditions + give k8s time to deprovision after domino Uninstall
cdk destroy --force
- name: Fail without deploy
if: ${{ github.event.pull_request.draft == false && ! (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') }}
if: ${{ github.event.pull_request.draft == false && ! (contains(github.event.pull_request.labels.*.name, 'deploy-test') || contains(github.event.pull_request.labels.*.name, 'no-deploy-needed') || github.ref == 'refs/heads/master') }}
run: |
echo "Deploy tests required on non-draft PRs. Please add 'deploy-test' label".
exit 1
24 changes: 21 additions & 3 deletions cdk/domino_cdk/provisioners/eks/eks_nodegroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def provision_managed_nodegroup(
machine_image: Optional[ec2.IMachineImage] = (
ec2.MachineImage.generic_linux({region: ng.ami_id}) if ng.ami_id else None
)
mime_user_data: Optional[ec2.UserData] = self._handle_user_data(name, ng.ami_id, ng.ssm_agent, [ng.user_data])
mime_user_data: Optional[ec2.UserData] = self._handle_user_data(
name, ng.ami_id, False, ng.ssm_agent, [ng.user_data]
)

lt = self._launch_template(
self.cluster,
Expand Down Expand Up @@ -167,7 +169,9 @@ def provision_unmanaged_nodegroup(
).items():
cdk.Tags.of(asg).add(str(k), str(v), apply_to_launched_instances=True)

mime_user_data = self._handle_user_data(name, ng.ami_id, ng.ssm_agent, [ng.user_data, asg.user_data])
mime_user_data = self._handle_user_data(
name, ng.ami_id, ng.gpu, ng.ssm_agent, [ng.user_data, asg.user_data]
)

if not cfn_lt:
lt = self._launch_template(
Expand Down Expand Up @@ -239,7 +243,7 @@ def provision_unmanaged_nodegroup(
self.cluster.connect_auto_scaling_group_capacity(asg, **options)

def _handle_user_data(
self, name: str, custom_ami: bool, ssm_agent: bool, user_data_list: List[Union[ec2.UserData, str]]
self, name: str, custom_ami: bool, gpu: bool, ssm_agent: bool, user_data_list: List[Union[ec2.UserData, str]]
) -> Optional[ec2.UserData]:
mime_user_data = ec2.MultipartUserData()

Expand All @@ -254,6 +258,20 @@ def _handle_user_data(
),
)

if gpu:
mime_user_data.add_part(
ec2.MultipartBody.from_user_data(
ec2.UserData.custom(
'EKS_CONTAINERD_CFG="/etc/eks/containerd/containerd-config.toml"\n'
'if [ -z "$(egrep \'certs\.d\' $EKS_CONTAINERD_CFG)" ]; then\n'
' if [ -n "$(egrep \'plugins\.cri\.containerd\.runtimes\.nvidia\' $EKS_CONTAINERD_CFG)" ]; then\n'
' printf \'\n\n[plugins.cri.registry]\nconfig_path = "/etc/containerd/certs.d:/etc/docker/certs.d"\n\' >> $EKS_CONTAINERD_CFG\n'
' fi\n'
'fi\n'
)
)
)

# if not custom AMI, we can install ssm agent. If requested.
if ssm_agent:
mime_user_data.add_part(
Expand Down

0 comments on commit a528d9a

Please sign in to comment.