From d69c365f2472b607162d1898b9bd14c68fa996a5 Mon Sep 17 00:00:00 2001 From: Viacheslav Kovalevskyi Date: Mon, 26 Mar 2018 10:28:22 -0700 Subject: [PATCH] Support of P3 is added. (#9) * Support of P3 is added. * regions list are updated. --- README.md | 10 ++++++- cfn-bootstrap/dl_cfn_setup.py | 4 +-- cfn-bootstrap/dl_cfn_setup_v2.py | 2 +- cfn-template/StackSetup.md | 2 +- cfn-template/deeplearning.template | 46 +++++++++++++++++++++--------- 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 0076d06..a68881d 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,14 @@ With this template, we continue with our mission to make [distributed deep learn ## What's New? We've updated the AWS CloudFormation Deep Learning template to add some exciting new features and capabilities. +### Mar 22 2018 + +* We now support 10 AWS regions - us-east-1, us-west-2, eu-west-1, us-east-2, ap-southeast-2, ap-northeast-1, ap-northeast-2, ap-south-1, eu-central-1, ap-southeast-1. + +* We now support p3 instances. + +### Older Release Notes + * We now support 5 AWS regions - us-east-1, us-east-2, us-west-2, eu-west-1 and ap-southeast-2. * We've enhanced the AWS CloudFormation Deep Learning template with automation that continues stack creation even if the provisioned number of worker instances falls short of the desired count. In the previous version of the template, if one of the worker instances failed to be provisioned, for example, if it a hit account limit, AWS CloudFormation rolled back the stack and required you to adjust your desired count and restart the stack creation process. The new template includes a function that automatically adjusts the count down and proceeds with setting up the rest of the cluster (stack). @@ -17,7 +25,7 @@ We've updated the AWS CloudFormation Deep Learning template to add some exciting * Amazon EFS allows sharing of code, data, and results across worker instances. 
* Using Amazon EFS doesn't degrade performance for densely packed files (for example, .rec files containing image data). -* We now support creating a cluster of instances running Ubuntu. See the [Ubuntu Deep Learning AMI](https://aws.amazon.com/marketplace/pp/B06VSPXKDX). +* We now support creating a cluster of instances running Ubuntu. See the [Ubuntu Deep Learning AMI](https://aws.amazon.com/marketplace/pp/B076TGJHY1). ## EC2 Cluster Architecture The following architecture diagram shows the EC2 cluster infrastructure. diff --git a/cfn-bootstrap/dl_cfn_setup.py b/cfn-bootstrap/dl_cfn_setup.py index cf93064..13d873e 100644 --- a/cfn-bootstrap/dl_cfn_setup.py +++ b/cfn-bootstrap/dl_cfn_setup.py @@ -47,7 +47,7 @@ AWS_DL_DEFAULT_USER = None EFS_MOUNT = None -AWS_GPU_INSTANCE_TYPES = [ "g2.2xlarge", "g2.8xlarge", "p2.xlarge", "p2.8xlarge", "p2.16xlarge" ] +AWS_GPU_INSTANCE_TYPES = [ "g3.4xlarge", "g3.8xlarge", "g3.16xlarge", "p2.xlarge", "p2.8xlarge", "p2.16xlarge", "p3.2xlarge", "p3.8xlarge", "p3.16xlarge" ] ''' Setup Logger and LogLevel @@ -433,4 +433,4 @@ def main(): sys.exit(1) if __name__ =='__main__': - main() \ No newline at end of file + main() diff --git a/cfn-bootstrap/dl_cfn_setup_v2.py b/cfn-bootstrap/dl_cfn_setup_v2.py index 8e2e607..41cbef3 100644 --- a/cfn-bootstrap/dl_cfn_setup_v2.py +++ b/cfn-bootstrap/dl_cfn_setup_v2.py @@ -48,7 +48,7 @@ EFS_MOUNT = None CFN_PATH = None -AWS_GPU_INSTANCE_TYPES = [ "g2.2xlarge", "g2.8xlarge", "p2.xlarge", "p2.8xlarge", "p2.16xlarge" ] +AWS_GPU_INSTANCE_TYPES = [ "g3.4xlarge", "g3.8xlarge", "g3.16xlarge", "p2.xlarge", "p2.8xlarge", "p2.16xlarge", "p3.2xlarge", "p3.8xlarge", "p3.16xlarge" ] ''' Setup Logger and LogLevel diff --git a/cfn-template/StackSetup.md b/cfn-template/StackSetup.md index b30d04e..4563ab4 100644 --- a/cfn-template/StackSetup.md +++ b/cfn-template/StackSetup.md @@ -30,7 +30,7 @@ If you need to scale the number of instances beyond the [default limit](https:// 7. 
Choose an **ImageType**, Amazon Linux or Ubuntu. -8. Choose an **InstanceType**, such as [P2.16xlarge](https://aws.amazon.com/ec2/instance-types/p2/). +8. Choose an **InstanceType**, such as [p3.2xlarge](https://aws.amazon.com/ec2/instance-types/p3/). 9. For **KeyName**, choose an EC2 key pair. diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index bb43f16..50de1f5 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -15,11 +15,14 @@ "InstanceType" : { "Description" : "The EC2 instance type for workers.For GPUs choose g2.xx or p2.xx", "Type" : "String", - "Default" : "p2.xlarge", + "Default" : "p3.2xlarge", "AllowedValues" : [ "p2.16xlarge", "p2.8xlarge", "p2.xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", "g2.8xlarge", "g2.2xlarge", "t2.small", @@ -107,18 +110,28 @@ }, "Mappings" : { "AmazonLinux" : { - "us-east-1" : { "AMI" : "ami-4b44745d" }, - "us-east-2" : { "AMI" : "ami-305d7c55" }, - "us-west-2" : { "AMI" : "ami-296e7850" }, - "eu-west-1" : { "AMI" : "ami-d36386aa" }, - "ap-southeast-2" : { "AMI" : "ami-52332031" } + "us-east-1" : { "AMI" : "ami-9706e5ea" }, + "us-west-2" : { "AMI" : "ami-dc70ffa4" }, + "eu-west-1" : { "AMI" : "ami-8caad3f5" }, + "us-east-2" : { "AMI" : "ami-f4586f91" }, + "ap-southeast-2" : { "AMI" : "ami-bbd710d9" }, + "ap-northeast-1" : { "AMI" : "ami-5ba3d93d" }, + "ap-northeast-2" : { "AMI" : "ami-d0d67bbe" }, + "ap-south-1" : { "AMI" : "ami-359ec25a" }, + "eu-central-1" : { "AMI" : "ami-ca3351a5" }, + "ap-southeast-1" : { "AMI" : "ami-ded39da2" } }, "Ubuntu" : { - "us-east-1" : { "AMI" : "ami-2edccb38" }, - "us-east-2" : { "AMI" : "ami-2797b642" }, - "us-west-2" : { "AMI" : "ami-7fd7c906" }, - "eu-west-1" : { "AMI" : "ami-19896660" }, - "ap-southeast-2" : { "AMI" : "ami-b32b37d0" } + "us-east-1" : { "AMI" : "ami-173bd86a" }, + "us-west-2" : { "AMI" : "ami-5a77f822" }, + "eu-west-1" : { "AMI" : "ami-2fb0c956" }, + "us-east-2" : { "AMI" : "ami-295b6c4c" 
}, + "ap-southeast-2" : { "AMI" : "ami-64d51206" }, + "ap-northeast-1" : { "AMI" : "ami-bcafd5da" }, + "ap-northeast-2" : { "AMI" : "ami-1ad17c74" }, + "ap-south-1" : { "AMI" : "ami-959fc3fa" }, + "eu-central-1" : { "AMI" : "ami-3a254755" }, + "ap-southeast-1" : { "AMI" : "ami-63d9971f" } }, "SubnetConfig" : { "VPC" : { "CIDR" : "10.0.0.0/16" }, @@ -127,10 +140,15 @@ }, "S3" : { "us-east-1" : { "URL" : "https://s3.amazonaws.com/" }, - "us-east-2" : { "URL" : "https://s3-us-east-2.amazonaws.com/" }, "us-west-2" : { "URL" : "https://s3-us-west-2.amazonaws.com/" }, "eu-west-1" : { "URL" : "https://s3-eu-west-1.amazonaws.com/" }, - "ap-southeast-2" : { "URL" : "https://s3-ap-southeast-2.amazonaws.com/" } + "us-east-2" : { "URL" : "https://s3-us-east-2.amazonaws.com/" }, + "ap-southeast-2" : { "URL" : "https://s3-ap-southeast-2.amazonaws.com/" }, + "ap-northeast-1" : { "URL" : "https://s3-ap-northeast-1.amazonaws.com/" }, + "ap-northeast-2" : { "URL" : "https://s3-ap-northeast-2.amazonaws.com/" }, + "ap-south-1" : { "URL" : "https://s3-ap-south-1.amazonaws.com/" }, + "eu-central-1" : { "URL" : "https://s3-eu-central-1.amazonaws.com/" }, + "ap-southeast-1" : { "URL" : "https://s3-ap-southeast-1.amazonaws.com/" } }, "Other" : { "S3SourceBucket" : { "BucketNameSuffix" : "-aws-dl-cfn" }, @@ -150,7 +168,7 @@ "Role": { "Fn::GetAtt" : ["LambdaExecutionRole", "Arn"] }, "Code": { "S3Bucket": {"Fn::Join" : ["", [{ "Ref" : "AWS::Region" }, { "Fn::FindInMap" : [ "Other", "S3SourceBucket", "BucketNameSuffix" ]} ] ]}, - "S3Key": { "Fn::FindInMap" : [ "Other", "LambdaFunction", "FileName" ]}, + "S3Key": { "Fn::FindInMap" : [ "Other", "LambdaFunction", "FileName" ]} }, "MemorySize" : "256", "Timeout": "60",