Skip to content

Commit

Permalink
Support 405B training with single/multi node
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 665503841
  • Loading branch information
Minwoo Park authored and copybara-github committed Aug 22, 2024
1 parent 830c954 commit 83995c6
Showing 1 changed file with 56 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
"\n",
"# @markdown **Only select and fill one of the following sections.**\n",
"# fmt: off\n",
"LOAD_MODEL_FROM = \"Hugging Face\" # @param [\"Hugging Face\", \"Google Cloud\"] {isTemplate:true}\n",
"LOAD_MODEL_FROM = \"Google Cloud\" # @param [\"Hugging Face\", \"Google Cloud\"] {isTemplate:true}\n",
"# fmt: on\n",
"\n",
"# @markdown ---\n",
Expand Down Expand Up @@ -319,22 +319,22 @@
"\n",
"# @markdown **Note**:\n",
"# @markdown 1. We recommend setting `finetuning_precision_mode` to `4bit` because it enables using fewer hardware resources for finetuning.\n",
"# @markdown 1. We recommend using NVIDIA_L4 for 8B models and NVIDIA_A100_80GB for 70B models.\n",
"# @markdown 1. We recommend using NVIDIA_A100_80GB for 8B and 70B models, and NVIDIA_H100_80GB for the 405B model.\n",
"# @markdown 1. If `max_steps>0`, it takes precedence over `epochs`. One can set a small `max_steps` value to quickly check the pipeline.\n",
"# @markdown 1. With the default setting, training takes between 1.5 and 2 hours.\n",
"\n",
"# The Llama 3.1 base model.\n",
"MODEL_ID = \"meta-llama/Meta-Llama-3.1-8B-Instruct\" # @param [\"meta-llama/Meta-Llama-3.1-8B\", \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"meta-llama/Meta-Llama-3.1-70B\", \"meta-llama/Meta-Llama-3.1-70B-Instruct\"] {isTemplate:true}\n",
"MODEL_ID = \"meta-llama/Meta-Llama-3.1-8B-Instruct\" # @param [\"meta-llama/Meta-Llama-3.1-8B\", \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"meta-llama/Meta-Llama-3.1-70B\", \"meta-llama/Meta-Llama-3.1-70B-Instruct\", \"meta-llama/Meta-Llama-3.1-405B\", \"meta-llama/Meta-Llama-3.1-405B-Instruct\"] {isTemplate:true}\n",
"if LOAD_MODEL_FROM == \"Google Cloud\":\n",
" base_model_id = os.path.join(MODEL_BUCKET, MODEL_ID.split(\"/\")[-1])\n",
"else:\n",
" base_model_id = MODEL_ID\n",
"\n",
"# The pre-built training docker image.\n",
"TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240724_0936_RC00\"\n",
"TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240822_0936_RC00\"\n",
"\n",
"# The accelerator to use.\n",
"accelerator_type = \"NVIDIA_A100_80GB\" # @param [\"NVIDIA_A100_80GB\"]\n",
"accelerator_type = \"NVIDIA_A100_80GB\" # @param [\"NVIDIA_A100_80GB\", \"NVIDIA_H100_80GB\"]\n",
"\n",
"# Batch size for finetuning.\n",
"per_device_train_batch_size = 1 # @param{type:\"integer\"}\n",
Expand All @@ -355,28 +355,61 @@
"lora_dropout = 0.05 # @param{type:\"number\"}\n",
"enable_gradient_checkpointing = True\n",
"attn_implementation = \"flash_attention_2\"\n",
"optimizer = \"paged_adamw_32bit\"\n",
"optimizer = \"adamw_torch\"\n",
"warmup_ratio = \"0.01\"\n",
"report_to = \"tensorboard\"\n",
"save_steps = 10\n",
"logging_steps = save_steps\n",
"\n",
"# Worker pool spec.\n",
"if accelerator_type == \"NVIDIA_A100_80GB\":\n",
" accelerator_count = 4\n",
" machine_type = \"a2-ultragpu-4g\"\n",
"machine_type = None\n",
"if \"405b\" in MODEL_ID.lower():\n",
" if accelerator_type == \"NVIDIA_H100_80GB\":\n",
" accelerator_count = 8\n",
" machine_type = \"a3-highgpu-8g\"\n",
" boot_disk_size_gb = 2000\n",
" merge_model_precision_mode = \"float8\"\n",
" else:\n",
" raise ValueError(\n",
" f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n",
" )\n",
"else:\n",
" raise ValueError(\n",
" f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n",
" )\n",
" if accelerator_type == \"NVIDIA_A100_80GB\":\n",
" accelerator_count = 4\n",
" machine_type = \"a2-ultragpu-4g\"\n",
" boot_disk_size_gb = 500\n",
" merge_model_precision_mode = \"float16\"\n",
" else:\n",
" raise ValueError(\n",
" f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n",
" )\n",
"\n",
"# The number of nodes to use for this worker pool in distributed training.\n",
"replica_count = 1 # @param{type:\"integer\"}\n",
"\n",
"# Set config file.\n",
"if \"405b\" in MODEL_ID.lower():\n",
" if replica_count > 4:\n",
" raise ValueError(\n",
" f\"Recommended config settings not found for replica_count: {replica_count}.\"\n",
" )\n",
" elif replica_count == 1:\n",
" config_file = \"vertex_vision_model_garden_peft/llama_fsdp_8gpu.yaml\"\n",
" else:\n",
" config_file = (\n",
" \"vertex_vision_model_garden_peft/\"\n",
" f\"llama_hsdp_{replica_count * accelerator_count}gpu.yaml\"\n",
" )\n",
"else:\n",
" raise ValueError(f\"Unsupported model ID or GCS path: {MODEL_ID}.\")\n",
"\n",
"replica_count = 1\n",
"\n",
"common_util.check_quota(\n",
" project_id=PROJECT_ID,\n",
" region=REGION,\n",
" accelerator_type=accelerator_type,\n",
" accelerator_count=accelerator_count,\n",
" accelerator_count=accelerator_count * replica_count,\n",
" is_for_training=True,\n",
")\n",
"\n",
Expand All @@ -400,7 +433,7 @@
"]\n",
"\n",
"train_job_args = [\n",
" \"--config_file=vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml\",\n",
" f\"--config_file={config_file}\",\n",
" \"--task=instruct-lora\",\n",
" \"--completion_only=True\",\n",
" f\"--pretrained_model_id={base_model_id}\",\n",
Expand All @@ -409,6 +442,7 @@
" f\"--instruct_column_in_dataset={instruct_column_in_dataset}\",\n",
" f\"--output_dir={lora_output_dir}\",\n",
" f\"--merge_base_and_lora_output_dir={merged_model_output_dir}\",\n",
" f\"--merge_model_precision_mode={merge_model_precision_mode}\",\n",
" f\"--per_device_train_batch_size={per_device_train_batch_size}\",\n",
" f\"--gradient_accumulation_steps={gradient_accumulation_steps}\",\n",
" f\"--lora_rank={lora_rank}\",\n",
Expand Down Expand Up @@ -451,7 +485,7 @@
" machine_type=machine_type,\n",
" accelerator_type=accelerator_type,\n",
" accelerator_count=accelerator_count,\n",
" boot_disk_size_gb=500,\n",
" boot_disk_size_gb=boot_disk_size_gb,\n",
" service_account=SERVICE_ACCOUNT,\n",
" tensorboard=tensorboard.resource_name,\n",
" base_output_dir=base_output_dir,\n",
Expand Down Expand Up @@ -485,10 +519,16 @@
" machine_type = \"g2-standard-12\"\n",
" accelerator_type = \"NVIDIA_L4\"\n",
" accelerator_count = 1\n",
"else:\n",
"elif \"70b\" in MODEL_ID.lower():\n",
" machine_type = \"g2-standard-96\"\n",
" accelerator_type = \"NVIDIA_L4\"\n",
" accelerator_count = 8\n",
"elif \"405b\" in MODEL_ID.lower():\n",
" machine_type = \"a3-highgpu-8g\"\n",
" accelerator_type = \"NVIDIA_H100_80GB\"\n",
" accelerator_count = 8\n",
"else:\n",
" raise ValueError(f\"Unsupported model ID or GCS path: {MODEL_ID}.\")\n",
"\n",
"common_util.check_quota(\n",
" project_id=PROJECT_ID,\n",
Expand Down

0 comments on commit 83995c6

Please sign in to comment.