diff --git a/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb b/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb index 554ef659d..d8b0be56d 100644 --- a/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb +++ b/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb @@ -192,7 +192,7 @@ "\n", "# @markdown **Only select and fill one of the following sections.**\n", "# fmt: off\n", - "LOAD_MODEL_FROM = \"Hugging Face\" # @param [\"Hugging Face\", \"Google Cloud\"] {isTemplate:true}\n", + "LOAD_MODEL_FROM = \"Google Cloud\" # @param [\"Hugging Face\", \"Google Cloud\"] {isTemplate:true}\n", "# fmt: on\n", "\n", "# @markdown ---\n", @@ -319,22 +319,22 @@ "\n", "# @markdown **Note**:\n", "# @markdown 1. We recommend setting `finetuning_precision_mode` to `4bit` because it enables using fewer hardware resources for finetuning.\n", - "# @markdown 1. We recommend using NVIDIA_L4 for 8B models and NVIDIA_A100_80GB for 70B models.\n", + "# @markdown 1. We recommend using NVIDIA_A100_80GB for 8B and 70B models, and NVIDIA_H100_80GB for 405B model.\n", "# @markdown 1. If `max_steps>0`, it will precedence over `epochs`. One can set a small `max_steps` value to quickly check the pipeline.\n", "# @markdown 1. 
With the default setting, training takes between 1.5 ~ 2 hours.\n", "\n", "# The Llama 3.1 base model.\n", - "MODEL_ID = \"meta-llama/Meta-Llama-3.1-8B-Instruct\" # @param [\"meta-llama/Meta-Llama-3.1-8B\", \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"meta-llama/Meta-Llama-3.1-70B\", \"meta-llama/Meta-Llama-3.1-70B-Instruct\"] {isTemplate:true}\n", + "MODEL_ID = \"meta-llama/Meta-Llama-3.1-8B-Instruct\" # @param [\"meta-llama/Meta-Llama-3.1-8B\", \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"meta-llama/Meta-Llama-3.1-70B\", \"meta-llama/Meta-Llama-3.1-70B-Instruct\", \"meta-llama/Meta-Llama-3.1-405B\", \"meta-llama/Meta-Llama-3.1-405B-Instruct\"] {isTemplate:true}\n", "if LOAD_MODEL_FROM == \"Google Cloud\":\n", " base_model_id = os.path.join(MODEL_BUCKET, MODEL_ID.split(\"/\")[-1])\n", "else:\n", " base_model_id = MODEL_ID\n", "\n", "# The pre-built training docker image.\n", - "TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240724_0936_RC00\"\n", + "TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240822_0936_RC00\"\n", "\n", "# The accelerator to use.\n", - "accelerator_type = \"NVIDIA_A100_80GB\" # @param [\"NVIDIA_A100_80GB\"]\n", + "accelerator_type = \"NVIDIA_A100_80GB\" # @param [\"NVIDIA_A100_80GB\", \"NVIDIA_H100_80GB\"]\n", "\n", "# Batch size for finetuning.\n", "per_device_train_batch_size = 1 # @param{type:\"integer\"}\n", @@ -355,20 +355,53 @@ "lora_dropout = 0.05 # @param{type:\"number\"}\n", "enable_gradient_checkpointing = True\n", "attn_implementation = \"flash_attention_2\"\n", - "optimizer = \"paged_adamw_32bit\"\n", + "optimizer = \"adamw_torch\"\n", "warmup_ratio = \"0.01\"\n", "report_to = \"tensorboard\"\n", "save_steps = 10\n", "logging_steps = save_steps\n", "\n", "# Worker pool spec.\n", - "if accelerator_type == \"NVIDIA_A100_80GB\":\n", - " accelerator_count = 4\n", - " machine_type = \"a2-ultragpu-4g\"\n", + "machine_type = 
None\n", + "if \"405b\" in MODEL_ID.lower():\n", + " if accelerator_type == \"NVIDIA_H100_80GB\":\n", + " accelerator_count = 8\n", + " machine_type = \"a3-highgpu-8g\"\n", + " boot_disk_size_gb = 2000\n", + " merge_model_precision_mode = \"float8\"\n", + " else:\n", + " raise ValueError(\n", + " f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n", + " )\n", "else:\n", - " raise ValueError(\n", - " f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n", - " )\n", + " if accelerator_type == \"NVIDIA_A100_80GB\":\n", + " accelerator_count = 4\n", + " machine_type = \"a2-ultragpu-4g\"\n", + " boot_disk_size_gb = 500\n", + " merge_model_precision_mode = \"float16\"\n", + " else:\n", + " raise ValueError(\n", + " f\"Recommended machine settings not found for: {accelerator_type}. 
To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n", + "        )\n", + "\n", + "# The number of nodes to use for this worker pool in distributed training.\n", + "replica_count = 1 # @param{type:\"integer\"}\n", + "\n", + "# Set config file.\n", + "if \"405b\" in MODEL_ID.lower():\n", + "    if replica_count > 4:\n", + "        raise ValueError(\n", + "            f\"Recommended config settings not found for replica_count: {replica_count}.\"\n", + "        )\n", + "    elif replica_count == 1:\n", + "        config_file = \"vertex_vision_model_garden_peft/llama_fsdp_8gpu.yaml\"\n", + "    else:\n", + "        config_file = (\n", + "            \"vertex_vision_model_garden_peft/\"\n", + "            f\"llama_hsdp_{replica_count * accelerator_count}gpu.yaml\"\n", + "        )\n", + "else:\n", + "    config_file = \"vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml\"\n", "\n", "replica_count = 1\n", "\n", @@ -376,7 +409,7 @@ " project_id=PROJECT_ID,\n", " region=REGION,\n", " accelerator_type=accelerator_type,\n", - " accelerator_count=accelerator_count,\n", + " accelerator_count=accelerator_count * replica_count,\n", " is_for_training=True,\n", ")\n", "\n", @@ -400,7 +433,7 @@ "]\n", "\n", "train_job_args = [\n", - " \"--config_file=vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml\",\n", + " f\"--config_file={config_file}\",\n", " \"--task=instruct-lora\",\n", " \"--completion_only=True\",\n", " f\"--pretrained_model_id={base_model_id}\",\n", @@ -409,6 +442,7 @@ " f\"--instruct_column_in_dataset={instruct_column_in_dataset}\",\n", " f\"--output_dir={lora_output_dir}\",\n", " f\"--merge_base_and_lora_output_dir={merged_model_output_dir}\",\n", + " f\"--merge_model_precision_mode={merge_model_precision_mode}\",\n", " f\"--per_device_train_batch_size={per_device_train_batch_size}\",\n", " f\"--gradient_accumulation_steps={gradient_accumulation_steps}\",\n", 
f\"--lora_rank={lora_rank}\",\n", @@ -451,7 +485,7 @@ " machine_type=machine_type,\n", " accelerator_type=accelerator_type,\n", " accelerator_count=accelerator_count,\n", - " boot_disk_size_gb=500,\n", + " boot_disk_size_gb=boot_disk_size_gb,\n", " service_account=SERVICE_ACCOUNT,\n", " tensorboard=tensorboard.resource_name,\n", " base_output_dir=base_output_dir,\n", @@ -485,10 +519,16 @@ " machine_type = \"g2-standard-12\"\n", " accelerator_type = \"NVIDIA_L4\"\n", " accelerator_count = 1\n", - "else:\n", + "elif \"70b\" in MODEL_ID.lower():\n", " machine_type = \"g2-standard-96\"\n", " accelerator_type = \"NVIDIA_L4\"\n", " accelerator_count = 8\n", + "elif \"405b\" in MODEL_ID.lower():\n", + " machine_type = \"a3-highgpu-8g\"\n", + " accelerator_type = \"NVIDIA_H100_80GB\"\n", + " accelerator_count = 8\n", + "else:\n", + " raise ValueError(f\"Unsupported model ID or GCS path: {MODEL_ID}.\")\n", "\n", "common_util.check_quota(\n", " project_id=PROJECT_ID,\n",