NouamaneTazi committed Nov 27, 2024
1 parent e3b886c commit c6a2cc6
Showing 2 changed files with 237 additions and 0 deletions.
57 changes: 57 additions & 0 deletions run_multinode.sh
@@ -0,0 +1,57 @@
#!/bin/bash

#SBATCH --job-name=smolm2-bench # Job name
#SBATCH --time=00:15:00
#SBATCH --partition=hopper-prod
#SBATCH --qos=high

#SBATCH -o /fsx/nouamane/projects/nanotron/logs/%x-%j.out

#SBATCH --nodes=2 # Number of nodes (modify as needed)
#SBATCH --ntasks-per-node=1 # Number of tasks per node
#SBATCH --cpus-per-task=80 # CPU cores per task
#SBATCH --gres=gpu:8 # Number of GPUs per node
#SBATCH --exclusive # Exclusive use of nodes

set -x -e

# Load any necessary modules for your system
source /etc/profile.d/modules.sh # the `module` command is not loaded by default in this batch shell
module load cuda/12.1

# Activate your conda environment if needed
source /fsx/nouamane/miniconda/bin/activate
conda activate 2-1-cu121
export PATH=/fsx/nouamane/miniconda/envs/2-1-cu121/bin:$PATH

# Get the node names from SLURM
export NODELIST=`scontrol show hostnames $SLURM_JOB_NODELIST`
export MASTER_NODE=`scontrol show hostnames $SLURM_JOB_NODELIST | head -n1`
export MASTER_PORT=12356

# Calculate total number of processes
export NNODES=$SLURM_NNODES
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES * $GPUS_PER_NODE))

# Set some environment variables for better distributed training
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_DEBUG=INFO

# Nanotron specific
export NANOTRON_BENCHMARK=1

# Print some debugging information
echo "Master node: $MASTER_NODE"
echo "All nodes: $NODELIST"
echo "World size: $WORLD_SIZE"

# Launch the training script using srun
srun torchrun \
    --nnodes=$NNODES \
    --nproc_per_node=$GPUS_PER_NODE \
    --rdzv_id=$SLURM_JOB_ID \
    --rdzv_backend=c10d \
    --rdzv_endpoint=$MASTER_NODE:$MASTER_PORT \
    run_train.py \
    --config-file examples/config_tiny_llama.yaml
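
The base script can also be submitted on its own. A minimal sketch, assuming the paths and partition in the #SBATCH header exist on your cluster:

    # Submit as-is: 2 nodes x 8 GPUs = 16 ranks
    sbatch run_multinode.sh

    # sbatch command-line options override the #SBATCH directives in the file;
    # NNODES and WORLD_SIZE adapt automatically because the script reads $SLURM_NNODES.
    sbatch --nodes=4 --time=00:30:00 run_multinode.sh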
180 changes: 180 additions & 0 deletions scaling_benchmarks.py
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
import argparse
import math
import os

import yaml


def create_config(
    dp: int,
    tp: int,
    pp: int,
    batch_accum: int,
    seq_len: int,
    micro_batch_size: int = 1,
    base_config_path: str = "examples/config_tiny_llama.yaml",
) -> dict:
    """Create a config with the specified parallelism settings."""
    # Load base config
    if not os.path.exists(base_config_path):
        raise FileNotFoundError(f"Base config file not found: {base_config_path}")

    with open(base_config_path) as f:
        config = yaml.safe_load(f)

    # Modify parallelism settings
    config["parallelism"]["dp"] = dp
    config["parallelism"]["tp"] = tp
    config["parallelism"]["pp"] = pp

    # Modify batch and sequence settings
    config["tokens"]["batch_accumulation_per_replica"] = batch_accum
    config["tokens"]["sequence_length"] = seq_len
    config["tokens"]["micro_batch_size"] = micro_batch_size

    # Update run name to reflect configuration
    config["general"]["run"] = f"dp{dp}_tp{tp}_pp{pp}_acc{batch_accum}_mbs{micro_batch_size}_seq{seq_len}"

    # Update benchmark CSV path
    config["general"]["benchmark_csv_path"] = "bench.csv"

    return config


def generate_slurm_script(
    config: dict,
    dp: int,
    tp: int,
    pp: int,
    time: str = "00:15:00",
    partition: str = "hopper-prod",
    base_script_path: str = "run_multinode.sh",
) -> str:
    """Generate a SLURM script for the given configuration."""
    # Check if base script exists
    if not os.path.exists(base_script_path):
        raise FileNotFoundError(f"Base script file not found: {base_script_path}")

    # Load base script
    with open(base_script_path) as f:
        script = f.read()

    # Calculate required number of nodes
    gpus_per_node = 8
    total_gpus_needed = dp * tp * pp
    num_nodes = math.ceil(total_gpus_needed / gpus_per_node)

    # Replace SLURM parameters
    replacements = {
        "--nodes=2": f"--nodes={num_nodes}",
        "--time=00:15:00": f"--time={time}",
        "--partition=hopper-prod": f"--partition={partition}",
        "--job-name=smolm2-bench": f"--job-name=bench_{config['general']['run']}",
        "examples/config_tiny_llama.yaml": f"benchmark/configs/config_{config['general']['run']}.yaml",
    }

    for old, new in replacements.items():
        if old not in script:
            print(f"Warning: Could not find '{old}' in base script")
        script = script.replace(old, new)

    return script


def main():
    parser = argparse.ArgumentParser(description="Run scaling benchmarks with different parallelism configurations")
    parser.add_argument(
        "--configs-dir", type=str, default="benchmark/configs", help="Directory to store generated configs"
    )
    parser.add_argument(
        "--scripts-dir", type=str, default="benchmark/scripts", help="Directory to store generated SLURM scripts"
    )
    parser.add_argument("--partition", type=str, default="hopper-prod", help="SLURM partition to use")
    parser.add_argument("--time", type=str, default="00:15:00", help="Time limit for each job")
    parser.add_argument(
        "--base-config", type=str, default="examples/config_tiny_llama.yaml", help="Base configuration file to use"
    )
    parser.add_argument("--base-script", type=str, default="run_multinode.sh", help="Base SLURM script to use")
    parser.add_argument("--run", action="store_true", help="Automatically submit all generated SLURM scripts")
    args = parser.parse_args()

    # Validate input files exist
    if not os.path.exists(args.base_config):
        raise FileNotFoundError(f"Base config file not found: {args.base_config}")
    if not os.path.exists(args.base_script):
        raise FileNotFoundError(f"Base script file not found: {args.base_script}")

    # Create directories if they don't exist
    for directory in [args.configs_dir, args.scripts_dir]:
        os.makedirs(directory, exist_ok=True)

    # Define configurations to test
    configurations = [
        # (dp, tp, pp, batch_accum, seq_len, mbs)
        # (1, 8, 1, 1, 2048, 1),  # Base configuration
        # (2, 4, 1, 1, 2048, 1),
        # (8, 1, 1, 1, 2048, 1),
        # (16, 1, 1, 1, 2048, 1),
        # *[(2**i, 1, 1, 1, 2048, 1) for i in range(3, 8)],
        # Sweep data parallelism over 1, 2, 4, ..., 64 replicas at micro batch size 8,
        # once with tp=1 and once with tp=8
        *[(2**i, 1, 1, 1, 2048, 8) for i in range(0, 7)],
        *[(2**i, 8, 1, 1, 2048, 8) for i in range(0, 7)],
    ]

    # Validate configurations
    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
        total_gpus = dp * tp * pp
        if total_gpus > 64:  # Assuming maximum of 8 nodes with 8 GPUs each
            print(
                f"Warning: Configuration dp={dp}, tp={tp}, pp={pp} requires {total_gpus} GPUs, which might be too many"
            )

    # Generate configs and scripts
    generated_scripts = []  # Keep track of generated script paths
    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
        try:
            # Create config
            config = create_config(dp, tp, pp, batch_accum, seq_len, mbs, base_config_path=args.base_config)

            # Save config
            config_path = os.path.join(args.configs_dir, f"config_{config['general']['run']}.yaml")
            with open(config_path, "w") as f:
                yaml.dump(config, f, default_flow_style=False)

            # Generate and save SLURM script
            script = generate_slurm_script(
                config, dp, tp, pp, time=args.time, partition=args.partition, base_script_path=args.base_script
            )

            script_path = os.path.join(args.scripts_dir, f"run_{config['general']['run']}.sh")
            with open(script_path, "w") as f:
                f.write(script)

            # Make script executable
            os.chmod(script_path, 0o755)

            print(f"Successfully generated config and script for {config['general']['run']}")
            generated_scripts.append(script_path)

        except Exception as e:
            print(f"Error processing configuration (dp={dp}, tp={tp}, pp={pp}): {str(e)}")

    # Submit jobs if requested
    if args.run:
        import subprocess

        print("\nSubmitting jobs...")
        for script_path in generated_scripts:
            try:
                result = subprocess.run(["sbatch", script_path], check=True, capture_output=True, text=True)
                print(f"Submitted {script_path}: {result.stdout.strip()}")
            except subprocess.CalledProcessError as e:
                print(f"Error submitting {script_path}: {e.stderr}")
    else:
        print("\nTo run individual jobs:")
        for script_path in generated_scripts:
            print(f"sbatch {script_path}")


if __name__ == "__main__":
    main()

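A typical invocation writes one config and one SLURM script per (dp, tp, pp) point into benchmark/configs and benchmark/scripts, and can submit them in the same run. A minimal sketch using only the flags defined above; running from the repository root is an assumption:

    # Generate configs and scripts, then inspect or submit them manually
    python scaling_benchmarks.py --base-config examples/config_tiny_llama.yaml --base-script run_multinode.sh

    # Generate and submit everything via sbatch, with a longer per-job time limit
    python scaling_benchmarks.py --time 00:30:00 --run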