NouamaneTazi committed Nov 27, 2024
1 parent e3b886c commit c6a2cc6
Showing 2 changed files with 237 additions and 0 deletions.
57 changes: 57 additions & 0 deletions run_multinode.sh
@@ -0,0 +1,57 @@
#!/bin/bash

#SBATCH --job-name=smolm2-bench # Job name
#SBATCH --time=00:15:00
#SBATCH --partition=hopper-prod
#SBATCH --qos=high

#SBATCH -o /fsx/nouamane/projects/nanotron/logs/%x-%j.out

#SBATCH --nodes=2 # Number of nodes (modify as needed)
#SBATCH --ntasks-per-node=1 # Number of tasks per node
#SBATCH --cpus-per-task=80 # CPU cores per task
#SBATCH --gres=gpu:8 # Number of GPUs per node
#SBATCH --exclusive # Exclusive use of nodes

set -x -e

# Load any necessary modules for your system
source /etc/profile.d/modules.sh # the `module` command is not loaded by default in this batch shell
module load cuda/12.1

# Activate your conda environment if needed
source /fsx/nouamane/miniconda/bin/activate
conda activate 2-1-cu121
export PATH=/fsx/nouamane/miniconda/envs/2-1-cu121/bin:$PATH

# Get the node names from SLURM
export NODELIST=`scontrol show hostnames $SLURM_JOB_NODELIST`
export MASTER_NODE=`scontrol show hostnames $SLURM_JOB_NODELIST | head -n1`
export MASTER_PORT=12356

# Calculate total number of processes
export NNODES=$SLURM_NNODES
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES * $GPUS_PER_NODE))

# Set some environment variables for better distributed training
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_DEBUG=INFO

# Nanotron specific
export NANOTRON_BENCHMARK=1

# Print some debugging information
echo "Master node: $MASTER_NODE"
echo "All nodes: $NODELIST"
echo "World size: $WORLD_SIZE"

# Launch the training script using srun
srun torchrun \
    --nnodes=$NNODES \
    --nproc_per_node=$GPUS_PER_NODE \
    --rdzv_id=$SLURM_JOB_ID \
    --rdzv_backend=c10d \
    --rdzv_endpoint=$MASTER_NODE:$MASTER_PORT \
    run_train.py \
    --config-file examples/config_tiny_llama.yaml
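
The base script can also be submitted on its own. A minimal sketch, assuming the paths and partition in the #SBATCH header exist on your cluster:

    # Submit as-is: 2 nodes x 8 GPUs = 16 ranks
    sbatch run_multinode.sh

    # sbatch command-line options override the #SBATCH directives in the file;
    # NNODES and WORLD_SIZE adapt automatically because the script reads $SLURM_NNODES.
    sbatch --nodes=4 --time=00:30:00 run_multinode.sh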
180 changes: 180 additions & 0 deletions scaling_benchmarks.py
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
import argparse
import math
import os

import yaml


def create_config(
    dp: int,
    tp: int,
    pp: int,
    batch_accum: int,
    seq_len: int,
    micro_batch_size: int = 1,
    base_config_path: str = "examples/config_tiny_llama.yaml",
) -> dict:
    """Create a config with the specified parallelism settings."""
    # Load base config
    if not os.path.exists(base_config_path):
        raise FileNotFoundError(f"Base config file not found: {base_config_path}")

    with open(base_config_path) as f:
        config = yaml.safe_load(f)

    # Modify parallelism settings
    config["parallelism"]["dp"] = dp
    config["parallelism"]["tp"] = tp
    config["parallelism"]["pp"] = pp

    # Modify batch and sequence settings
    config["tokens"]["batch_accumulation_per_replica"] = batch_accum
    config["tokens"]["sequence_length"] = seq_len
    config["tokens"]["micro_batch_size"] = micro_batch_size

    # Update run name to reflect configuration
    config["general"]["run"] = f"dp{dp}_tp{tp}_pp{pp}_acc{batch_accum}_mbs{micro_batch_size}_seq{seq_len}"

    # Update benchmark CSV path
    config["general"]["benchmark_csv_path"] = "bench.csv"

    return config


def generate_slurm_script(
    config: dict,
    dp: int,
    tp: int,
    pp: int,
    time: str = "00:15:00",
    partition: str = "hopper-prod",
    base_script_path: str = "run_multinode.sh",
) -> str:
    """Generate a SLURM script for the given configuration."""
    # Check if base script exists
    if not os.path.exists(base_script_path):
        raise FileNotFoundError(f"Base script file not found: {base_script_path}")

    # Load base script
    with open(base_script_path) as f:
        script = f.read()

    # Calculate required number of nodes
    gpus_per_node = 8
    total_gpus_needed = dp * tp * pp
    num_nodes = math.ceil(total_gpus_needed / gpus_per_node)

    # Replace SLURM parameters
    replacements = {
        "--nodes=2": f"--nodes={num_nodes}",
        "--time=00:15:00": f"--time={time}",
        "--partition=hopper-prod": f"--partition={partition}",
        "--job-name=smolm2-bench": f"--job-name=bench_{config['general']['run']}",
        "examples/config_tiny_llama.yaml": f"benchmark/configs/config_{config['general']['run']}.yaml",
    }

    for old, new in replacements.items():
        if old not in script:
            print(f"Warning: Could not find '{old}' in base script")
        script = script.replace(old, new)

    return script


def main():
    parser = argparse.ArgumentParser(description="Run scaling benchmarks with different parallelism configurations")
    parser.add_argument(
        "--configs-dir", type=str, default="benchmark/configs", help="Directory to store generated configs"
    )
    parser.add_argument(
        "--scripts-dir", type=str, default="benchmark/scripts", help="Directory to store generated SLURM scripts"
    )
    parser.add_argument("--partition", type=str, default="hopper-prod", help="SLURM partition to use")
    parser.add_argument("--time", type=str, default="00:15:00", help="Time limit for each job")
    parser.add_argument(
        "--base-config", type=str, default="examples/config_tiny_llama.yaml", help="Base configuration file to use"
    )
    parser.add_argument("--base-script", type=str, default="run_multinode.sh", help="Base SLURM script to use")
    parser.add_argument("--run", action="store_true", help="Automatically submit all generated SLURM scripts")
    args = parser.parse_args()

    # Validate input files exist
    if not os.path.exists(args.base_config):
        raise FileNotFoundError(f"Base config file not found: {args.base_config}")
    if not os.path.exists(args.base_script):
        raise FileNotFoundError(f"Base script file not found: {args.base_script}")

    # Create directories if they don't exist
    for directory in [args.configs_dir, args.scripts_dir]:
        os.makedirs(directory, exist_ok=True)

    # Define configurations to test
    configurations = [
        # (dp, tp, pp, batch_accum, seq_len, mbs)
        # (1, 8, 1, 1, 2048, 1),  # Base configuration
        # (2, 4, 1, 1, 2048, 1),
        # (8, 1, 1, 1, 2048, 1),
        # (16, 1, 1, 1, 2048, 1),
        # *[(2**i, 1, 1, 1, 2048, 1) for i in range(3, 8)],
        # Sweep data parallelism over 1, 2, 4, ..., 64 replicas at micro batch size 8,
        # once with tp=1 and once with tp=8
        *[(2**i, 1, 1, 1, 2048, 8) for i in range(0, 7)],
        *[(2**i, 8, 1, 1, 2048, 8) for i in range(0, 7)],
    ]

    # Validate configurations
    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
        total_gpus = dp * tp * pp
        if total_gpus > 64:  # Assuming maximum of 8 nodes with 8 GPUs each
            print(
                f"Warning: Configuration dp={dp}, tp={tp}, pp={pp} requires {total_gpus} GPUs, which might be too many"
            )

    # Generate configs and scripts
    generated_scripts = []  # Keep track of generated script paths
    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
        try:
            # Create config
            config = create_config(dp, tp, pp, batch_accum, seq_len, mbs, base_config_path=args.base_config)

            # Save config
            config_path = os.path.join(args.configs_dir, f"config_{config['general']['run']}.yaml")
            with open(config_path, "w") as f:
                yaml.dump(config, f, default_flow_style=False)

            # Generate and save SLURM script
            script = generate_slurm_script(
                config, dp, tp, pp, time=args.time, partition=args.partition, base_script_path=args.base_script
            )

            script_path = os.path.join(args.scripts_dir, f"run_{config['general']['run']}.sh")
            with open(script_path, "w") as f:
                f.write(script)

            # Make script executable
            os.chmod(script_path, 0o755)

            print(f"Successfully generated config and script for {config['general']['run']}")
            generated_scripts.append(script_path)

        except Exception as e:
            print(f"Error processing configuration (dp={dp}, tp={tp}, pp={pp}): {str(e)}")

    # Submit jobs if requested
    if args.run:
        import subprocess

        print("\nSubmitting jobs...")
        for script_path in generated_scripts:
            try:
                result = subprocess.run(["sbatch", script_path], check=True, capture_output=True, text=True)
                print(f"Submitted {script_path}: {result.stdout.strip()}")
            except subprocess.CalledProcessError as e:
                print(f"Error submitting {script_path}: {e.stderr}")
    else:
        print("\nTo run individual jobs:")
        for script_path in generated_scripts:
            print(f"sbatch {script_path}")


if __name__ == "__main__":
    main()

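A typical invocation writes one config and one SLURM script per (dp, tp, pp) point into benchmark/configs and benchmark/scripts, and can submit them in the same run. A minimal sketch using only the flags defined above; running from the repository root is an assumption:

    # Generate configs and scripts, then inspect or submit them manually
    python scaling_benchmarks.py --base-config examples/config_tiny_llama.yaml --base-script run_multinode.sh

    # Generate and submit everything via sbatch, with a longer per-job time limit
    python scaling_benchmarks.py --time 00:30:00 --run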