Commit c6a2cc6 (1 parent: e3b886c)
Showing 2 changed files with 237 additions and 0 deletions: a multi-node SLURM launch script and a Python generator that derives per-configuration benchmark configs and SLURM scripts from it.
run_multinode.sh (new file)
@@ -0,0 +1,57 @@
#!/bin/bash

#SBATCH --job-name=smolm2-bench # Job name
#SBATCH --time=00:15:00
#SBATCH --partition=hopper-prod
#SBATCH --qos=high

#SBATCH -o /fsx/nouamane/projects/nanotron/logs/%x-%j.out

#SBATCH --nodes=2 # Number of nodes (modify as needed)
#SBATCH --ntasks-per-node=1 # Number of tasks per node
#SBATCH --cpus-per-task=80 # CPU cores per task
#SBATCH --gres=gpu:8 # Number of GPUs per node
#SBATCH --exclusive # Exclusive use of nodes

set -x -e

# Load any necessary modules for your system
source /etc/profile.d/modules.sh # for some reason module isn't loaded
module load cuda/12.1

# Activate your conda environment if needed
source /fsx/nouamane/miniconda/bin/activate
conda activate 2-1-cu121
export PATH=/fsx/nouamane/miniconda/envs/2-1-cu121/bin:$PATH

# Get the node names from SLURM
export NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=12356

# Calculate total number of processes
export NNODES=$SLURM_NNODES
export GPUS_PER_NODE=8
export WORLD_SIZE=$((NNODES * GPUS_PER_NODE))
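# e.g. with the default 2 nodes x 8 GPUs, WORLD_SIZE = 16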

# Set some environment variables for better distributed training
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_DEBUG=INFO

# Nanotron specific
export NANOTRON_BENCHMARK=1

# Print some debugging information
echo "Master node: $MASTER_NODE"
echo "All nodes: $NODELIST"
echo "World size: $WORLD_SIZE"

# Launch the training script using srun
srun torchrun \
    --nnodes=$NNODES \
    --nproc_per_node=$GPUS_PER_NODE \
    --rdzv_id=$SLURM_JOB_ID \
    --rdzv_backend=c10d \
    --rdzv_endpoint=$MASTER_NODE:$MASTER_PORT \
    run_train.py \
    --config-file examples/config_tiny_llama.yaml
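A minimal usage sketch, assuming the log directory, partition, and conda environment above exist on your cluster (<jobid> is a placeholder for whatever sbatch prints):

# Submit the job; sbatch prints the assigned job ID
sbatch run_multinode.sh

# Follow the log; %x-%j in the -o pattern expands to job name and job ID
tail -f /fsx/nouamane/projects/nanotron/logs/smolm2-bench-<jobid>.out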

Scaling-benchmark generator (new file)
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
import argparse
import math
import os

import yaml


def create_config(
    dp: int,
    tp: int,
    pp: int,
    batch_accum: int,
    seq_len: int,
    micro_batch_size: int = 1,
    base_config_path: str = "examples/config_tiny_llama.yaml",
) -> dict:
    """Create a config with the specified parallelism settings."""
    # Load base config
    if not os.path.exists(base_config_path):
        raise FileNotFoundError(f"Base config file not found: {base_config_path}")

    with open(base_config_path) as f:
        config = yaml.safe_load(f)

    # Modify parallelism settings
    config["parallelism"]["dp"] = dp
    config["parallelism"]["tp"] = tp
    config["parallelism"]["pp"] = pp

    # Modify batch and sequence settings
    config["tokens"]["batch_accumulation_per_replica"] = batch_accum
    config["tokens"]["sequence_length"] = seq_len
    config["tokens"]["micro_batch_size"] = micro_batch_size

    # Update run name to reflect configuration
    config["general"]["run"] = f"dp{dp}_tp{tp}_pp{pp}_acc{batch_accum}_mbs{micro_batch_size}_seq{seq_len}"

    # Update benchmark CSV path
    config["general"]["benchmark_csv_path"] = "bench.csv"

    return config


def generate_slurm_script(
    config: dict,
    dp: int,
    tp: int,
    pp: int,
    time: str = "00:15:00",
    partition: str = "hopper-prod",
    base_script_path: str = "run_multinode.sh",
) -> str:
    """Generate a SLURM script for the given configuration."""
    # Check if base script exists
    if not os.path.exists(base_script_path):
        raise FileNotFoundError(f"Base script file not found: {base_script_path}")

    # Load base script
    with open(base_script_path) as f:
        script = f.read()

    # Calculate required number of nodes
    gpus_per_node = 8
    total_gpus_needed = dp * tp * pp
    num_nodes = math.ceil(total_gpus_needed / gpus_per_node)

    # Replace SLURM parameters
    replacements = {
        "--nodes=2": f"--nodes={num_nodes}",
        "--time=00:15:00": f"--time={time}",
        "--partition=hopper-prod": f"--partition={partition}",
        "--job-name=smolm2-bench": f"--job-name=bench_{config['general']['run']}",
        "examples/config_tiny_llama.yaml": f"benchmark/configs/config_{config['general']['run']}.yaml",
    }

    for old, new in replacements.items():
        if old not in script:
            print(f"Warning: Could not find '{old}' in base script")
        script = script.replace(old, new)

    return script


def main():
    parser = argparse.ArgumentParser(description="Run scaling benchmarks with different parallelism configurations")
    parser.add_argument(
        "--configs-dir", type=str, default="benchmark/configs", help="Directory to store generated configs"
    )
    parser.add_argument(
        "--scripts-dir", type=str, default="benchmark/scripts", help="Directory to store generated SLURM scripts"
    )
    parser.add_argument("--partition", type=str, default="hopper-prod", help="SLURM partition to use")
    parser.add_argument("--time", type=str, default="00:15:00", help="Time limit for each job")
    parser.add_argument(
        "--base-config", type=str, default="examples/config_tiny_llama.yaml", help="Base configuration file to use"
    )
    parser.add_argument("--base-script", type=str, default="run_multinode.sh", help="Base SLURM script to use")
    parser.add_argument("--run", action="store_true", help="Automatically submit all generated SLURM scripts")
    args = parser.parse_args()

    # Validate input files exist
    if not os.path.exists(args.base_config):
        raise FileNotFoundError(f"Base config file not found: {args.base_config}")
    if not os.path.exists(args.base_script):
        raise FileNotFoundError(f"Base script file not found: {args.base_script}")

    # Create directories if they don't exist
    for directory in [args.configs_dir, args.scripts_dir]:
        os.makedirs(directory, exist_ok=True)

    # Define configurations to test
    configurations = [
        # (dp, tp, pp, batch_accum, seq_len, mbs)
        # (1, 8, 1, 1, 2048, 1),  # Base configuration
        # (2, 4, 1, 1, 2048, 1),
        # (8, 1, 1, 1, 2048, 1),
        # (16, 1, 1, 1, 2048, 1),
        # *[(2**i, 1, 1, 1, 2048, 1) for i in range(3, 8)],
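        # Active sweeps: dp doubles from 1 to 64 at mbs=8 and seq_len=2048,
        # once with tp=1 (1-64 GPUs) and once with tp=8 (8-512 GPUs); the
        # tp=8 runs with dp >= 16 exceed 64 GPUs and trigger the warning below.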
        *[(2**i, 1, 1, 1, 2048, 8) for i in range(0, 7)],
        *[(2**i, 8, 1, 1, 2048, 8) for i in range(0, 7)],
    ]

    # Validate configurations
    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
        total_gpus = dp * tp * pp
        if total_gpus > 64:  # Assuming maximum of 8 nodes with 8 GPUs each
            print(
                f"Warning: Configuration dp={dp}, tp={tp}, pp={pp} requires {total_gpus} GPUs, which might be too many"
            )

    # Generate configs and scripts
    generated_scripts = []  # Keep track of generated script paths
    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
        try:
            # Create config
            config = create_config(dp, tp, pp, batch_accum, seq_len, mbs, base_config_path=args.base_config)

            # Save config
            config_path = os.path.join(args.configs_dir, f"config_{config['general']['run']}.yaml")
            with open(config_path, "w") as f:
                yaml.dump(config, f, default_flow_style=False)

            # Generate and save SLURM script
            script = generate_slurm_script(
                config, dp, tp, pp, time=args.time, partition=args.partition, base_script_path=args.base_script
            )

            script_path = os.path.join(args.scripts_dir, f"run_{config['general']['run']}.sh")
            with open(script_path, "w") as f:
                f.write(script)

            # Make script executable
            os.chmod(script_path, 0o755)

            print(f"Successfully generated config and script for {config['general']['run']}")
            generated_scripts.append(script_path)

        except Exception as e:
            print(f"Error processing configuration (dp={dp}, tp={tp}, pp={pp}): {e}")

    # Submit jobs if requested
    if args.run:
        import subprocess

        print("\nSubmitting jobs...")
        for script_path in generated_scripts:
            try:
                result = subprocess.run(["sbatch", script_path], check=True, capture_output=True, text=True)
                print(f"Submitted {script_path}: {result.stdout.strip()}")
            except subprocess.CalledProcessError as e:
                print(f"Error submitting {script_path}: {e.stderr}")
    else:
        print("\nTo run individual jobs:")
        for script_path in generated_scripts:
            print(f"sbatch {script_path}")


if __name__ == "__main__":
    main()
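A usage sketch, assuming this generator is saved as run_scaling_benchmarks.py (a hypothetical name; the actual filename is not shown in this diff) and is invoked from the nanotron repository root:

# Generate all configs and SLURM scripts, then print the sbatch commands to run
python run_scaling_benchmarks.py --partition hopper-prod --time 00:15:00

# Or generate and submit every job in one pass
python run_scaling_benchmarks.py --run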