Benchmarks - Add configurations for NDv5 and AMD MI300 (#652)
**Description**
Add configurations for NDv5 and AMD MI300.

**Major Revision**
- Add cublaslt, FP8 transformer training, and dist-inference cpp to the NDv5 config example
- Add hipblaslt, FP8 transformer training, and dist-inference cpp to the AMD MI300 config example (see the usage sketch below)
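
As a usage sketch (not part of this commit), the new config would be passed to the SuperBench runner roughly as below; the host file name and Docker image tag are placeholders, and `-f`/`-c` are the standard SuperBench CLI flags:

# sketch: deploy and run with the new MI300 config (local.ini and the image tag are placeholders)
sb deploy -f local.ini --docker-image <superbench-rocm-image>
sb run -f local.ini -c superbench/config/amd_mi300.yaml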
yukirora authored Sep 26, 2024
1 parent 0ac21a5 commit e39489c
Showing 5 changed files with 545 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .github/workflows/build-image.yml
@@ -68,6 +68,8 @@ jobs:
else
echo "No Docker images found with the specified references."
fi
sudo docker ps -q | grep build | xargs -r sudo docker stop
echo y | sudo docker system prune -a --volumes
df -h
- name: Prepare metadata
id: metadata
5 changes: 5 additions & 0 deletions dockerfile/rocm6.0.x.dockerfile
@@ -173,6 +173,11 @@ RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASL
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch

# Install AMD SMI Python Library
RUN apt install amd-smi-lib -y && \
cd /opt/rocm/share/amd_smi && \
python3 -m pip install .

ADD . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
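
A minimal sanity check for the AMD SMI install above, assuming the documented amdsmi Python API (`amdsmi_init`, `amdsmi_get_processor_handles`, `amdsmi_shut_down`) is available inside the built image:

# sketch: confirm the amdsmi package imports and enumerates GPUs
python3 -c "import amdsmi; amdsmi.amdsmi_init(); \
print(len(amdsmi.amdsmi_get_processor_handles()), 'GPU(s) visible'); \
amdsmi.amdsmi_shut_down()"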
2 changes: 1 addition & 1 deletion setup.py
@@ -219,7 +219,7 @@ def run(self):
'onnxruntime-gpu; python_version>="3.10"',
],
'nvidia': ['py3nvml>=0.2.6'],
'amd': ['pyrsmi>=1.0.1'],
'amd': ['amdsmi'],
}
),
include_package_data=True,
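
With this change, the `amd` extra resolves to `amdsmi` instead of `pyrsmi`; a sketch of installing it from the repository root:

# sketch: install SuperBench with the AMD extras defined above
python3 -m pip install .[amd]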
232 changes: 232 additions & 0 deletions superbench/config/amd_mi300.yaml
@@ -0,0 +1,232 @@
# SuperBench Config
version: v0.11
superbench:
enable: null
var:
default_local_mode: &default_local_mode
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
default_pytorch_mode: &default_pytorch_mode
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: 1
frameworks:
- pytorch
common_model_config: &common_model_config
model_ddp_parameter: &model_ddp_param
duration: 0
num_warmup: 128
num_steps: 512
sample_count: 8192
batch_size: 128
precision: [float32, float16]
model_action: [train]
pin_memory: yes
num_workers: 0
benchmarks:
kernel-launch:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
parameters:
m: 7680
n: 8192
k: 8192
hipblaslt-gemm:
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
parameters:
in_types: ["fp32", "fp16", "bf16", "fp8"]
tolerant_fail: yes
num_warmup: 100
num_steps: 1000
shapes:
- 4096,4096,4096
- 8192,8192,8192
- 16384,16384,16384
rccl-bw:
enable: true
modes:
- name: mpi
proc_num: 8
node_num: 1
mca:
pml: ob1
btl: ^openib
btl_tcp_if_exclude: lo,docker0
coll_hcoll_enable: 0
parameters:
maxbytes: 16G
ngpus: 1
operation: allreduce
cpu-memory-bw-latency:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
tests:
- bandwidth_matrix
- latency_matrix
- max_bandwidth
mem-bw:
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
parallel: no
ib-loopback:
enable: true
modes:
- name: local
proc_num: 16
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
parallel: no
parameters:
msg_size: 8388608
disk-benchmark:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
block_devices: []
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
copy_type: [sm, dma]
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
copy_type: [sm, dma]
ib-traffic:
enable: false
modes:
- name: mpi
proc_num: 1
mca:
btl: tcp,self
pml: ob1
btl_tcp_if_include: ens17f0
gpcnet-network-test:
enable: false
modes:
- name: mpi
proc_num: 1
mca:
pml: ucx
btl: ^uct
btl_tcp_if_include: ens17f0
tcp-connectivity:
enable: false
modes:
- name: local
parallel: no
parameters:
port: 22
dist-inference:
modes:
- name: mpi
proc_num: 8
node_num: 1
mca:
pml: ob1
btl: ^openib
btl_tcp_if_exclude: lo,docker0
coll_hcoll_enable: 0
frameworks:
- pytorch
parameters:
num_layers: 50
num_warmup: 20
num_steps: 100
use_cuda_graph: true
precision: float16
hidden_size: 128
input_size: 128
batch_size: 1024
model-benchmarks:gpt:
enable: true
<<: *default_pytorch_mode
models:
- gpt2-small
- gpt2-large
parameters:
<<: *model_ddp_param
precision: [float32, float16, fp8_hybrid]
batch_size: 32
seq_len: 224
model-benchmarks:bert:
enable: true
<<: *default_pytorch_mode
models:
- bert-base
- bert-large
parameters:
<<: *model_ddp_param
precision: [float32, float16, fp8_hybrid]
seq_len: 224
model-benchmarks:lstm:
enable: true
<<: *default_pytorch_mode
models:
- lstm
parameters:
<<: *model_ddp_param
batch_size: 1024
input_size: 224
hidden_size: 1000
seq_len: 32
model-benchmarks:resnet:
enable: true
<<: *default_pytorch_mode
models:
- resnet50
- resnet101
- resnet152
parameters:
<<: *model_ddp_param
batch_size: 384
model-benchmarks:densenet:
enable: true
<<: *default_pytorch_mode
models:
- densenet169
- densenet201
parameters:
<<: *model_ddp_param
model-benchmarks:vgg:
enable: true
<<: *default_pytorch_mode
models:
- vgg11
- vgg13
- vgg16
- vgg19
parameters:
<<: *model_ddp_param
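
A sketch of exercising a single benchmark from this config; `--config-override` is assumed to accept dotted-key overrides as in the SuperBench CLI docs, and local.ini is a placeholder host file:

# sketch: run only hipblaslt-gemm from the MI300 config
sb run -f local.ini -c superbench/config/amd_mi300.yaml \
  --config-override superbench.enable=hipblaslt-gemm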
