From dbc3157bf256f1339b3fa1fef2be89ac4078be0e Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Mon, 4 Dec 2023 14:00:48 -0800 Subject: [PATCH] Benchmark block_bucketize_sparse_features uneven sharding for GPU (#2169) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2169 CPU implementation D51288847 Reviewed By: tissue3 Differential Revision: D51533599 fbshipit-source-id: cf9c7fcbe7043f385e97901a95916287c9f618a5 --- fbgemm_gpu/bench/sparse_ops_benchmark.py | 27 +++++++++++++++++------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py index afca7d467..a578f3f40 100644 --- a/fbgemm_gpu/bench/sparse_ops_benchmark.py +++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py @@ -874,9 +874,11 @@ def ben(fn, name, ad_indices, ad_lengths, batch_offsets, num_ads_in_batch): @click.option("--batch-size", default=4096) @click.option("--bucket-num", default=16) @click.option("--input-precision", type=str, default="long") +@click.option("--device", type=click.Choice(["cpu", "cuda"]), default="cpu") def block_bucketize_sparse_features_bench( - row_size: int, batch_size: int, bucket_num: int, input_precision: str + row_size: int, batch_size: int, bucket_num: int, input_precision: str, device: str ) -> None: + dtype = torch.int if input_precision == "int": dtype = torch.int @@ -900,25 +902,34 @@ def block_bucketize_sparse_features_bench( block_sizes = torch.tensor([bucket_size] * lengths.numel(), dtype=dtype) bucket_pos = [j * bucket_size for j in range(bucket_num + 1)] - block_bucketize_pos = [torch.tensor(bucket_pos)] * lengths.numel() + block_bucketize_pos = [torch.tensor(bucket_pos, device=device)] * lengths.numel() test_param = {"uneven": block_bucketize_pos, "even": None} + print(f"device {device}") for name, is_block_bucketize_pos in test_param.items(): time, output = benchmark_torch_function( torch.ops.fbgemm.block_bucketize_sparse_features, ( - lengths, - indices, 
lengths if device == "cpu" else lengths.to(device), + indices if device == "cpu" else indices.to(device), False, True, - block_sizes, + block_sizes if device == "cpu" else block_sizes.to(device), bucket_num, - weights, + weights + if device == "cpu" + else (weights.to(device) if weights is not None else None), None, -1, # unused - is_block_bucketize_pos, + is_block_bucketize_pos + if device == "cpu" + else ( + [i.to(device) for i in is_block_bucketize_pos] + if is_block_bucketize_pos is not None + else None + ), ), iters=100, - device="cpu", + device=device, ) num_bytes = 0