Split up f8f8bf16_rowwise_batched.cu
(#3381)
#604
fbgemm_gpu_ci_cuda.yml
on: push
Matrix: build_artifact
Matrix: test_and_publish_artifact