Commit faff325
Merge pull request #13 from intelligent-machine-learning/pin_2014_01_08
Pin 2024 01 08
mars1248 authored Jan 8, 2024
2 parents f5bf0b6 + bd5e5cf commit faff325
Showing 28 changed files with 243 additions and 165 deletions.
5 changes: 1 addition & 4 deletions codegen/xla_native_functions.yaml
@@ -84,6 +84,7 @@ full_codegen:
- sinh
- softshrink
- softshrink_backward
- sqrt
- take
- tan
- tanh
@@ -313,7 +314,6 @@ supported:
- sort.stable
- split_copy.Tensor
- split_with_sizes_copy
- sqrt
- squeeze_copy
- squeeze_copy.dim
- squeeze_copy.dims
@@ -373,7 +373,6 @@ supported:
- narrow_copy
- pixel_shuffle
- pixel_unshuffle
- reshape
- select_backward
- select.int
- slice.Tensor
@@ -406,8 +405,6 @@ symint:
- narrow_copy
- select_backward
- select.int
# See Note: [functionalization and CompositeExplicitAutograd]
- reshape
# See Note: [Disabling functionalization]
- expand
- view
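This hunk moves `sqrt` out of the hand-written `supported:` list and into `full_codegen:` (the matching manual C++ override is deleted further down in `aten_xla_type.cpp`). A minimal sanity check, assuming torch_xla is installed with a working XLA device, that the op still lowers end to end:

```python
# Sketch only: compares the XLA-lowered sqrt against the CPU reference.
import torch
import torch_xla.core.xla_model as xm

t = torch.rand(4, 4)
xt = torch.sqrt(t.to(xm.xla_device()))
torch.testing.assert_close(xt.cpu(), torch.sqrt(t))
```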
38 changes: 19 additions & 19 deletions infra/tpu-pytorch-releases/artifacts.auto.tfvars
@@ -25,54 +25,54 @@ nightly_builds = [
versioned_builds = [
# Remove libtpu from PyPI builds
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
bundle_libtpu = "0"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.9"
bundle_libtpu = "0"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.10"
bundle_libtpu = "0"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.11"
bundle_libtpu = "0"
},
# Bundle libtpu for Kaggle
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5+libtpu"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6+libtpu"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.10"
bundle_libtpu = "1"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
accelerator = "cuda"
cuda_version = "12.1"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
accelerator = "cuda"
cuda_version = "12.1"
python_version = "3.10"
41 changes: 41 additions & 0 deletions plugins/cuda/README.md
@@ -0,0 +1,41 @@
# CUDA PJRT plugin (experimental)

This directory contains an experimental implementation of the PJRT GPU client as
a plugin. The actual implementation of the PJRT C API lives in the main OpenXLA
repository (see `bazel build` command below).

## Building

```bash
# Build PJRT plugin
bazel build @xla//xla/pjrt/c:pjrt_c_api_gpu_plugin.so --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=1 --config=cuda
# Copy to package dir
cp bazel-bin/external/xla/xla/pjrt/c/pjrt_c_api_gpu_plugin.so plugins/cuda/torch_xla_cuda_plugin

# Build wheel
pip wheel plugins/cuda
# Or install directly
pip install plugins/cuda
```

## Usage

```python
import os

# Log device type
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
os.environ['TF_CPP_VMODULE'] = 'pjrt_registry=5'

from torch_xla.experimental import plugins
import torch_xla_cuda_plugin
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr

# Use dynamic plugin instead of built-in CUDA support
plugins.use_dynamic_plugins()
plugins.register_plugin('CUDA', torch_xla_cuda_plugin.GpuPlugin())
xr.set_device_type('CUDA')

print(xm.xla_device())
```
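As a quick follow-up check (assuming this torch_xla build exposes `device_type()` and `global_runtime_device_count()` on `torch_xla.runtime`), the registered plugin should be reflected in the runtime:

```python
import torch_xla.runtime as xr

print(xr.device_type())                  # expected: 'CUDA'
print(xr.global_runtime_device_count())  # number of visible GPUs
```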
18 changes: 18 additions & 0 deletions plugins/cuda/pyproject.toml
@@ -0,0 +1,18 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "torch_xla_cuda_plugin"
version = "0.0.1"
authors = [
{name = "Will Cromar", email = "[email protected]"},
]
description = "CUDA Plugin"
requires-python = ">=3.8"

[tool.setuptools.package-data]
torch_xla_cuda_plugin = ["*.so"]

[project.entry-points."torch_xla.plugins"]
gpu = "torch_xla_cuda_plugin:GpuPlugin"
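The `torch_xla.plugins` entry point declared above makes the plugin discoverable by name once the wheel is installed. A minimal sketch of how such an entry point can be enumerated (not torch_xla's actual loading code; assumes Python 3.10+ for the `group=` keyword):

```python
from importlib.metadata import entry_points

# Iterate every "torch_xla.plugins" entry point declared by installed packages.
for ep in entry_points(group='torch_xla.plugins'):
  plugin_cls = ep.load()        # e.g. torch_xla_cuda_plugin:GpuPlugin
  print(ep.name, plugin_cls)    # -> gpu <class '...GpuPlugin'>
```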
11 changes: 11 additions & 0 deletions plugins/cuda/torch_xla_cuda_plugin/__init__.py
@@ -0,0 +1,11 @@
import os
from torch_xla.experimental import plugins
from torch_xla._internal import tpu

class GpuPlugin(plugins.DevicePlugin):

  def library_path(self) -> str:
    return os.path.join(os.path.dirname(__file__), 'pjrt_c_api_gpu_plugin.so')

  def physical_chip_count(self) -> int:
    # TODO: default to actual device count
    return int(os.getenv('GPU_NUM_DEVICES', 1))
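A hypothetical quick check of the plugin object, using only the names defined in the snippet above and assuming the wheel built earlier is installed:

```python
import os
import torch_xla_cuda_plugin

os.environ.setdefault('GPU_NUM_DEVICES', '2')   # override the chip count
plugin = torch_xla_cuda_plugin.GpuPlugin()
print(plugin.library_path())         # .../torch_xla_cuda_plugin/pjrt_c_api_gpu_plugin.so
print(plugin.physical_chip_count())  # 2
```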
35 changes: 35 additions & 0 deletions test/pjrt/test_dtypes.py
@@ -0,0 +1,35 @@
from absl.testing import absltest, parameterized
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr


class TestDtypes(parameterized.TestCase):

  @parameterized.parameters(torch.float16, torch.float32, torch.float64,
                            torch.bfloat16, torch.complex64)
  def test_float_round_trip(self, dtype: torch.dtype):
    t = torch.randn((3, 3), dtype=dtype)
    xt = t.to(xm.xla_device())
    torch.testing.assert_close(xt.cpu(), t)

  @parameterized.parameters(
      torch.uint8,
      torch.int8,
      torch.int16,
      torch.int32,
      torch.int64,
  )
  def test_int_round_trip(self, dtype: torch.dtype):
    t = torch.randint(0, 128, (3, 3), dtype=dtype)
    xt = t.to(xm.xla_device())
    torch.testing.assert_close(xt.cpu(), t)

  def test_bool_round_trip(self):
    t = torch.randint(0, 2, (3, 3), dtype=torch.bool)
    xt = t.to(xm.xla_device())
    torch.testing.assert_close(xt.cpu(), t)


if __name__ == "__main__":
  absltest.main()
4 changes: 3 additions & 1 deletion test/run_tests.sh
@@ -128,7 +128,7 @@ function run_torchrun {
echo "Running torchrun test for GPU $@"
num_devices=$(nvidia-smi --list-gpus | wc -l)
PJRT_DEVICE=CUDA torchrun --nnodes 1 --nproc-per-node $num_devices $@
fi
fi
}

function run_torch_op_tests {
@@ -190,6 +190,7 @@ function run_xla_op_tests1 {
# DO NOT MODIFY
function run_xla_op_tests2 {
run_downcast_bf16 "$CDIR/test_data_type.py"
run_test "$CDIR/pjrt/test_dtypes.py"
run_test "$CDIR/test_autocast.py" # TODO(yeounoh) this is expensive on GPU
}

@@ -235,6 +236,7 @@ function run_mp_op_tests {
run_test "$CDIR/test_mp_save.py"
run_test "$CDIR/test_mp_mesh_reduce.py"
run_test "$CDIR/test_mp_sync_batch_norm.py"
run_test "$CDIR/test_mp_early_exit.py"
run_pt_xla_debug "$CDIR/debug_tool/test_mp_pt_xla_debug.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_all_gather_xla_backend.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_all_reduce_xla_backend.py"
19 changes: 8 additions & 11 deletions test/test_core_aten_ops.py
@@ -295,7 +295,6 @@ def test_aten__adaptive_avg_pool3d_1(self):
run_export_and_compare(self, torch.ops.aten._adaptive_avg_pool3d, args,
kwargs)

@unittest.skip
def test_aten_add_Scalar_0(self):
args = (
torch.randn((10, 10)).to(torch.float32),
@@ -1641,13 +1640,18 @@ def test_aten_expm1_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.expm1, args, kwargs)

@unittest.skip
def test_aten_expm1_1(self):
args = (torch.randn((10, 10)).to(torch.float16),)
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.expm1, args, kwargs)
run_export_and_compare(
self,
torch.ops.aten.expm1,
args,
kwargs,
rtol=0.001,
atol=0.01,
)

@unittest.skip
def test_aten_expm1_2(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
@@ -1757,7 +1761,6 @@ def test_aten_floor_divide_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.floor_divide, args, kwargs)

@unittest.skip
def test_aten_floor_divide_1(self):
args = (
torch.randn((10, 10)).to(torch.float16),
@@ -2475,7 +2478,6 @@ def test_aten_logical_or_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.logical_or, args, kwargs)

@unittest.skip
def test_aten_logical_or_1(self):
args = (
torch.randn((10, 10)).to(torch.float16),
@@ -3347,7 +3349,6 @@ def test_aten_prod_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.prod, args, kwargs)

@unittest.skip
def test_aten_prod_1(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
@@ -4018,7 +4019,6 @@ def test_aten_sinh_1(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.sinh, args, kwargs)

@unittest.skip
def test_aten_sinh_2(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
@@ -4197,19 +4197,16 @@ def test_aten_split_with_sizes_2(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.split_with_sizes, args, kwargs)

@unittest.skip
def test_aten_sqrt_0(self):
args = (torch.randn((10, 10)).to(torch.float32),)
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.sqrt, args, kwargs)

@unittest.skip
def test_aten_sqrt_1(self):
args = (torch.randn((10, 10)).to(torch.float16),)
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.sqrt, args, kwargs)

@unittest.skip
def test_aten_sqrt_2(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
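For context on the `expm1` float16 change above: a half-precision result cannot match a full-precision reference bit for bit, which is roughly what the relaxed `rtol`/`atol` account for. An illustrative sketch (pure CPU, no XLA required):

```python
import torch

x = torch.randn(10, 10, dtype=torch.float32)
ref = torch.expm1(x)                              # float32 reference
approx = ref.to(torch.float16).to(torch.float32)  # round-trip through float16
# An exact comparison would fail; the relaxed tolerances used above pass.
torch.testing.assert_close(approx, ref, rtol=1e-3, atol=1e-2)
```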
26 changes: 26 additions & 0 deletions test/test_mp_early_exit.py
@@ -0,0 +1,26 @@
import sys
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.utils as xu


def _mp_fn(index):
  device = xm.xla_device()
  if xm.xla_device_hw(device) in ('TPU', 'GPU', 'CUDA', 'ROCM', 'NEURON'):
    train_loader = xu.SampleGenerator(
        data=torch.zeros(1, 12), sample_count=1024)
    train_loader = pl.MpDeviceLoader(train_loader, device)
    max_steps = 10
    for step, inputs in enumerate(train_loader):
      xm.all_reduce('sum', [inputs], scale=1.0 / xm.xrt_world_size())
      if step > max_steps:
        break
  else:
    print(f'{device} is not a TPU or GPU device', file=sys.stderr)


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
1 change: 1 addition & 0 deletions test/tpu/xla_test_job.yaml
@@ -57,6 +57,7 @@ spec:
python3 /src/pytorch/xla/test/test_autocast.py
python3 /src/pytorch/xla/test/dynamo/test_dynamo.py
python3 /src/pytorch/xla/test/spmd/test_spmd_debugging.py
python3 /src/pytorch/xla/test/pjrt/test_dtypes.py
python3 /src/pytorch/xla/test/pjrt/test_dynamic_plugin_tpu.py
volumeMounts:
- mountPath: /dev/shm
2 changes: 2 additions & 0 deletions torch_xla/__init__.py
@@ -148,6 +148,8 @@ def _setup_tpu_vm_library_path() -> bool:


def _prepare_to_exit():
  device = _XLAC._xla_get_default_device()
  _XLAC._set_all_reduce_token(device, None)
  _XLAC._prepare_to_exit()
  if int(os.environ.get('PT_XLA_DEBUG', '0')):
    _summarize_fn_tracker()
16 changes: 0 additions & 16 deletions torch_xla/csrc/aten_xla_type.cpp
@@ -2888,12 +2888,6 @@ std::vector<at::Tensor> XLANativeFunctions::split_with_sizes_copy(
return bridge::AtenFromXlaTensors(xla_tensors);
}

at::Tensor XLANativeFunctions::sqrt(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
return bridge::AtenFromXlaTensor(
tensor_methods::sqrt(bridge::GetXlaTensor(self)));
}

at::Tensor XLANativeFunctions::squeeze_copy(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
return bridge::AtenFromXlaTensor(
@@ -3649,16 +3643,6 @@ at::Tensor XLANativeFunctions::pixel_unshuffle(const at::Tensor& self,
pixel_unshuffle)>::call(self, downscale_factor);
}

at::Tensor XLANativeFunctions::reshape_symint(const at::Tensor& self,
c10::SymIntArrayRef shape) {
// See Note: [Disabling functionalization]
if (runtime::sys_util::GetEnvBool("XLA_DISABLE_FUNCTIONALIZATION", false)) {
return at::native::reshape_symint(self, shape);
}
return at::functionalization::functionalize_aten_op_symint<ATEN_OP(
reshape)>::call(self, shape);
}

at::Tensor XLANativeFunctions::select_backward_symint(
const at::Tensor& grad_output, c10::SymIntArrayRef input_sizes, int64_t dim,
c10::SymInt index) {