Commit faff325
Merge pull request #13 from intelligent-machine-learning/pin_2014_01_08
Pin 2024 01 08
mars1248 authored Jan 8, 2024
2 parents f5bf0b6 + bd5e5cf commit faff325
Showing 28 changed files with 243 additions and 165 deletions.
5 changes: 1 addition & 4 deletions codegen/xla_native_functions.yaml
@@ -84,6 +84,7 @@ full_codegen:
- sinh
- softshrink
- softshrink_backward
- sqrt
- take
- tan
- tanh
@@ -313,7 +314,6 @@ supported:
- sort.stable
- split_copy.Tensor
- split_with_sizes_copy
- sqrt
- squeeze_copy
- squeeze_copy.dim
- squeeze_copy.dims
@@ -373,7 +373,6 @@ supported:
- narrow_copy
- pixel_shuffle
- pixel_unshuffle
- reshape
- select_backward
- select.int
- slice.Tensor
@@ -406,8 +405,6 @@ symint:
- narrow_copy
- select_backward
- select.int
# See Note: [functionalization and CompositeExplicitAutograd]
- reshape
# See Note: [Disabling functionalization]
- expand
- view
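This hunk moves `sqrt` out of the hand-written `supported:` list and into `full_codegen:` (the matching manual C++ override is deleted further down in `aten_xla_type.cpp`). A minimal sanity check, assuming torch_xla is installed with a working XLA device, that the op still lowers end to end:

```python
# Sketch only: compares the XLA-lowered sqrt against the CPU reference.
import torch
import torch_xla.core.xla_model as xm

t = torch.rand(4, 4)
xt = torch.sqrt(t.to(xm.xla_device()))
torch.testing.assert_close(xt.cpu(), torch.sqrt(t))
```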
38 changes: 19 additions & 19 deletions infra/tpu-pytorch-releases/artifacts.auto.tfvars
@@ -25,54 +25,54 @@ nightly_builds = [
versioned_builds = [
# Remove libtpu from PyPI builds
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
bundle_libtpu = "0"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.9"
bundle_libtpu = "0"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.10"
bundle_libtpu = "0"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.11"
bundle_libtpu = "0"
},
# Bundle libtpu for Kaggle
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5+libtpu"
pytorch_git_rev = "v2.2.0-rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6+libtpu"
pytorch_git_rev = "v2.2.0-rc6"
accelerator = "tpu"
python_version = "3.10"
bundle_libtpu = "1"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
accelerator = "cuda"
cuda_version = "12.1"
},
{
git_tag = "v2.2.0-rc5"
package_version = "2.2.0rc5"
git_tag = "v2.2.0-rc6"
package_version = "2.2.0rc6"
accelerator = "cuda"
cuda_version = "12.1"
python_version = "3.10"
41 changes: 41 additions & 0 deletions plugins/cuda/README.md
@@ -0,0 +1,41 @@
# CUDA PJRT plugin (experimental)

This directory contains an experimental implementation of the PJRT GPU client as
a plugin. The actual implementation of the PJRT C API lives in the main OpenXLA
repository (see `bazel build` command below).

## Building

```bash
# Build PJRT plugin
bazel build @xla//xla/pjrt/c:pjrt_c_api_gpu_plugin.so --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=1 --config=cuda
# Copy to package dir
cp bazel-bin/external/xla/xla/pjrt/c/pjrt_c_api_gpu_plugin.so plugins/cuda/torch_xla_cuda_plugin

# Build wheel
pip wheel plugins/cuda
# Or install directly
pip install plugins/cuda
```

## Usage

```python
import os

# Log device type
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
os.environ['TF_CPP_VMODULE'] = 'pjrt_registry=5'

from torch_xla.experimental import plugins
import torch_xla_cuda_plugin
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr

# Use dynamic plugin instead of built-in CUDA support
plugins.use_dynamic_plugins()
plugins.register_plugin('CUDA', torch_xla_cuda_plugin.GpuPlugin())
xr.set_device_type('CUDA')

print(xm.xla_device())
```
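As a quick follow-up check (assuming this torch_xla build exposes `device_type()` and `global_runtime_device_count()` on `torch_xla.runtime`), the registered plugin should be reflected in the runtime:

```python
import torch_xla.runtime as xr

print(xr.device_type())                  # expected: 'CUDA'
print(xr.global_runtime_device_count())  # number of visible GPUs
```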
18 changes: 18 additions & 0 deletions plugins/cuda/pyproject.toml
@@ -0,0 +1,18 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "torch_xla_cuda_plugin"
version = "0.0.1"
authors = [
{name = "Will Cromar", email = "[email protected]"},
]
description = "CUDA Plugin"
requires-python = ">=3.8"

[tool.setuptools.package-data]
torch_xla_cuda_plugin = ["*.so"]

[project.entry-points."torch_xla.plugins"]
gpu = "torch_xla_cuda_plugin:GpuPlugin"
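The `torch_xla.plugins` entry point declared above makes the plugin discoverable by name once the wheel is installed. A minimal sketch of how such an entry point can be enumerated (not torch_xla's actual loading code; assumes Python 3.10+ for the `group=` keyword):

```python
from importlib.metadata import entry_points

# Iterate every "torch_xla.plugins" entry point declared by installed packages.
for ep in entry_points(group='torch_xla.plugins'):
  plugin_cls = ep.load()        # e.g. torch_xla_cuda_plugin:GpuPlugin
  print(ep.name, plugin_cls)    # -> gpu <class '...GpuPlugin'>
```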
11 changes: 11 additions & 0 deletions plugins/cuda/torch_xla_cuda_plugin/__init__.py
@@ -0,0 +1,11 @@
import os
from torch_xla.experimental import plugins
from torch_xla._internal import tpu

class GpuPlugin(plugins.DevicePlugin):

  def library_path(self) -> str:
    return os.path.join(os.path.dirname(__file__), 'pjrt_c_api_gpu_plugin.so')

  def physical_chip_count(self) -> int:
    # TODO: default to actual device count
    return int(os.getenv('GPU_NUM_DEVICES', 1))
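A hypothetical quick check of the plugin object, using only the names defined in the snippet above and assuming the wheel built earlier is installed:

```python
import os
import torch_xla_cuda_plugin

os.environ.setdefault('GPU_NUM_DEVICES', '2')   # override the chip count
plugin = torch_xla_cuda_plugin.GpuPlugin()
print(plugin.library_path())         # .../torch_xla_cuda_plugin/pjrt_c_api_gpu_plugin.so
print(plugin.physical_chip_count())  # 2
```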
35 changes: 35 additions & 0 deletions test/pjrt/test_dtypes.py
@@ -0,0 +1,35 @@
from absl.testing import absltest, parameterized
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr


class TestDtypes(parameterized.TestCase):

  @parameterized.parameters(torch.float16, torch.float32, torch.float64,
                            torch.bfloat16, torch.complex64)
  def test_float_round_trip(self, dtype: torch.dtype):
    t = torch.randn((3, 3), dtype=dtype)
    xt = t.to(xm.xla_device())
    torch.testing.assert_close(xt.cpu(), t)

  @parameterized.parameters(
      torch.uint8,
      torch.int8,
      torch.int16,
      torch.int32,
      torch.int64,
  )
  def test_int_round_trip(self, dtype: torch.dtype):
    t = torch.randint(0, 128, (3, 3), dtype=dtype)
    xt = t.to(xm.xla_device())
    torch.testing.assert_close(xt.cpu(), t)

  def test_bool_round_trip(self):
    t = torch.randint(0, 2, (3, 3), dtype=torch.bool)
    xt = t.to(xm.xla_device())
    torch.testing.assert_close(xt.cpu(), t)


if __name__ == "__main__":
  absltest.main()
4 changes: 3 additions & 1 deletion test/run_tests.sh
@@ -128,7 +128,7 @@ function run_torchrun {
echo "Running torchrun test for GPU $@"
num_devices=$(nvidia-smi --list-gpus | wc -l)
PJRT_DEVICE=CUDA torchrun --nnodes 1 --nproc-per-node $num_devices $@
fi
fi
}

function run_torch_op_tests {
@@ -190,6 +190,7 @@ function run_xla_op_tests1 {
# DO NOT MODIFY
function run_xla_op_tests2 {
run_downcast_bf16 "$CDIR/test_data_type.py"
run_test "$CDIR/pjrt/test_dtypes.py"
run_test "$CDIR/test_autocast.py" # TODO(yeounoh) this is expensive on GPU
}

@@ -235,6 +236,7 @@ function run_mp_op_tests {
run_test "$CDIR/test_mp_save.py"
run_test "$CDIR/test_mp_mesh_reduce.py"
run_test "$CDIR/test_mp_sync_batch_norm.py"
run_test "$CDIR/test_mp_early_exit.py"
run_pt_xla_debug "$CDIR/debug_tool/test_mp_pt_xla_debug.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_all_gather_xla_backend.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_all_reduce_xla_backend.py"
19 changes: 8 additions & 11 deletions test/test_core_aten_ops.py
@@ -295,7 +295,6 @@ def test_aten__adaptive_avg_pool3d_1(self):
run_export_and_compare(self, torch.ops.aten._adaptive_avg_pool3d, args,
kwargs)

@unittest.skip
def test_aten_add_Scalar_0(self):
args = (
torch.randn((10, 10)).to(torch.float32),
@@ -1641,13 +1640,18 @@ def test_aten_expm1_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.expm1, args, kwargs)

@unittest.skip
def test_aten_expm1_1(self):
args = (torch.randn((10, 10)).to(torch.float16),)
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.expm1, args, kwargs)
run_export_and_compare(
self,
torch.ops.aten.expm1,
args,
kwargs,
rtol=0.001,
atol=0.01,
)

@unittest.skip
def test_aten_expm1_2(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
@@ -1757,7 +1761,6 @@ def test_aten_floor_divide_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.floor_divide, args, kwargs)

@unittest.skip
def test_aten_floor_divide_1(self):
args = (
torch.randn((10, 10)).to(torch.float16),
@@ -2475,7 +2478,6 @@ def test_aten_logical_or_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.logical_or, args, kwargs)

@unittest.skip
def test_aten_logical_or_1(self):
args = (
torch.randn((10, 10)).to(torch.float16),
@@ -3347,7 +3349,6 @@ def test_aten_prod_0(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.prod, args, kwargs)

@unittest.skip
def test_aten_prod_1(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
@@ -4018,7 +4019,6 @@ def test_aten_sinh_1(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.sinh, args, kwargs)

@unittest.skip
def test_aten_sinh_2(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
@@ -4197,19 +4197,16 @@ def test_aten_split_with_sizes_2(self):
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.split_with_sizes, args, kwargs)

@unittest.skip
def test_aten_sqrt_0(self):
args = (torch.randn((10, 10)).to(torch.float32),)
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.sqrt, args, kwargs)

@unittest.skip
def test_aten_sqrt_1(self):
args = (torch.randn((10, 10)).to(torch.float16),)
kwargs = dict()
run_export_and_compare(self, torch.ops.aten.sqrt, args, kwargs)

@unittest.skip
def test_aten_sqrt_2(self):
args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
kwargs = dict()
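For context on the `expm1` float16 change above: a half-precision result cannot match a full-precision reference bit for bit, which is roughly what the relaxed `rtol`/`atol` account for. An illustrative sketch (pure CPU, no XLA required):

```python
import torch

x = torch.randn(10, 10, dtype=torch.float32)
ref = torch.expm1(x)                              # float32 reference
approx = ref.to(torch.float16).to(torch.float32)  # round-trip through float16
# An exact comparison would fail; the relaxed tolerances used above pass.
torch.testing.assert_close(approx, ref, rtol=1e-3, atol=1e-2)
```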
26 changes: 26 additions & 0 deletions test/test_mp_early_exit.py
@@ -0,0 +1,26 @@
import sys
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.utils as xu


def _mp_fn(index):
  device = xm.xla_device()
  if xm.xla_device_hw(device) in ('TPU', 'GPU', 'CUDA', 'ROCM', 'NEURON'):
    train_loader = xu.SampleGenerator(
        data=torch.zeros(1, 12), sample_count=1024)
    train_loader = pl.MpDeviceLoader(train_loader, device)
    max_steps = 10
    for step, inputs in enumerate(train_loader):
      xm.all_reduce('sum', [inputs], scale=1.0 / xm.xrt_world_size())
      if step > max_steps:
        break
  else:
    print(f'{device} is not a TPU or GPU device', file=sys.stderr)


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
1 change: 1 addition & 0 deletions test/tpu/xla_test_job.yaml
@@ -57,6 +57,7 @@ spec:
python3 /src/pytorch/xla/test/test_autocast.py
python3 /src/pytorch/xla/test/dynamo/test_dynamo.py
python3 /src/pytorch/xla/test/spmd/test_spmd_debugging.py
python3 /src/pytorch/xla/test/pjrt/test_dtypes.py
python3 /src/pytorch/xla/test/pjrt/test_dynamic_plugin_tpu.py
volumeMounts:
- mountPath: /dev/shm
2 changes: 2 additions & 0 deletions torch_xla/__init__.py
@@ -148,6 +148,8 @@ def _setup_tpu_vm_library_path() -> bool:


def _prepare_to_exit():
  device = _XLAC._xla_get_default_device()
  _XLAC._set_all_reduce_token(device, None)
  _XLAC._prepare_to_exit()
  if int(os.environ.get('PT_XLA_DEBUG', '0')):
    _summarize_fn_tracker()
16 changes: 0 additions & 16 deletions torch_xla/csrc/aten_xla_type.cpp
@@ -2888,12 +2888,6 @@ std::vector<at::Tensor> XLANativeFunctions::split_with_sizes_copy(
return bridge::AtenFromXlaTensors(xla_tensors);
}

at::Tensor XLANativeFunctions::sqrt(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
return bridge::AtenFromXlaTensor(
tensor_methods::sqrt(bridge::GetXlaTensor(self)));
}

at::Tensor XLANativeFunctions::squeeze_copy(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
return bridge::AtenFromXlaTensor(
@@ -3649,16 +3643,6 @@ at::Tensor XLANativeFunctions::pixel_unshuffle(const at::Tensor& self,
pixel_unshuffle)>::call(self, downscale_factor);
}

at::Tensor XLANativeFunctions::reshape_symint(const at::Tensor& self,
c10::SymIntArrayRef shape) {
// See Note: [Disabling functionalization]
if (runtime::sys_util::GetEnvBool("XLA_DISABLE_FUNCTIONALIZATION", false)) {
return at::native::reshape_symint(self, shape);
}
return at::functionalization::functionalize_aten_op_symint<ATEN_OP(
reshape)>::call(self, shape);
}

at::Tensor XLANativeFunctions::select_backward_symint(
const at::Tensor& grad_output, c10::SymIntArrayRef input_sizes, int64_t dim,
c10::SymInt index) {