From 2b2e0fddf8001c0c662bd582e1d958a74bc84ac4 Mon Sep 17 00:00:00 2001 From: Mateusz Sypniewski Date: Wed, 7 Sep 2022 07:23:03 -0700 Subject: [PATCH] Add CUDA Sanitizer (#83984) Example of a simple synchronization error: ``` a = torch.rand(4, 2, device="cuda") with torch.cuda.stream(second_stream): torch.mul(a, 5, out=a) ``` Output produced by CSAN: ``` ============================ CSAN detected a possible data race on tensor with data pointer 139719969079296 Access by stream 94646435460352 during kernel: aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) writing to argument: self, out, output With stack trace: File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 364, in _handle_kernel_launch stack_trace = traceback.StackSummary.extract( File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 544, in __torch_dispatch__ errors = self.event_handler._handle_kernel_launch( File "/private/home/sypniewski/pytorch/torch/utils/_python_dispatch.py", line 76, in wrapped return f(self, *args, **kwargs) File "/private/home/sypniewski/pytorch/tester.py", line 9, in torch.mul(a, 5, out=a) Previous access by stream 0 during kernel: aten::rand(int[] size, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor writing to argument: output With stack trace: File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 364, in _handle_kernel_launch stack_trace = traceback.StackSummary.extract( File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 544, in __torch_dispatch__ errors = self.event_handler._handle_kernel_launch( File "/private/home/sypniewski/pytorch/torch/utils/_python_dispatch.py", line 76, in wrapped return f(self, *args, **kwargs) File "/private/home/sypniewski/pytorch/tester.py", line 6, in a = torch.rand(10000, device="cuda") Tensor was allocated with stack trace: File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 420, in _handle_memory_allocation traceback.StackSummary.extract( File "/private/home/sypniewski/pytorch/torch/utils/_cuda_trace.py", line 23, in fire_callbacks cb(*args, **kwargs) File "/private/home/sypniewski/pytorch/torch/_ops.py", line 60, in __call__ return self._op(*args, **kwargs or {}) File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 541, in __torch_dispatch__ outputs = func(*args, **kwargs) File "/private/home/sypniewski/pytorch/torch/utils/_python_dispatch.py", line 76, in wrapped return f(self, *args, **kwargs) File "/private/home/sypniewski/pytorch/tester.py", line 6, in a = torch.rand(10000, device="cuda") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/83984 Approved by: https://github.com/ezyang --- test/test_cuda_sanitizer.py | 446 ++++++++++++++++++++++++++++ torch/__init__.py | 6 + torch/cuda/_sanitizer.py | 559 ++++++++++++++++++++++++++++++++++++ 3 files changed, 1011 insertions(+) create mode 100644 test/test_cuda_sanitizer.py create mode 100644 torch/cuda/_sanitizer.py diff --git a/test/test_cuda_sanitizer.py b/test/test_cuda_sanitizer.py new file mode 100644 index 0000000000000..e8629788be59b --- /dev/null +++ b/test/test_cuda_sanitizer.py @@ -0,0 +1,446 @@ +# Owner(s): ["module: cuda"] + +import sys +import textwrap +import traceback +from typing import List + +import torch +import torch.cuda._sanitizer as csan +from torch.cuda._sanitizer import StreamId, DataPtr, EventId +from torch.testing._internal.common_utils import TestCase, run_tests + + +# We cannot import TEST_CUDA from 
torch.testing._internal.common_cuda here, +# because if we do that, the TEST_CUDNN line from torch.testing._internal.common_cuda will be executed +# multiple times as well during the execution of this test suite, and it will +# cause CUDA OOM error on Windows. +TEST_CUDA = torch.cuda.is_available() + +if not TEST_CUDA: + print("CUDA not available, skipping tests", file=sys.stderr) + TestCase = object # noqa: F811 + + +class TestArgumentHandler(TestCase): + def test_add(self): + add_func = torch.ops.aten.add.Tensor + a = torch.ones(5, 3, device="cuda") + b = torch.randn(5, 3, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(add_func._schema, (a, b), {}) + c = torch.add(a, b) + argument_handler.parse_outputs(c) + + self.assertEqual({a.data_ptr(), b.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual({c.data_ptr()}, argument_handler.dataptrs_written) + + def test_cat(self): + cat_func = torch.ops.aten.cat.default + a = torch.ones(2, 4, 5, device="cuda") + b = torch.zeros(2, 1, 5, device="cuda") + c = torch.rand(2, 7, 5, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(cat_func._schema, ([a, b, c], 1), {}) + d = torch.cat((a, b, c), dim=1) + argument_handler.parse_outputs(d) + + self.assertEqual( + {a.data_ptr(), b.data_ptr(), c.data_ptr()}, argument_handler.dataptrs_read + ) + self.assertEqual({d.data_ptr()}, argument_handler.dataptrs_written) + + def test_split(self): + split_func = torch.ops.aten.split.Tensor + a = torch.arange(10, device="cuda").reshape(5, 2) + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(split_func._schema, (a, 2), {}) + out = torch.split(a, 2) + argument_handler.parse_outputs(out) + + outputs = {out[0].data_ptr(), out[1].data_ptr(), out[2].data_ptr()} + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual( + outputs, + argument_handler.dataptrs_written, + ) + + def test_inplace(self): + add_inplace_func = torch.ops.aten.add_.Tensor + a = torch.rand(4, 2, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(add_inplace_func._schema, (a, 5), {}) + a.add_(5) + argument_handler.parse_outputs(a) + + self.assertEqual(set(), argument_handler.dataptrs_read) + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_written) + + def test_out(self): + mul_out_func = torch.ops.aten.mul.out + a = torch.arange(8, device="cuda") + b = torch.empty(8, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(mul_out_func._schema, (a, 3), {"out": b}) + torch.mul(a, 3, out=b) + argument_handler.parse_outputs(b) + + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual({b.data_ptr()}, argument_handler.dataptrs_written) + + def test_nonzero(self): + nonzero_func = torch.ops.aten.nonzero.default + a = torch.ones(5, 3, 2, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(nonzero_func._schema, (a,), {"as_tuple": True}) + out = torch.nonzero(a, as_tuple=True) + argument_handler.parse_outputs(out) + + outputs = {out[0].data_ptr(), out[1].data_ptr(), out[2].data_ptr()} + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual(outputs, argument_handler.dataptrs_written) + + def test_tensor_names(self): + addr_func = torch.ops.aten.addr.default + vec = torch.arange(1, 4, device="cuda") + M = torch.zeros(3, 3, device="cuda") + + argument_handler = 
csan.ArgumentHandler() + argument_handler.parse_inputs(addr_func._schema, (M, vec, vec), {}) + out = torch.addr(M, vec, vec) + argument_handler.parse_outputs(out) + + self.assertEqual( + argument_handler.tensor_names, + { + M.data_ptr(): ["self"], + vec.data_ptr(): ["vec1", "vec2"], + out.data_ptr(): ["output"], + }, + ) + + +def tensor_id(i: int) -> DataPtr: + return i + + +def stream_id(i: int) -> StreamId: + return 1000 + i + + +def event_id(i: int) -> EventId: + return 2000 + i + + +class TestEventHandler(TestCase): + def setUp(self): + self.handler = csan.EventHandler() + + def kernel_launch( + self, + stream: StreamId, + read_only: List[DataPtr] = None, + read_write: List[DataPtr] = None, + ) -> List[csan.SynchronizationError]: + if read_only is None: + read_only = [] + if read_write is None: + read_write = [] + return self.handler._handle_kernel_launch( + stream, + read_only, + read_write, + "", + {k: [""] for k in read_only + read_write}, + ) + + def assert_good_kernel_launch( + self, + stream: StreamId, + read_only: List[DataPtr] = None, + read_write: List[DataPtr] = None, + ) -> None: + self.assertEqual(self.kernel_launch(stream, read_only, read_write), []) + + def assert_bad_kernel_launch( + self, + number_of_errors: int, + stream: StreamId, + read_only: List[DataPtr] = None, + read_write: List[DataPtr] = None, + ) -> None: + errors = self.kernel_launch(stream, read_only, read_write) + self.assertEqual(len(errors), number_of_errors) + + def test_empty_kernel_launch(self): + self.assert_good_kernel_launch(stream_id(0)) + + def test_simple_passing(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + + def test_simple_error(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.assert_bad_kernel_launch(1, stream_id(2), read_write=[tensor_id(1)]) + + def test_simple_sync(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.assert_good_kernel_launch(stream_id(2), read_write=[tensor_id(1)]) + + def test_reads_check_last_write(self): + # Tests that not only the first read operation checks if it is in conflict + # with the last write operation, but all read operations do. + + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + + self.assert_bad_kernel_launch(1, stream_id(3), read_only=[tensor_id(1)]) + + def test_branch_sync(self): + # Tests that two streams can read after both waiting for a third, but they + # cannot write without further synchronization. 
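+        # After the fan-out below, streams 2 and 3 are each ordered after stream 1's
+        # write, but not with respect to each other, so a later write on stream 2
+        # still races with stream 3's read.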
+ + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.handler._handle_event_wait(event_id(0), stream_id(3)) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(3), read_only=[tensor_id(1)]) + + self.assert_bad_kernel_launch(1, stream_id(2), read_write=[tensor_id(1)]) + + def test_chain_sync(self): + iterations = 10 + + self.assert_good_kernel_launch(stream_id(0), read_only=[tensor_id(1)]) + for i in range(iterations): + self.handler._handle_event_record(event_id(i), stream_id(i)) + self.handler._handle_event_wait(event_id(i), stream_id(i + 1)) + self.assert_good_kernel_launch(stream_id(iterations), read_write=[tensor_id(1)]) + + def test_expired_record(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + + self.assert_bad_kernel_launch(1, stream_id(2), read_write=[tensor_id(1)]) + + def test_deleted_record(self): + for should_delete, should_create in [ + (True, True), + (True, False), + (False, True), + ]: + self.setUp() + with self.subTest(should_delete=should_delete, should_create=should_create): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + + if should_delete: + self.handler._handle_event_deletion(event_id(0)) + if should_create: + self.handler._handle_event_creation(event_id(0)) + + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.assert_bad_kernel_launch( + 1, stream_id(2), read_write=[tensor_id(1)] + ) + + def test_all_reads_checked_failing(self): + iterations = 10 + for i in range(1, iterations): + self.assert_good_kernel_launch(stream_id(i), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(i), stream_id(i)) + + for i in range(1, iterations): + self.handler._handle_event_wait(event_id(i), stream_id(0)) + + self.assert_good_kernel_launch(stream_id(iterations), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(iterations), stream_id(i)) + + # Does not synchronize with the last read. + self.assert_bad_kernel_launch(1, stream_id(0), read_write=[tensor_id(1)]) + + def test_all_reads_checked_passing(self): + iterations = 10 + for i in range(1, iterations): + self.assert_good_kernel_launch(stream_id(i), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(i), stream_id(i)) + + for i in range(1, iterations): + self.handler._handle_event_wait(event_id(i), stream_id(0)) + + self.assert_good_kernel_launch(stream_id(0), read_write=[tensor_id(1)]) + + def test_multiple_errors(self): + iterations = 10 + self.assert_good_kernel_launch( + stream_id(0), read_write=[tensor_id(i) for i in range(iterations)] + ) + self.assert_bad_kernel_launch( + iterations, + stream_id(1), + read_write=[tensor_id(i) for i in range(iterations)], + ) + + def test_correct_state_merging(self): + # Tests that after waiting for an event, a stream's state is indeed set + # to the pointwise maximum of its old state and the recorded state. 
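+        # For example, a stream whose clock is {1: 5, 2: 3} that waits on an event
+        # which recorded {2: 7} should end up with clock {1: 5, 2: 7}.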
+ + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_write=[tensor_id(2)]) + self.handler._handle_event_record(event_id(1), stream_id(1)) + self.handler._handle_event_record(event_id(2), stream_id(2)) + + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_write=[tensor_id(2)]) + self.handler._handle_event_wait(event_id(1), stream_id(2)) + self.handler._handle_event_wait(event_id(2), stream_id(1)) + + self.handler._handle_event_record(event_id(3), stream_id(2)) + self.handler._handle_event_wait(event_id(3), stream_id(1)) + self.assert_good_kernel_launch( + stream_id(1), read_write=[tensor_id(1), tensor_id(2)] + ) + + def test_record_override(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(2)]) + self.handler._handle_event_record(event_id(1), stream_id(1)) + self.handler._handle_event_record(event_id(1), stream_id(2)) + + self.handler._handle_event_wait(event_id(1), stream_id(3)) + self.assert_bad_kernel_launch(1, stream_id(3), read_write=[tensor_id(1)]) + + def test_multiple_wait(self): + # Tests that a wait operation can be performed multiple times on the same event + # by different streams. + + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.handler._handle_event_record(event_id(1), stream_id(1)) + self.handler._handle_event_wait(event_id(1), stream_id(2)) + self.handler._handle_event_wait(event_id(1), stream_id(3)) + + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(3), read_only=[tensor_id(1)]) + + +class TestMessages(TestCase): + def setUp(self): + self.handler = csan.EventHandler() + + def test_ensure_exists(self): + ARG = 0 + for func, out in [ + ( + self.handler._handle_event_deletion, + f"Found Event with id: {ARG}, but no matching event " + "creation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + ), + ( + self.handler._handle_memory_deallocation, + f"Found tensor with pointer: {ARG}, but no matching tensor " + "allocation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + ), + ]: + with self.subTest(func=func, out=out): + with self.assertLogs() as captured: + func(ARG) + self.assertEqual(captured.records[0].getMessage(), out) + + def test_ensure_does_not_exist(self): + ARG = 0 + self.handler._handle_event_creation(ARG) + self.handler._handle_stream_creation(ARG) + for func, out in [ + ( + self.handler._handle_event_creation, + "Found duplicate event creation in the trace for event with " + f"id: {ARG}. Assuming the trace for event deletion wasn't caught " + "and backfilling it now. " + "Perhaps the sanitizer was enabled after some torch operations?", + ), + ( + self.handler._handle_stream_creation, + "Found duplicate Stream creation in the trace for Stream with " + f"id: {ARG}. 
PyTorch Streams are only created once, so this " + "trace entry is ignored.", + ), + ]: + with self.subTest(func=func, out=out): + with self.assertLogs() as captured: + func(ARG) + self.assertEqual(captured.records[0].getMessage(), out) + + def test_error_message(self): + current_access = csan.Access( + type=csan.AccessType.WRITE, + seq_num=1, + stream=stream_id(1), + operator="schema", + names=["b"], + stack_trace=traceback.StackSummary.from_list( + [("file", 0, "name", "trace a")] + ), + ) + previous_access = csan.Access( + type=csan.AccessType.READ, + seq_num=2, + stream=stream_id(0), + operator="schema", + names=["a"], + stack_trace=traceback.StackSummary.from_list( + [("file", 0, "name", "trace b")] + ), + ) + error = csan.UnsynchronizedAccessError( + data_ptr=tensor_id(1), + allocation_stack_trace=traceback.StackSummary.from_list( + [("file", 0, "name", "alloc")] + ), + current_access=current_access, + previous_access=previous_access, + ) + self.assertEqual( + str(error), + textwrap.dedent( + """\ + ============================ + CSAN detected a possible data race on tensor with data pointer 1 + Access by stream 1001 during kernel: + schema + writing to argument: b + With stack trace: + File "file", line 0, in name + trace a + + Previous access by stream 1000 during kernel: + schema + reading from argument: a + With stack trace: + File "file", line 0, in name + trace b + + Tensor was allocated with stack trace: + File "file", line 0, in name + alloc + """ + ), + ) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/__init__.py b/torch/__init__.py index e186db209e1cd..a6e8bc295d087 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -955,3 +955,9 @@ def _register_device_module(device_type, module): from . import library if not TYPE_CHECKING: from . import _meta_registrations + +# Enable CUDA Sanitizer +if 'TORCH_CUDA_SANITIZER' in os.environ: + import torch.cuda._sanitizer as csan + + csan.enable_cuda_sanitizer() diff --git a/torch/cuda/_sanitizer.py b/torch/cuda/_sanitizer.py new file mode 100644 index 0000000000000..aa98b02eefc90 --- /dev/null +++ b/torch/cuda/_sanitizer.py @@ -0,0 +1,559 @@ +r""" +This module introduces CUDA Sanitizer, a tool for detecting synchronization errors +between kernels ran on different streams. It stores information on accesses to tensors +to determine if they are synchronized or not. When enabled in a python program and a +possible data race is detected, a detailed warning will be printed and the program +will exit. + +It can be enabled either by importing this module and using +:func:`enable_cuda_sanitizer()` or by exporting ``TORCH_CUDA_SANITIZER`` +environment variable. +""" + +import enum +import functools +import io +import logging +import sys +import textwrap +import traceback +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, TypeVar + +import torch +import torch.utils._cuda_trace as cuda_trace +from torch.utils._python_dispatch import TorchDispatchMode +from torch.utils._pytree import tree_map + + +TK = TypeVar("TK") +TVa = TypeVar("TVa") +TVb = TypeVar("TVb") + +DataPtr = int +StreamId = int +EventId = int +SeqNum = int + +logger = logging.getLogger(__name__) + + +class AccessType(enum.Enum): + READ = enum.auto() + WRITE = enum.auto() + + def __str__(self): + return "reading from" if self is AccessType.READ else "writing to" + + +@dataclass +class Access: + r"""Stores information about a single access to a tensor by a kernel. 
+ + Args: + type: either AccessType.READ or AccessType.Write. + seq_num: the sequential number of the kernel performing the access. + stream: the stream id of the stream executing the kernel. + operator: the schema of the launched kernel, which lists the + arguments and return type. + names: the arguments in the schema this access corresponds to. + stack_trace: the stack summary object captured during access. + """ + type: AccessType + seq_num: SeqNum + stream: StreamId + operator: str + names: List[str] + stack_trace: traceback.StackSummary + + +class SynchronizationError(Exception): + """Base class for errors detected by CUDA Sanitizer.""" + + pass + + +class UnsynchronizedAccessError(SynchronizationError): + """Stores information about two unsynchronized accesses to one data pointer.""" + + def __init__( + self, + data_ptr: DataPtr, + allocation_stack_trace: Optional[traceback.StackSummary], + current_access: Access, + previous_access: Access, + ): + self.data_ptr = data_ptr + self.allocation_stack_trace = allocation_stack_trace + self.current_access = current_access + self.previous_access = previous_access + + def __str__(self): + with io.StringIO() as message: + message.write( + textwrap.dedent( + f"""\ + ============================ + CSAN detected a possible data race on tensor with data pointer {self.data_ptr} + Access by stream {self.current_access.stream} during kernel: + {self.current_access.operator} + {self.current_access.type} argument: {', '.join(self.current_access.names)} + With stack trace: + """ + ) + ) + message.write(f"{''.join(self.current_access.stack_trace.format())}\n") + message.write( + textwrap.dedent( + f"""\ + Previous access by stream {self.previous_access.stream} during kernel: + {self.previous_access.operator} + {self.previous_access.type} argument: {', '.join(self.previous_access.names)} + With stack trace: + """ + ) + ) + message.write(f"{''.join(self.previous_access.stack_trace.format())}\n") + if self.allocation_stack_trace: + message.write( + "Tensor was allocated with stack trace:\n" + f"{''.join(self.allocation_stack_trace.format())}" + ) + else: + message.write("Trace for tensor allocation not found.") + return message.getvalue() + + +class CUDASanitizerErrors(Exception): + """Wrapper class for errors reported by CUDA Sanitizer.""" + + def __init__(self, errors: List[SynchronizationError]): + self.errors = errors + + def __str__(self): + return f"detected {len(self.errors)} errors" + + +def format_log_message(message: str) -> str: + return " ".join(line.strip() for line in message.strip().splitlines()) + + +@dataclass +class TensorInfo: + r"""Stores information about a single tensor and recent accesses to it. + + Args: + allocation_stack_trace: the stack summary object captured during tensor + allocation. Can be ``None`` if the allocation wasn't caught by CSAN. + reads: list of read accesses to the tensor that were performed since + the last write. + write: the last write access to the tensor. + """ + allocation_stack_trace: Optional[traceback.StackSummary] + reads: List[Access] = field(default_factory=list) + write: Optional[Access] = None + + +class _TensorsAccessed: + def __init__(self): + self.accesses: Dict[DataPtr, TensorInfo] = {} + + def ensure_tensor_exists(self, data_ptr: DataPtr) -> None: + if data_ptr not in self.accesses: + logger.info( + format_log_message( + f""" + Found tensor with pointer: {data_ptr}, but no matching tensor + allocation in the trace. Backfilling the trace now. 
+ Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.create_tensor(data_ptr, None) + + def ensure_tensor_does_not_exist(self, data_ptr: DataPtr) -> None: + if data_ptr in self.accesses: + logger.info( + format_log_message( + f""" + Found duplicate tensor allocation in the trace for tensor with + pointer: {data_ptr}. Assuming the trace for tensor deallocation + wasn't caught and backfilling it now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.delete_tensor(data_ptr) + + def create_tensor( + self, data_ptr: DataPtr, stack_trace: Optional[traceback.StackSummary] + ) -> None: + self.accesses[data_ptr] = TensorInfo(stack_trace) + + def delete_tensor(self, data_ptr: DataPtr) -> None: + del self.accesses[data_ptr] + + def were_there_reads_since_last_write(self, data_ptr: DataPtr) -> bool: + return True if self.accesses[data_ptr].reads else False + + def get_allocation_stack_trace( + self, data_ptr: DataPtr + ) -> Optional[traceback.StackSummary]: + return self.accesses[data_ptr].allocation_stack_trace + + def get_write(self, data_ptr: DataPtr) -> Optional[Access]: + return self.accesses[data_ptr].write + + def get_reads(self, data_ptr: DataPtr) -> List[Access]: + return self.accesses[data_ptr].reads + + def add_read(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].reads.append(access) + + def set_write(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].write = access + self.accesses[data_ptr].reads = [] + + +class StreamSynchronizations: + def __init__(self): + self.current_sync_states: Dict[StreamId, Dict[StreamId, SeqNum]] = {} + self.recorded_sync_states: Dict[EventId, Dict[StreamId, SeqNum]] = {} + + def _ensure_stream_exists(self, stream: StreamId) -> None: + if stream not in self.current_sync_states: + logger.info( + format_log_message( + f""" + Found Stream with id: {stream}, but no matching stream + creation in the trace. Backfilling the trace now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.create_stream(stream) + + def _ensure_event_exists(self, event: EventId) -> None: + if event not in self.recorded_sync_states: + logger.info( + format_log_message( + f""" + Found Event with id: {event}, but no matching event + creation in the trace. Backfilling the trace now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.create_event(event) + + def _ensure_event_does_not_exist(self, event: EventId) -> None: + if event in self.recorded_sync_states: + logger.info( + format_log_message( + f""" + Found duplicate event creation in the trace for event with + id: {event}. Assuming the trace for event deletion wasn't caught + and backfilling it now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.delete_event(event) + + def create_stream(self, stream: StreamId) -> None: + if stream in self.current_sync_states: + logger.info( + format_log_message( + f""" + Found duplicate Stream creation in the trace for Stream with + id: {stream}. PyTorch Streams are only created once, so this + trace entry is ignored. 
+ """ + ) + ) + else: + self.current_sync_states[stream] = {} + + def create_event(self, event: EventId) -> None: + self._ensure_event_does_not_exist(event) + self.recorded_sync_states[event] = {} + + def delete_event(self, event: EventId) -> None: + self._ensure_event_exists(event) + del self.recorded_sync_states[event] + + def update_seq_num(self, stream: StreamId, seq_num: SeqNum) -> None: + self._ensure_stream_exists(stream) + self.current_sync_states[stream][stream] = seq_num + + def record_state(self, event: EventId, stream: StreamId) -> None: + self._ensure_event_exists(event) + self._ensure_stream_exists(stream) + self.recorded_sync_states[event] = self.current_sync_states[stream].copy() + + def state_wait_for_event(self, stream: StreamId, event: EventId) -> None: + self._ensure_event_exists(event) + self._ensure_stream_exists(stream) + for other_stream, seq_num in self.recorded_sync_states[event].items(): + self.current_sync_states[stream][other_stream] = max( + self.current_sync_states[stream].get(other_stream, -1), seq_num + ) + + def is_ordered_after( + self, current_stream: StreamId, seq_num: SeqNum, other_stream: StreamId + ) -> bool: + self._ensure_stream_exists(current_stream) + self._ensure_stream_exists(other_stream) + return seq_num <= self.current_sync_states[current_stream].get(other_stream, -1) + + +class EventHandler: + """Analyzes CSAN trace for synchronization errors. + + Stores information on each stream's synchronizations with other streams as well + as tensor accesses to determine whether a given kernel launch might cause a + data race. + """ + + def __init__(self): + self.tensors_accessed = _TensorsAccessed() + self.syncs = StreamSynchronizations() + self.seq_num: SeqNum = 0 + + def _handle_kernel_launch( + self, + stream: StreamId, + read_only: List[DataPtr], + read_write: List[DataPtr], + operator: str, + tensor_names: Dict[int, List[str]], + ) -> List[SynchronizationError]: + def check_conflict( + data_ptr: DataPtr, current_access: Access, previous_access: Optional[Access] + ) -> None: + if previous_access is None: + return + if not self.syncs.is_ordered_after( + current_access.stream, previous_access.seq_num, previous_access.stream + ): + error_list.append( + UnsynchronizedAccessError( + data_ptr, + self.tensors_accessed.get_allocation_stack_trace(data_ptr), + current_access, + previous_access, + ) + ) + + error_list: List[SynchronizationError] = [] + self.seq_num += 1 + self.syncs.update_seq_num(stream, self.seq_num) + stack_trace = traceback.StackSummary.extract( + traceback.walk_stack(None), lookup_lines=False + ) + + for data_ptr in read_only: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.READ, + self.seq_num, + stream, + operator, + tensor_names[data_ptr], + stack_trace, + ) + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.add_read(data_ptr, current_access) + + for data_ptr in read_write: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.WRITE, + self.seq_num, + stream, + operator, + tensor_names[data_ptr], + stack_trace, + ) + if self.tensors_accessed.were_there_reads_since_last_write(data_ptr): + for previous_access in self.tensors_accessed.get_reads(data_ptr): + check_conflict(data_ptr, current_access, previous_access) + else: + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.set_write(data_ptr, current_access) + + return 
error_list + + def _handle_event_creation(self, event: EventId) -> None: + self.syncs.create_event(event) + + def _handle_event_deletion(self, event: EventId) -> None: + self.syncs.delete_event(event) + + def _handle_event_record(self, event: EventId, stream: StreamId) -> None: + self.syncs.record_state(event, stream) + + def _handle_event_wait(self, event: EventId, stream: StreamId) -> None: + self.syncs.state_wait_for_event(stream, event) + + def _handle_memory_allocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_does_not_exist(data_ptr) + self.tensors_accessed.create_tensor( + data_ptr, + traceback.StackSummary.extract( + traceback.walk_stack(None), lookup_lines=False + ), + ) + + def _handle_memory_deallocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + self.tensors_accessed.delete_tensor(data_ptr) + + def _handle_stream_creation(self, stream: StreamId) -> None: + self.syncs.create_stream(stream) + + +def zip_by_key(a: Dict[TK, TVa], b: Dict[TK, TVb]) -> Iterator[Tuple[TK, TVa, TVb]]: + for arg, value in a.items(): + if arg in b: + yield arg, value, b[arg] + + +def zip_arguments( + schema: torch.FunctionSchema, args: Tuple[Any, ...], kwargs: Dict[str, Any] +) -> Iterator[Tuple[torch.Argument, Any]]: + schema_args = schema.arguments[: len(args)] + schema_kwargs = {arg.name: arg for arg in schema.arguments[len(args) :]} + + yield from zip(schema_args, args) + + for _, argument, value in zip_by_key(schema_kwargs, kwargs): + yield (argument, value) + + +class ArgumentHandler: + def __init__(self): + self.dataptrs_read: Set[int] = set() + self.dataptrs_written: Set[int] = set() + self.tensor_names: Dict[int, List[str]] = dict() + + def _handle_argument(self, value: Any, is_write: bool, name: str) -> None: + if isinstance(value, torch.Tensor) and value.is_cuda: + data_ptr = value.data_ptr() + if is_write: + self.dataptrs_written.add(data_ptr) + else: + self.dataptrs_read.add(data_ptr) + self.tensor_names.setdefault(data_ptr, []).append(name) + + def parse_inputs( + self, + schema: torch.FunctionSchema, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> None: + for argument, value in zip_arguments(schema, args, kwargs): + is_write = argument.alias_info is not None and argument.alias_info.is_write + tree_map( + functools.partial( + self._handle_argument, is_write=is_write, name=argument.name + ), + value, + ) + + def parse_outputs(self, outputs: Any) -> None: + tree_map( + functools.partial(self._handle_argument, is_write=True, name="output"), + outputs, + ) + + +class CUDASanitizerDispatchMode(TorchDispatchMode): + def __init__(self): + self.event_handler = EventHandler() + torch._C._activate_cuda_trace() + cuda_trace.register_callback_for_cuda_event_creation( + self.event_handler._handle_event_creation + ) + cuda_trace.register_callback_for_cuda_event_deletion( + self.event_handler._handle_event_deletion + ) + cuda_trace.register_callback_for_cuda_event_record( + self.event_handler._handle_event_record + ) + cuda_trace.register_callback_for_cuda_event_wait( + self.event_handler._handle_event_wait + ) + cuda_trace.register_callback_for_cuda_memory_allocation( + self.event_handler._handle_memory_allocation + ) + cuda_trace.register_callback_for_cuda_memory_deallocation( + self.event_handler._handle_memory_deallocation + ) + cuda_trace.register_callback_for_cuda_stream_creation( + self.event_handler._handle_stream_creation + ) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is 
None: + kwargs = {} + + argument_handler = ArgumentHandler() + argument_handler.parse_inputs(func._schema, args, kwargs) + + outputs = func(*args, **kwargs) + + argument_handler.parse_outputs(outputs) + errors = self.event_handler._handle_kernel_launch( + torch.cuda.current_stream().cuda_stream, + list(argument_handler.dataptrs_read - argument_handler.dataptrs_written), + list(argument_handler.dataptrs_written), + func._schema, + argument_handler.tensor_names, + ) + if errors: + for error in errors: + print(error, file=sys.stderr) + raise CUDASanitizerErrors(errors) + + return outputs + + +class CUDASanitizer: + """Manages the lifetime of a CUDASanitizer dispatch mode object. + + The CUDASanitizer class wraps the entering/exiting functions of the dispatch mode + context manager in the enable function/destructor, respectively. This is to + explicitly set the lifetime of the dispatch mode object to that of the application. + This approach was deemed more elegant than using the atexit module. + """ + + def __init__(self): + self.dispatch = CUDASanitizerDispatchMode() + self.enabled = False + + def enable(self): + self.dispatch.__enter__() + self.enabled = True + + def __del__(self): + if self.enabled: + self.dispatch.__exit__(None, None, None) + + +def enable_cuda_sanitizer(): + """Enables CUDA Sanitizer. + + The sanitizer will begin to analyze low-level CUDA calls invoked by torch functions + for synchronization errors. All data races found will be printed to the standard + error output along with stack traces of suspected causes. For best results, the + sanitizer should be enabled at the very beginning of the program. + """ + cuda_sanitizer.enable() + + +cuda_sanitizer = CUDASanitizer()
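
For reference, a minimal self-contained sketch of the workflow this patch enables (assuming a CUDA-capable machine): it spells out the side stream used in the example at the top of the commit message, turns the sanitizer on through the new `enable_cuda_sanitizer()` entry point (setting the `TORCH_CUDA_SANITIZER` environment variable before launch has the same effect), and triggers the reported race, which is printed to stderr and raised as `CUDASanitizerErrors`.

```
import torch
import torch.cuda._sanitizer as csan

# Enable the sanitizer before any CUDA work is issued; for best results this
# should happen at the very beginning of the program.
csan.enable_cuda_sanitizer()

second_stream = torch.cuda.Stream()

# The tensor is written by a kernel launched on the default stream.
a = torch.rand(4, 2, device="cuda")

with torch.cuda.stream(second_stream):
    # Unsynchronized write on a different stream: the dispatch mode reports the
    # data race on stderr and raises CUDASanitizerErrors.
    torch.mul(a, 5, out=a)
```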