Skip to content

Commit

Permalink
Added debug_run.py section is docs and done some tweaks on it.
Browse files Browse the repository at this point in the history
  • Loading branch information
dlibenzi committed Dec 2, 2019
1 parent 7258805 commit ae9f540
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 7 deletions.
29 changes: 29 additions & 0 deletions TROUBLESHOOTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,4 +237,33 @@ The, given the PID, it is possible to grab the stack traces with the following c
./scripts/dump_stacks.py PID > /tmp/stack-traces.log
```

## Using debug_run.py To Collect Debug Information

A utility is provided in `scripts/debug_run.py` which can be used to create a `tar.gz`
archive with the information required to debug _PyTorch/XLA_ executions.

Example:

```Shell
./scripts/debug_run.py --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...]
```

The _python_ `-u` flag is suggested to disable buffering so that captured logs are correctly
interleaved (otherwise STDOUT will be rendered after all STDERR).

The above command line example will leave the temporary folder containing the archived
information on the filesystem. Use the `--tidy` flag to have that removed on exit:

```Shell
./scripts/debug_run.py --tidy --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...]
```

The `debug_run.tar.gz` file should then be attached to bug reports when necessary.

Since the script will collect a lot of data, it should usually be left running for no more
than a hundred steps or so.
If the SCRIPT has arguments to control the number of steps, those should be used;
otherwise, pressing `CTRL+C` will interrupt the run.

It is also suggested to run in single-core mode, to minimize the amount of data.
Running in single-core mode is also strongly suggested when debugging execution issues.
60 changes: 53 additions & 7 deletions scripts/debug_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,23 @@

import argparse
import copy
import getpass
import glob
import os
import re
import shutil
import signal
import socket
import subprocess
import sys
import time
import tempfile

# Flag polled by the process monitoring loop (`while not _QUIT ...`) so that
# the run can be stopped early.
# NOTE(review): presumably flipped to True by term_handler(); its body is not
# visible in this view -- confirm.
_QUIT = False
# Baseline NAME=LEVEL verbose-logging entries exported to the child process
# through TF_CPP_VMODULE. Entries passed with --vmodule are appended to these
# by build_vmodule().
_DEFAULT_VMODULE = [
    'tensor=5', 'computation_client=5', 'xrt_computation_client=5',
    'aten_xla_type=1'
]


def term_handler(signum, frame):
Expand Down Expand Up @@ -54,16 +60,31 @@ def get_first_file(path):
return path if os.path.isfile(path) else None


def build_vmodule(args, default):
  """Builds the comma-separated value used for the TF_CPP_VMODULE variable.

  Args:
    args: Parsed command line namespace. Only `args.vmodule` is read; it is
      either falsy (no extra entries) or a comma-separated list of NAME=LEVEL
      items.
    default: Iterable of NAME=LEVEL strings which are always included. It is
      copied, never modified.

  Returns:
    A string holding the default entries followed by the user supplied ones,
    joined by commas.
  """
  # Work on a copy so the caller's list is left untouched.
  entries = list(default)
  if args.vmodule:
    # Tolerate input such as "a=1, b=2," by trimming blanks around each item
    # and dropping empty ones, which would otherwise yield malformed entries
    # (leading spaces, doubled or trailing commas) in the final value.
    stripped = (item.strip() for item in args.vmodule.split(','))
    entries.extend(item for item in stripped if item)
  return ','.join(entries)


def show_env(env, fd=None):
  """Prints the XLA related variables found in an environment mapping.

  Only the keys starting with `XLA_`, `XRT_` or `TF_` are listed, in the
  iteration order of `env`.

  Args:
    env: Mapping of environment variable names to values.
    fd: Text stream to write to. Defaults to the `sys.stdout` in effect at
      call time.
  """
  # Resolve the default stream here rather than in the signature: a
  # `fd=sys.stdout` default is captured once, when the function is defined,
  # and would ignore any later redirection of sys.stdout.
  if fd is None:
    fd = sys.stdout
  print('XLA Environment:', file=fd)
  for k, v in env.items():
    if k.startswith(('XLA_', 'XRT_', 'TF_')):
      print(' {}={}'.format(k, v), file=fd)


def create_env(args):
  """Creates the environment handed to the debugged command.

  The current process environment is duplicated and extended with the
  variables which enable IR/HLO debug information, verbose module logging,
  and the saving of graphs and metrics inside `args.outdir`. The XLA related
  part of the result is printed with show_env().

  Args:
    args: Parsed command line namespace. Reads `args.outdir`, `args.hlo` and
      (through build_vmodule()) `args.vmodule`.

  Returns:
    A plain dict with the environment for the child process.
  """
  # os.environ.copy() returns an independent dict. copy.copy(os.environ)
  # instead produces another os._Environ sharing its backing storage with
  # os.environ, so the assignments below would also alter the environment of
  # this very process.
  env = os.environ.copy()
  env['XLA_IR_DEBUG'] = '1'
  env['XLA_HLO_DEBUG'] = '1'
  env['TF_CPP_LOG_THREAD_ID'] = '1'
  env['TF_CPP_VMODULE'] = build_vmodule(args, _DEFAULT_VMODULE)
  env['XLA_SAVE_TENSORS_FILE'] = get_graphs_file_path(args.outdir)
  if args.hlo:
    env['XLA_SAVE_TENSORS_FMT'] = 'hlo'
  env['XLA_METRICS_FILE'] = get_metrics_file_path(args.outdir)
  show_env(env)
  return env


Expand All @@ -79,9 +100,22 @@ def grab_graphs(args):
fd.write(report)


def create_temp_folder():
  """Creates a new, uniquely named folder inside the system temp directory.

  The folder is named `debug_run-HOST-USER`, with a `-N` suffix (N = 0, 1,
  ...) appended when the previous candidates are already taken.

  Returns:
    The path of the folder which has been created.
  """
  # The host/user part does not change between attempts, compute it once.
  base_name = 'debug_run-{}-{}'.format(socket.gethostname(), getpass.getuser())
  temp_root = tempfile.gettempdir()
  seq = -1
  while True:
    dir_name = base_name if seq < 0 else '{}-{}'.format(base_name, seq)
    temp_folder = os.path.join(temp_root, dir_name)
    try:
      # Create-and-check in one step: testing for existence first is racy and
      # fails when the path exists but is not a directory. The folder is made
      # owner-only (as tempfile.mkdtemp() does) since it holds debug data
      # under a predictable name.
      os.mkdir(temp_folder, 0o700)
    except FileExistsError:
      seq += 1
    else:
      return temp_folder


def setup_outdir(args):
if args.outdir is None:
args.outdir = tempfile.mkdtemp()
args.outdir = create_temp_folder()
print('Writing run results to {}'.format(args.outdir), file=sys.stderr)
elif os.path.isdir(args.outdir):
raise RuntimeError('Output folder must not exist: {}'.format(args.outdir))
Expand All @@ -90,16 +124,19 @@ def setup_outdir(args):


def targz(folder, tarfile):
  """Archives a folder into a gzip-compressed tar file using the `tar` tool.

  The archive stores the folder under its own name only (not under its full
  path), by running `tar` from the parent directory of the folder.

  Args:
    folder: Path of the folder to be archived.
    tarfile: Path of the archive file to be created.

  Raises:
    RuntimeError: If the `tar` command exits with a non-zero status.
  """
  # Absolute paths keep the command well formed for any input: abspath()
  # drops a trailing separator (which would leave an empty member name) and
  # guarantees a non-empty parent directory for `-C`. The archive path is
  # made absolute too, so it cannot be affected by the directory change.
  source_path = os.path.abspath(folder)
  archive_path = os.path.abspath(tarfile)
  dirbase, dirname = os.path.split(source_path)
  if subprocess.call(['tar', '-C', dirbase, '-czf', archive_path,
                      dirname]) != 0:
    raise RuntimeError('Failed to create folder {} archive into {}'.format(
        folder, tarfile))


def read_proc_output(logfd, offset):
def read_proc_output(logfd, offset, outfd=None):
size = os.fstat(logfd).st_size
if size > offset:
data = os.pread(logfd, size - offset, offset)
sys.stdout.write(data.decode('utf-8'))
if outfd is not None:
os.write(outfd, data)
offset = size
else:
data = None
Expand All @@ -115,10 +152,13 @@ def run_and_monitor(args):
args.cmdline, stdout=logfd, stderr=subprocess.STDOUT, env=env)

while not _QUIT and proc.poll() is None:
time.sleep(1.0)
offset, _ = read_proc_output(logfd, offset)
offset, data = read_proc_output(logfd, offset, outfd=sys.stdout.fileno())
if data is None:
time.sleep(1.0)

proc.terminate()
proc.wait()
read_proc_output(logfd, offset, outfd=sys.stdout.fileno())
os.close(logfd)


Expand Down Expand Up @@ -156,6 +196,12 @@ def run_binary(args):
type=str,
default=None,
help='The location of the tar.gz debug report file')
arg_parser.add_argument(
'--vmodule',
type=str,
default=None,
help='Extra --vmodule files to be added. A list of comma-separated NAME=LEVEL'
)
arg_parser.add_argument('cmdline', nargs='+')

args = arg_parser.parse_args()
Expand Down

0 comments on commit ae9f540

Please sign in to comment.