From ae9f5402d166a3f7c08f930ec112490c89b0a75e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 28 Nov 2019 09:44:12 -0800 Subject: [PATCH] Added debug_run.py section in docs and made some tweaks to it. --- TROUBLESHOOTING.md | 29 +++++++++++++++++++++ scripts/debug_run.py | 60 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 15542a2a001..eedec835310 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -237,4 +237,33 @@ The, given the PID, it is possible to grab the stack traces with the following c ./scripts/dump_stacks.py PID > /tmp/stack-traces.log ``` +## Using debug_run.py To Collect Debug Information +A utility is provided in `scripts/debug_run.py` which can be used to create a `tar.gz` +archive with the information required to debug _PyTorch/XLA_ executions. + +Example: + +```Shell +./scripts/debug_run.py --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...] +``` + +The _python_ `-u` flag is suggested to disable buffering so that captured logs are correctly +interleaved (otherwise STDOUT will be rendered after all STDERR). + +The above command line example will leave the temporary folder containing the archived +information on the filesystem. Use the `--tidy` flag to have that removed on exit: + +```Shell +./scripts/debug_run.py --tidy --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...] +``` + +The `debug_run.tar.gz` file should then be attached to bug reports when necessary. + +Since the script will collect a lot of data, it should usually be left to run for no more +than a hundred steps or so. +If the SCRIPT has arguments to control the number of steps, those should be used, +otherwise hitting `CTRL+C` will interrupt the run. + +It is also suggested to run in single-core mode, to minimize the amount of data. +Running in single-core mode is also strongly suggested when debugging execution issues. 
diff --git a/scripts/debug_run.py b/scripts/debug_run.py index c3d8d42648d..5980ee1ce99 100755 --- a/scripts/debug_run.py +++ b/scripts/debug_run.py @@ -4,17 +4,23 @@ import argparse import copy +import getpass import glob import os import re import shutil import signal +import socket import subprocess import sys import time import tempfile _QUIT = False +_DEFAULT_VMODULE = [ + 'tensor=5', 'computation_client=5', 'xrt_computation_client=5', + 'aten_xla_type=1' +] def term_handler(signum, frame): @@ -54,16 +60,31 @@ def get_first_file(path): return path if os.path.isfile(path) else None +def build_vmodule(args, default): + default = list(default) + if args.vmodule: + default += args.vmodule.split(',') + return ','.join(default) + + +def show_env(env, fd=sys.stdout): + print('XLA Environment:', file=fd) + for k, v in env.items(): + if re.match(r'(XLA_|XRT_|TF_)', k): + print(' {}={}'.format(k, v), file=fd) + + def create_env(args): env = copy.copy(os.environ) env['XLA_IR_DEBUG'] = '1' env['XLA_HLO_DEBUG'] = '1' env['TF_CPP_LOG_THREAD_ID'] = '1' - env['TF_CPP_VMODULE'] = 'tensor=5' + env['TF_CPP_VMODULE'] = build_vmodule(args, _DEFAULT_VMODULE) env['XLA_SAVE_TENSORS_FILE'] = get_graphs_file_path(args.outdir) if args.hlo: env['XLA_SAVE_TENSORS_FMT'] = 'hlo' env['XLA_METRICS_FILE'] = get_metrics_file_path(args.outdir) + show_env(env) return env @@ -79,9 +100,22 @@ def grab_graphs(args): fd.write(report) +def create_temp_folder(): + seq = -1 + while True: + dir_name = 'debug_run-{}-{}'.format(socket.gethostname(), getpass.getuser()) + if seq >= 0: + dir_name += '-{}'.format(seq) + temp_folder = os.path.join(tempfile.gettempdir(), dir_name) + if not os.path.isdir(temp_folder): + os.mkdir(temp_folder) + return temp_folder + seq += 1 + + def setup_outdir(args): if args.outdir is None: - args.outdir = tempfile.mkdtemp() + args.outdir = create_temp_folder() print('Writing run results to {}'.format(args.outdir), file=sys.stderr) elif os.path.isdir(args.outdir): raise 
RuntimeError('Output folder must not exist: {}'.format(args.outdir)) @@ -90,16 +124,19 @@ def setup_outdir(args): def targz(folder, tarfile): - if subprocess.call(['tar', 'czf', tarfile, folder]) != 0: + dirbase = os.path.dirname(folder) + dirname = os.path.basename(folder) + if subprocess.call(['tar', '-C', dirbase, '-czf', tarfile, dirname]) != 0: raise RuntimeError('Failed to create folder {} archive into {}'.format( folder, tarfile)) -def read_proc_output(logfd, offset): +def read_proc_output(logfd, offset, outfd=None): size = os.fstat(logfd).st_size if size > offset: data = os.pread(logfd, size - offset, offset) - sys.stdout.write(data.decode('utf-8')) + if outfd is not None: + os.write(outfd, data) offset = size else: data = None @@ -115,10 +152,13 @@ def run_and_monitor(args): args.cmdline, stdout=logfd, stderr=subprocess.STDOUT, env=env) while not _QUIT and proc.poll() is None: - time.sleep(1.0) - offset, _ = read_proc_output(logfd, offset) + offset, data = read_proc_output(logfd, offset, outfd=sys.stdout.fileno()) + if data is None: + time.sleep(1.0) proc.terminate() + proc.wait() + read_proc_output(logfd, offset, outfd=sys.stdout.fileno()) os.close(logfd) @@ -156,6 +196,12 @@ def run_binary(args): type=str, default=None, help='The location of the tar.gz debug report file') + arg_parser.add_argument( + '--vmodule', + type=str, + default=None, + help='Extra --vmodule files to be added. A list of comma-separated NAME=LEVEL' + ) arg_parser.add_argument('cmdline', nargs='+') args = arg_parser.parse_args()