Added debug_run.py section in docs and made some tweaks to it.

huamichaelchen · Dec 2, 2019 · ae9f540 · ae9f540
1 parent 7258805
commit ae9f540
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 7 deletions.
diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md
@@ -237,4 +237,33 @@ The, given the PID, it is possible to grab the stack traces with the following c
 ./scripts/dump_stacks.py PID > /tmp/stack-traces.log
 ```
 
+## Using debug_run.py To Collect Debug Information
 
+A utility is provided in `scripts/debug_run.py` which can be used to create a `tar.gz`
+archive with the information required to debug _PyTorch/XLA_ executions.
+
+Example:
+
+```Shell
+./scripts/debug_run.py --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...]
+```
+
+The _python_ `-u` flag is suggested to disable buffering so that captured logs are correctly
+interleaved (otherwise STDOUT will be rendered after all STDERR).
+
+The above command line example will leave the temporary folder containing the archived
+information on the filesystem. Use the `--tidy` flag to have that removed on exit:
+
+```Shell
+./scripts/debug_run.py --tidy --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...]
+```
+
+The `debug_run.tar.gz` file should then be attached to bug reports when necessary.
+
+Since the script will collect a lot of data, it should usually be left to run for no more
+than a hundred steps or so.
+If the SCRIPT has arguments to control the number of steps, those should be used;
+otherwise, hitting `CTRL+C` will interrupt the run.
+
+It is also suggested to run in single-core mode, to minimize the amount of data.
+Running in single-core mode is also strongly suggested when debugging execution issues.
diff --git a/scripts/debug_run.py b/scripts/debug_run.py
@@ -4,17 +4,23 @@
 
 import argparse
 import copy
+import getpass
 import glob
 import os
 import re
 import shutil
 import signal
+import socket
 import subprocess
 import sys
 import time
 import tempfile
 
 _QUIT = False
+_DEFAULT_VMODULE = [
+    'tensor=5', 'computation_client=5', 'xrt_computation_client=5',
+    'aten_xla_type=1'
+]
 
 
 def term_handler(signum, frame):
@@ -54,16 +60,31 @@ def get_first_file(path):
   return path if os.path.isfile(path) else None
 
 
+def build_vmodule(args, default):
+  default = list(default)
+  if args.vmodule:
+    default += args.vmodule.split(',')
+  return ','.join(default)
+
+
+def show_env(env, fd=sys.stdout):
+  print('XLA Environment:', file=fd)
+  for k, v in env.items():
+    if re.match(r'(XLA_|XRT_|TF_)', k):
+      print('  {}={}'.format(k, v), file=fd)
+
+
 def create_env(args):
   env = copy.copy(os.environ)
   env['XLA_IR_DEBUG'] = '1'
   env['XLA_HLO_DEBUG'] = '1'
   env['TF_CPP_LOG_THREAD_ID'] = '1'
-  env['TF_CPP_VMODULE'] = 'tensor=5'
+  env['TF_CPP_VMODULE'] = build_vmodule(args, _DEFAULT_VMODULE)
   env['XLA_SAVE_TENSORS_FILE'] = get_graphs_file_path(args.outdir)
   if args.hlo:
     env['XLA_SAVE_TENSORS_FMT'] = 'hlo'
   env['XLA_METRICS_FILE'] = get_metrics_file_path(args.outdir)
+  show_env(env)
   return env
 
 
@@ -79,9 +100,22 @@ def grab_graphs(args):
       fd.write(report)
 
 
+def create_temp_folder():
+  seq = -1
+  while True:
+    dir_name = 'debug_run-{}-{}'.format(socket.gethostname(), getpass.getuser())
+    if seq >= 0:
+      dir_name += '-{}'.format(seq)
+    temp_folder = os.path.join(tempfile.gettempdir(), dir_name)
+    if not os.path.isdir(temp_folder):
+      os.mkdir(temp_folder)
+      return temp_folder
+    seq += 1
+
+
 def setup_outdir(args):
   if args.outdir is None:
-    args.outdir = tempfile.mkdtemp()
+    args.outdir = create_temp_folder()
     print('Writing run results to {}'.format(args.outdir), file=sys.stderr)
   elif os.path.isdir(args.outdir):
     raise RuntimeError('Output folder must not exist: {}'.format(args.outdir))
@@ -90,16 +124,19 @@ def setup_outdir(args):
 
 
 def targz(folder, tarfile):
-  if subprocess.call(['tar', 'czf', tarfile, folder]) != 0:
+  dirbase = os.path.dirname(folder)
+  dirname = os.path.basename(folder)
+  if subprocess.call(['tar', '-C', dirbase, '-czf', tarfile, dirname]) != 0:
     raise RuntimeError('Failed to create folder {} archive into {}'.format(
         folder, tarfile))
 
 
-def read_proc_output(logfd, offset):
+def read_proc_output(logfd, offset, outfd=None):
   size = os.fstat(logfd).st_size
   if size > offset:
     data = os.pread(logfd, size - offset, offset)
-    sys.stdout.write(data.decode('utf-8'))
+    if outfd is not None:
+      os.write(outfd, data)
     offset = size
   else:
     data = None
@@ -115,10 +152,13 @@ def run_and_monitor(args):
       args.cmdline, stdout=logfd, stderr=subprocess.STDOUT, env=env)
 
   while not _QUIT and proc.poll() is None:
-    time.sleep(1.0)
-    offset, _ = read_proc_output(logfd, offset)
+    offset, data = read_proc_output(logfd, offset, outfd=sys.stdout.fileno())
+    if data is None:
+      time.sleep(1.0)
 
   proc.terminate()
+  proc.wait()
+  read_proc_output(logfd, offset, outfd=sys.stdout.fileno())
   os.close(logfd)
 
 
@@ -156,6 +196,12 @@ def run_binary(args):
       type=str,
       default=None,
       help='The location of the tar.gz debug report file')
+  arg_parser.add_argument(
+      '--vmodule',
+      type=str,
+      default=None,
+      help='Extra --vmodule files to be added. A list of comma-separated NAME=LEVEL'
+  )
   arg_parser.add_argument('cmdline', nargs='+')
 
   args = arg_parser.parse_args()