Merge pull request #686 from rsepassi/push

v1.5.6
tensorflow · Apr 5, 2018 · 9053259 · 9053259
2 parents 7d1c9ea + 160bed3
commit 9053259
Show file tree

Hide file tree

Showing 53 changed files with 2,413 additions and 533 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -11,12 +11,15 @@ env:
     - TF_VERSION="1.4.*"
     - TF_VERSION="1.5.*"
     - TF_VERSION="1.6.*"
+    - TF_VERSION="1.7.*"
 matrix:
   exclude:
     - python: "3.6"
       env: TF_VERSION="1.4.*"
     - python: "3.6"
       env: TF_VERSION="1.5.*"
+    - python: "3.6"
+      env: TF_VERSION="1.6.*"
 before_install:
   - echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list
   - curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add -

diff --git a/README.md b/README.md
@@ -355,6 +355,23 @@ README](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/da
 
 ## Papers
 
+When referencing Tensor2Tensor, please cite [this
+paper](https://arxiv.org/abs/1803.07416).
+
+```
+@article{tensor2tensor,
+  author    = {Ashish Vaswani and Samy Bengio and Eugene Brevdo and
+    Francois Chollet and Aidan N. Gomez and Stephan Gouws and Llion Jones and
+    \L{}ukasz Kaiser and Nal Kalchbrenner and Niki Parmar and Ryan Sepassi and
+    Noam Shazeer and Jakob Uszkoreit},
+  title     = {Tensor2Tensor for Neural Machine Translation},
+  journal   = {CoRR},
+  volume    = {abs/1803.07416},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1803.07416},
+}
+```
+
 Tensor2Tensor was used to develop a number of state-of-the-art models
 and deep learning methods. Here we list some papers that were based on T2T
 from the start and benefited from its features and architecture in ways
@@ -370,5 +387,7 @@ T2T](https://research.googleblog.com/2017/06/accelerating-deep-learning-research
    Sequences](https://arxiv.org/abs/1801.10198)
 * [Image Transformer](https://arxiv.org/abs/1802.05751)
 * [Training Tips for the Transformer Model](https://arxiv.org/abs/1804.00247)
+* [Self-Attention with Relative Position Representations](https://arxiv.org/abs/1803.02155)
+* [Fast Decoding in Sequence Models using Discrete Latent Variables](https://arxiv.org/abs/1803.03382)
 
 *Note: This is not an official Google product.*
diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md
@@ -28,12 +28,14 @@ machines with 4 or 8 GPUs.
 You can additionally pass the `--cloud_mlengine_master_type` to select another
 kind of machine (see the [docs for
 `masterType`](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput)
-for options, including 
-[ML Engine machine types](https://cloud.google.com/ml-engine/docs/training-overview)
+for options, including
+[ML Engine machine
+types](https://cloud.google.com/ml-engine/docs/training-overview)
 and their
 [specs](https://cloud.google.com/compute/docs/machine-types)).
 If you provide this flag yourself, make sure you pass the
-correct value for `--worker_gpu` (for non-GPU machines, you must explicitly pass `--worker_gpu=0`).
+correct value for `--worker_gpu` (for non-GPU machines, you should pass
+`--worker_gpu=0`).
 
 **Note**: `t2t-trainer` only currently supports launching with single machines,
 possibly with multiple GPUs. Multi-machine setups are not yet supported out of

diff --git a/docs/walkthrough.md b/docs/walkthrough.md
@@ -355,6 +355,23 @@ README](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/da
 
 ## Papers
 
+When referencing Tensor2Tensor, please cite [this
+paper](https://arxiv.org/abs/1803.07416).
+
+```
+@article{tensor2tensor,
+  author    = {Ashish Vaswani and Samy Bengio and Eugene Brevdo and
+    Francois Chollet and Aidan N. Gomez and Stephan Gouws and Llion Jones and
+    \L{}ukasz Kaiser and Nal Kalchbrenner and Niki Parmar and Ryan Sepassi and
+    Noam Shazeer and Jakob Uszkoreit},
+  title     = {Tensor2Tensor for Neural Machine Translation},
+  journal   = {CoRR},
+  volume    = {abs/1803.07416},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1803.07416},
+}
+```
+
 Tensor2Tensor was used to develop a number of state-of-the-art models
 and deep learning methods. Here we list some papers that were based on T2T
 from the start and benefited from its features and architecture in ways
@@ -370,5 +387,7 @@ T2T](https://research.googleblog.com/2017/06/accelerating-deep-learning-research
    Sequences](https://arxiv.org/abs/1801.10198)
 * [Image Transformer](https://arxiv.org/abs/1802.05751)
 * [Training Tips for the Transformer Model](https://arxiv.org/abs/1804.00247)
+* [Self-Attention with Relative Position Representations](https://arxiv.org/abs/1803.02155)
+* [Fast Decoding in Sequence Models using Discrete Latent Variables](https://arxiv.org/abs/1803.03382)
 
 *Note: This is not an official Google product.*
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.5.5',
+    version='1.5.6',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',

diff --git a/tensor2tensor/bin/t2t_avg_all.py b/tensor2tensor/bin/t2t_avg_all.py
@@ -19,7 +19,6 @@
 from __future__ import print_function
 
 from collections import deque
-import logging
 import os
 import shutil
 
@@ -45,8 +44,6 @@
 
 
 def main(_):
-  tf.logging._handler.setFormatter(  # pylint: disable=protected-access
-      logging.Formatter("%(asctime)s:" + logging.BASIC_FORMAT, None))
   tf.logging.set_verbosity(tf.logging.INFO)
 
   model_dir = os.path.expanduser(FLAGS.model_dir)
@@ -56,7 +53,8 @@ def main(_):
   # Copy flags.txt with the original time, so t2t-bleu can report correct
   # relative time.
   tf.gfile.MakeDirs(FLAGS.output_dir)
-  if not os.path.exists(os.path.join(output_dir, "flags.txt")):
+  if (not os.path.exists(os.path.join(output_dir, "flags.txt")) and
+      os.path.exists(os.path.join(model_dir, "flags.txt"))):
     shutil.copy2(os.path.join(model_dir, "flags.txt"),
                  os.path.join(output_dir, "flags.txt"))
 

diff --git a/tensor2tensor/bin/t2t_bleu.py b/tensor2tensor/bin/t2t_bleu.py
@@ -113,8 +113,8 @@ def main(_):
   transl_dir = os.path.expanduser(FLAGS.translations_dir)
   if not os.path.exists(transl_dir):
     exit_time = time.time() + FLAGS.wait_minutes * 60
-    tf.logging.info("Translation dir %s does not exist, waiting till %s."
-                    % (transl_dir, time.asctime(time.localtime(exit_time))))
+    tf.logging.info("Translation dir %s does not exist, waiting till %s.",
+                    transl_dir, time.asctime(time.localtime(exit_time)))
     while not os.path.exists(transl_dir):
       time.sleep(10)
       if time.time() > exit_time:

diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py
@@ -37,7 +37,9 @@
 # Dependency imports
 
 from tensor2tensor.bin import t2t_trainer
+from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.utils import decoding
+from tensor2tensor.utils import registry
 from tensor2tensor.utils import trainer_lib
 from tensor2tensor.utils import usr_dir
 
@@ -59,6 +61,8 @@
 flags.DEFINE_bool("decode_interactive", False,
                   "Interactive local inference mode.")
 flags.DEFINE_integer("decode_shards", 1, "Number of decoding replicas.")
+flags.DEFINE_string("score_file", "", "File to score. Each line in the file "
+                    "must be in the format input \t target.")
 
 
 def create_hparams():
@@ -96,11 +100,80 @@ def decode(estimator, hparams, decode_hp):
         dataset_split="test" if FLAGS.eval_use_test_set else None)
 
 
+def score_file(filename):
+  """Score each line in a file and return the scores."""
+  # Prepare model.
+  hparams = create_hparams()
+  encoders = registry.problem(FLAGS.problems).feature_encoders(FLAGS.data_dir)
+  has_inputs = "inputs" in encoders
+
+  # Prepare features for feeding into the model.
+  if has_inputs:
+    inputs_ph = tf.placeholder(dtype=tf.int32)  # Just length dimension.
+    batch_inputs = tf.reshape(inputs_ph, [1, -1, 1, 1])  # Make it 4D.
+  targets_ph = tf.placeholder(dtype=tf.int32)  # Just length dimension.
+  batch_targets = tf.reshape(targets_ph, [1, -1, 1, 1])  # Make it 4D.
+  features = {
+      "inputs": batch_inputs,
+      "targets": batch_targets,
+  } if has_inputs else {"targets": batch_targets}
+
+  # Prepare the model and the graph when model runs on features.
+  model = registry.model(FLAGS.model)(hparams, tf.estimator.ModeKeys.EVAL)
+  _, losses = model(features)
+  saver = tf.train.Saver()
+
+  with tf.Session() as sess:
+    # Load weights from checkpoint.
+    ckpts = tf.train.get_checkpoint_state(FLAGS.output_dir)
+    ckpt = ckpts.model_checkpoint_path
+    saver.restore(sess, ckpt)
+    # Run on each line.
+    results = []
+    for line in open(filename):
+      tab_split = line.split("\t")
+      if len(tab_split) > 2:
+        raise ValueError("Each line must have at most one tab separator.")
+      if len(tab_split) == 1:
+        targets = tab_split[0].strip()
+      else:
+        targets = tab_split[1].strip()
+        inputs = tab_split[0].strip()
+      # Run encoders and append EOS symbol.
+      targets_numpy = encoders["targets"].encode(
+          targets) + [text_encoder.EOS_ID]
+      if has_inputs:
+        inputs_numpy = encoders["inputs"].encode(inputs) + [text_encoder.EOS_ID]
+      # Prepare the feed.
+      feed = {
+          inputs_ph: inputs_numpy,
+          targets_ph: targets_numpy
+      } if has_inputs else {targets_ph: targets_numpy}
+      # Get the score.
+      np_loss = sess.run(losses["training"], feed)
+      results.append(np_loss)
+  return results
+
+
 def main(_):
   tf.logging.set_verbosity(tf.logging.INFO)
+  trainer_lib.set_random_seed(FLAGS.random_seed)
   usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
   FLAGS.use_tpu = False  # decoding not supported on TPU
 
+  if FLAGS.score_file:
+    filename = os.path.expanduser(FLAGS.score_file)
+    if not tf.gfile.Exists(filename):
+      raise ValueError("The file to score doesn't exist: %s" % filename)
+    results = score_file(filename)
+    if not FLAGS.decode_to_file:
+      raise ValueError("To score a file, specify --decode_to_file for results.")
+    write_file = open(os.path.expanduser(FLAGS.decode_to_file), "w")
+    for score in results:
+      write_file.write("%.6f\n" % score)
+    write_file.close()
+    return
+
   hp = create_hparams()
   decode_hp = create_decode_hparams()
 

diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
@@ -178,6 +178,7 @@ def create_run_config(hp):
   save_ckpt_secs = FLAGS.save_checkpoints_secs or None
   if save_ckpt_secs:
     save_ckpt_steps = None
+  assert FLAGS.output_dir
   return trainer_lib.create_run_config(
       model_dir=os.path.expanduser(FLAGS.output_dir),
       master=FLAGS.master,

diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py
@@ -41,6 +41,7 @@
 from tensor2tensor.data_generators import problem_hparams
 from tensor2tensor.data_generators import ptb
 from tensor2tensor.data_generators import snli
+from tensor2tensor.data_generators import squad
 from tensor2tensor.data_generators import translate_encs
 from tensor2tensor.data_generators import translate_ende
 from tensor2tensor.data_generators import translate_enfr

diff --git a/tensor2tensor/data_generators/celeba.py b/tensor2tensor/data_generators/celeba.py
@@ -149,6 +149,43 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
         self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))
 
 
+@registry.register_problem
+class ImageCelebaMultiResolution(ImageCeleba):
+  """CelebA at multiple resolutions.
+
+  The resolutions are specified as a hyperparameter during preprocessing.
+  """
+
+  def dataset_filename(self):
+    return "image_celeba"
+
+  def preprocess_example(self, example, mode, hparams):
+    image = example["inputs"]
+    if hasattr(hparams, "resize_method"):
+      method = getattr(tf.image.ResizeMethod, hparams.resize_method)
+    else:  # default
+      method = tf.image.ResizeMethod.BICUBIC
+
+    # Remove boundaries in CelebA images. Remove 40 pixels each side
+    # vertically and 20 pixels each side horizontally.
+    image = tf.image.crop_to_bounding_box(image, 40, 20, 218 - 80, 178 - 40)
+
+    scaled_images = image_utils.make_multiscale(
+        image, hparams.resolutions,
+        resize_method=method, num_channels=self.num_channels)
+
+    # Pack tuple of scaled images into one tensor. We do this by enforcing the
+    # columns to match for every resolution.
+    highest_res = hparams.resolutions[-1]
+    example["inputs"] = image
+    example["targets"] = tf.concat([
+        tf.reshape(scaled_image,
+                   [res**2 // highest_res, highest_res, self.num_channels])
+        for scaled_image, res in zip(scaled_images, hparams.resolutions)],
+                                   axis=0)
+    return example
+
+
 @registry.register_problem
 class Img2imgCeleba(ImageCeleba):
   """8px to 32px problem."""

diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
@@ -19,7 +19,6 @@
 from __future__ import division
 from __future__ import print_function
 
-from collections import defaultdict
 import gzip
 import os
 import random
@@ -34,7 +33,6 @@
 import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3
 
 from tensor2tensor.data_generators import text_encoder
-from tensor2tensor.data_generators import tokenizer
 
 import tensorflow as tf
 
@@ -299,40 +297,41 @@ def gunzip_file(gz_path, new_path):
 
 
 def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
-                                generator):
+                                generator, max_subtoken_length=None,
+                                reserved_tokens=None):
   """Inner implementation for vocab generators.
 
   Args:
     data_dir: The base directory where data and vocab files are stored. If None,
-        then do not save the vocab even if it doesn't exist.
+      then do not save the vocab even if it doesn't exist.
     vocab_filename: relative filename where vocab file is stored
     vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
     generator: a generator that produces tokens from the vocabulary
+    max_subtoken_length: an optional integer.  Set this to a finite value to
+      avoid quadratic costs during vocab building.
+    reserved_tokens: List of reserved tokens. `text_encoder.RESERVED_TOKENS`
+      should be a prefix of `reserved_tokens`. If `None`, defaults to
+      `RESERVED_TOKENS`.
 
   Returns:
     A SubwordTextEncoder vocabulary object.
   """
-  if data_dir is None:
-    vocab_filepath = None
-  else:
+  if data_dir and vocab_filename:
     vocab_filepath = os.path.join(data_dir, vocab_filename)
-
-  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
-    tf.logging.info("Found vocab file: %s", vocab_filepath)
-    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
-    return vocab
+    if tf.gfile.Exists(vocab_filepath):
+      tf.logging.info("Found vocab file: %s", vocab_filepath)
+      return text_encoder.SubwordTextEncoder(vocab_filepath)
+  else:
+    vocab_filepath = None
 
   tf.logging.info("Generating vocab file: %s", vocab_filepath)
-  token_counts = defaultdict(int)
-  for item in generator:
-    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
-      token_counts[tok] += 1
-
-  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
-      vocab_size, token_counts, 1, 1e3)
+  vocab = text_encoder.SubwordTextEncoder.build_from_generator(
+      generator, vocab_size, max_subtoken_length=max_subtoken_length,
+      reserved_tokens=reserved_tokens)
 
-  if vocab_filepath is not None:
+  if vocab_filepath:
     vocab.store_to_file(vocab_filepath)
+
   return vocab
 
 
@@ -368,7 +367,6 @@ def generate():
             gunzip_file(filepath, new_filepath)
           filepath = new_filepath
 
-        # Use Tokenizer to count the word occurrences.
         with tf.gfile.GFile(filepath, mode="r") as source_file:
           file_byte_budget_ = file_byte_budget
           counter = 0