diff --git a/.travis.yml b/.travis.yml index 2ae1acf65..e83c02d0c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,11 +21,13 @@ matrix: - python: "3.6" env: TF_VERSION="1.7.*" before_install: - - echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list - - curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add - + # Disabled TensorFlow Serving install until bug fixed. See "Export and query" + # section below. + # - echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list + # - curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add - - sudo apt-get update -qq - sudo apt-get install -qq libhdf5-dev - - sudo apt-get install -qq tensorflow-model-server + # - sudo apt-get install -qq tensorflow-model-server install: - pip install -q "tensorflow==$TF_VERSION" - pip install -q .[tests] diff --git a/docs/cloud_tpu.md b/docs/cloud_tpu.md index d508aa125..2b81c19f5 100644 --- a/docs/cloud_tpu.md +++ b/docs/cloud_tpu.md @@ -18,6 +18,11 @@ See the official tutorial for [running Transformer on Cloud TPUs](https://cloud.google.com/tpu/docs/tutorials/transformer) for some examples and try out your own problems. +You can train an Automatic Speech Recognition (ASR) model with Transformer +on TPU by using `transformer` as `model` with `transformer_librispeech_tpu` as +`hparams_set` and `librispeech` as `problem`. 
See this [tutorial](tutorials/asr_with_transformer.md) for more details on training it and this +[notebook](https://colab.research.google.com/github/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/asr_transformer.ipynb) to see how the resulting model transcribes your speech to text. + Image Transformer: * `imagetransformer` with `imagetransformer_base_tpu` (or `imagetransformer_tiny_tpu`) diff --git a/docs/index.md b/docs/index.md index 2ffbb956d..58dffb134 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,10 +16,14 @@ accessible and [accelerate ML research](https://research.googleblog.com/2017/06/accelerating-deep-learning-research.html). -## Basics +## Introduction * [Walkthrough](walkthrough.md): Install and run. * [IPython notebook](https://colab.research.google.com/github/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/hello_t2t.ipynb): Get a hands-on experience. +* [Automatic Speech Recognition notebook](https://colab.research.google.com/github/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/asr_transformer.ipynb): Transcribe speech to text with a T2T model. + +## Basics + * [Overview](overview.md): How all parts of T2T code are connected. * [New Problem](new_problem.md): Train T2T models on your data. * [New Model](new_model.md): Create your own T2T model. 
@@ -29,6 +33,7 @@ research](https://research.googleblog.com/2017/06/accelerating-deep-learning-res * [Training on Google Cloud ML](cloud_mlengine.md) * [Training on Google Cloud TPUs](cloud_tpu.md) * [Distributed Training](distributed_training.md) +* [Automatic Speech Recognition (ASR) with Transformer](tutorials/asr_with_transformer.md) ## Solving your task diff --git a/docs/tutorials/asr_with_transformer.md b/docs/tutorials/asr_with_transformer.md index 728d85c4b..7a6ebe172 100644 --- a/docs/tutorials/asr_with_transformer.md +++ b/docs/tutorials/asr_with_transformer.md @@ -1,10 +1,13 @@ # Automatic Speech Recognition (ASR) with Transformer +Check out the [Automatic Speech Recognition notebook](https://colab.research.google.com/github/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/asr_transformer.ipynb) to see how the resulting model transcribes your speech to text. + ## Data set This tutorial uses the publicly available [Librispeech](http://www.openslr.org/12/) ASR corpus. + ## Generate the dataset To generate the dataset use `t2t-datagen`. 
You need to create environment diff --git a/setup.py b/setup.py index 6cb7841c4..0444de73f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.6.1', + version='1.6.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 87ce4df86..ab9891c8b 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -36,6 +36,7 @@ "tensor2tensor.data_generators.ice_parsing", "tensor2tensor.data_generators.imagenet", "tensor2tensor.data_generators.imdb", + "tensor2tensor.data_generators.lambada", "tensor2tensor.data_generators.librispeech", "tensor2tensor.data_generators.lm1b", "tensor2tensor.data_generators.mnist", diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py index 1d85d0ac3..01c83dfab 100644 --- a/tensor2tensor/data_generators/gym.py +++ b/tensor2tensor/data_generators/gym.py @@ -28,6 +28,7 @@ from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import video_utils +from tensor2tensor.models.research import autoencoders from tensor2tensor.models.research import rl from tensor2tensor.rl import collect from tensor2tensor.rl.envs import tf_atari_wrappers as atari @@ -42,7 +43,9 @@ flags = tf.flags FLAGS = flags.FLAGS -flags.DEFINE_string("agent_policy_path", "", "File with model for agent") + +flags.DEFINE_string("agent_policy_path", "", "File with model for agent.") +flags.DEFINE_string("autoencoder_path", "", "File with model for autoencoder.") class GymDiscreteProblem(video_utils.VideoProblem): @@ -179,6 +182,7 @@ class GymPongRandom50k(GymPongRandom5k): def num_steps(self): return 50000 + @registry.register_problem class GymFreewayRandom5k(GymDiscreteProblem): """Freeway game, random actions.""" @@ -209,7 +213,6 @@ def num_steps(self): return 50000 
-@registry.register_problem class GymDiscreteProblemWithAgent(GymDiscreteProblem): """Gym environment with discrete actions and rewards and an agent.""" @@ -239,7 +242,7 @@ def _setup(self): generator_batch_env = batch_env_factory( self.environment_spec, env_hparams, num_agents=1, xvfb=False) - with tf.variable_scope("", reuse=tf.AUTO_REUSE): + with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): if FLAGS.agent_policy_path: policy_lambda = self.collect_hparams.network else: @@ -252,7 +255,7 @@ def _setup(self): create_scope_now_=True, unique_name_="network") - with tf.variable_scope("", reuse=tf.AUTO_REUSE): + with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): self.collect_hparams.epoch_length = 10 _, self.collect_trigger_op = collect.define_collect( policy_factory, generator_batch_env, self.collect_hparams, @@ -267,6 +270,22 @@ def restore_networks(self, sess): tf.global_variables(".*network_parameters.*")) model_saver.restore(sess, FLAGS.agent_policy_path) + def autoencode(self, image, sess): + with tf.Graph().as_default(): + hparams = autoencoders.autoencoder_discrete_pong() + hparams.data_dir = "unused" + hparams.problem_hparams = self.get_hparams(hparams) + hparams.problem = self + model = autoencoders.AutoencoderOrderedDiscrete( + hparams, tf.estimator.ModeKeys.EVAL) + img = tf.constant(image) + img = tf.to_int32(tf.reshape( + img, [1, 1, self.frame_height, self.frame_width, self.num_channels])) + encoded = model.encode(img) + model_saver = tf.train.Saver(tf.global_variables()) + model_saver.restore(sess, FLAGS.autoencoder_path) + return sess.run(encoded) + def generate_encoded_samples(self, data_dir, tmp_dir, unused_dataset_split): self._setup() self.debug_dump_frames_path = os.path.join( @@ -275,17 +294,14 @@ def generate_encoded_samples(self, data_dir, tmp_dir, unused_dataset_split): with tf.Session() as sess: sess.run(tf.global_variables_initializer()) self.restore_networks(sess) - # Actions are shifted by 1 by 
MemoryWrapper, compensate here. - avilable_data_size = sess.run(self.avilable_data_size_op) - if avilable_data_size < 1: - sess.run(self.collect_trigger_op) pieces_generated = 0 - observ, reward, _, _ = sess.run(self.data_get_op) while pieces_generated < self.num_steps + self.warm_up: avilable_data_size = sess.run(self.avilable_data_size_op) if avilable_data_size < 1: sess.run(self.collect_trigger_op) - next_observ, next_reward, action, _ = sess.run(self.data_get_op) + observ, reward, action, _, img = sess.run(self.data_get_op) + if FLAGS.autoencoder_path: + observ = self.autoencode(img, sess) yield {"image/encoded": [observ], "image/format": ["png"], "image/height": [self.frame_height], @@ -294,7 +310,6 @@ def generate_encoded_samples(self, data_dir, tmp_dir, unused_dataset_split): "done": [int(False)], "reward": [int(reward) - self.min_reward]} pieces_generated += 1 - observ, reward = next_observ, next_reward @registry.register_problem @@ -318,20 +333,24 @@ def restore_networks(self, sess): @registry.register_problem -class GymSimulatedDiscreteProblemWithAgentOnPong(GymSimulatedDiscreteProblemWithAgent, GymPongRandom5k): +class GymSimulatedDiscreteProblemWithAgentOnPong( + GymSimulatedDiscreteProblemWithAgent, GymPongRandom5k): pass @registry.register_problem -class GymDiscreteProblemWithAgentOnPong(GymDiscreteProblemWithAgent, GymPongRandom5k): +class GymDiscreteProblemWithAgentOnPong( + GymDiscreteProblemWithAgent, GymPongRandom5k): pass @registry.register_problem -class GymSimulatedDiscreteProblemWithAgentOnFreeway(GymSimulatedDiscreteProblemWithAgent, GymFreewayRandom5k): +class GymSimulatedDiscreteProblemWithAgentOnFreeway( + GymSimulatedDiscreteProblemWithAgent, GymFreewayRandom5k): pass @registry.register_problem -class GymDiscreteProblemWithAgentOnFreeway(GymDiscreteProblemWithAgent, GymFreewayRandom5k): +class GymDiscreteProblemWithAgentOnFreeway( + GymDiscreteProblemWithAgent, GymFreewayRandom5k): pass diff --git 
a/tensor2tensor/data_generators/lambada.py b/tensor2tensor/data_generators/lambada.py new file mode 100644 index 000000000..c33d7c599 --- /dev/null +++ b/tensor2tensor/data_generators/lambada.py @@ -0,0 +1,377 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data generators for LAMBADA data-sets. + + +Lambada as a language modeling task: + https://arxiv.org/abs/1606.06031 + +Lambada as a reading comprehension task: + https://arxiv.org/abs/1610.08431 + For lambada as reading comprehension task, one can use the dataset that is + provided here: + http://ttic.uchicago.edu/~kgimpel/data/lambada-train-valid.tar.gz + In this dataset samples for which the target word is not in the context are + removed from the training data. 
+ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import csv +import os +import tarfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems +from tensor2tensor.utils import registry + +import tensorflow as tf + + +_UNK = "" + + +_TAR = "lambada-dataset.tar.gz" +_URL = "http://clic.cimec.unitn.it/lambada/" + _TAR +_VOCAB = "lambada-vocab-2.txt" + + +def _prepare_lambada_data(tmp_dir, data_dir, vocab_size, vocab_filename): + """Downloading and preparing the dataset. + + Args: + tmp_dir: temp directory + data_dir: data directory + vocab_size: size of vocabulary + vocab_filename: name of vocab file + + """ + + if not tf.gfile.Exists(data_dir): + tf.gfile.MakeDirs(data_dir) + + file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL) + tar_all = tarfile.open(file_path) + tar_all.extractall(tmp_dir) + tar_all.close() + tar_train = tarfile.open(os.path.join(tmp_dir, "train-novels.tar")) + tar_train.extractall(tmp_dir) + tar_train.close() + + vocab_path = os.path.join(data_dir, vocab_filename) + if not tf.gfile.Exists(vocab_path): + with tf.gfile.GFile(os.path.join(tmp_dir, _VOCAB), "r") as infile: + reader = csv.reader(infile, delimiter="\t") + words = [row[0] for row in reader] + words = [_UNK] + words[:vocab_size] + with tf.gfile.GFile(vocab_path, "w") as outfile: + outfile.write("\n".join(words)) + + +def get_dataset_split(tmp_dir, split, use_control_set): + """Gives the file paths with regards to the given split. + + Args: + tmp_dir: temp directory + split: dataset split + use_control_set: uses control dataset if true. + + Returns: + list of file paths. 
+ + """ + if not use_control_set: + dataset_split = { + problem.DatasetSplit.TRAIN: [ + f for f in tf.gfile.Glob( + os.path.join(tmp_dir, "train-novels/*/*.txt")) + ], + problem.DatasetSplit.EVAL: [ + os.path.join(tmp_dir, "lambada_development_plain_text.txt") + ], + problem.DatasetSplit.TEST: [ + os.path.join(tmp_dir, "lambada_test_plain_text.txt") + ] + } + + else: + dataset_split = { + problem.DatasetSplit.TRAIN: [ + f for f in tf.gfile.Glob( + os.path.join(tmp_dir, "train-novels/*/*.txt")) + ], + problem.DatasetSplit.EVAL: [ + os.path.join(tmp_dir, "lambada_control_test_data_plain_text.txt") + ], + } + + return dataset_split[split] + + +@registry.register_problem +class LambadaLm(text_problems.Text2SelfProblem): + """Lambada as language modeling task.""" + + @property + def is_generate_per_split(self): + """If true, a single call to generate_samples generates for a single split. + + Returns: + Boolean. + """ + return True + + @property + def dataset_splits(self): + """Splits of data to produce and number of output shards for each. + + Returns: + A dict containing splits information. + """ + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 10, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }, { + "split": problem.DatasetSplit.TEST, + "shards": 1, + }] + + @property + def vocab_type(self): + return text_problems.VocabType.TOKEN + + @property + def vocab_size(self): + # Similar to the setup of the main paper + return 60000 + + @property + def oov_token(self): + return _UNK + + @property + def use_control_set(self): + """If evaluate on control set.""" + return False + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + """Generates samples. 
+ + Args: + data_dir: data directory + tmp_dir: temp directory + dataset_split: dataset split + + Returns: + sample generator + + """ + _prepare_lambada_data(tmp_dir, data_dir, self.vocab_size, + self.vocab_filename) + files = get_dataset_split(tmp_dir, dataset_split, self.use_control_set) + + def _generate_samples(): + """sample generator. + + Yields: + A dict. + + """ + for filepath in files: + with tf.gfile.GFile(filepath, "r") as f: + for line in f: + line = " ".join(line.split()) + yield {"targets": line} + + return _generate_samples() + + +@registry.register_problem +class LambadaLmControl(LambadaLm): + """Lambada as language modeling task on control dataset.""" + + @property + def control_set(self): + """If test on control set.""" + return False + + +@registry.register_problem +class LambadaRc(text_problems.Text2ClassProblem): + """Lambada as reading comprehension task.""" + + @property + def is_generate_per_split(self): + """If true, a single call to generate_samples generates for a single split. + + Returns: + Boolean. + """ + return True + + @property + def dataset_splits(self): + """Splits of data to produce and number of output shards for each. + + Returns: + A dict containing splits information. + """ + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 10, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }, { + "split": problem.DatasetSplit.TEST, + "shards": 1, + }] + + @property + def vocab_type(self): + return text_problems.VocabType.TOKEN + + @property + def vocab_size(self): + # Similar to the setup of the main paper + return 60000 + + @property + def oov_token(self): + return _UNK + + @property + def use_control_set(self): + """If test on control set.""" + return False + + def get_labels_encoder(self, data_dir): + """Builds encoder for the given class labels. + + Args: + data_dir: data directory + + Returns: + An encoder for class labels. 
+ """ + label_filepath = os.path.join(data_dir, self.vocab_filename) + return text_encoder.TokenTextEncoder( + label_filepath, replace_oov=self.oov_token) + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + """Generates samples. + + Args: + data_dir: data directory + tmp_dir: temp directory + dataset_split: dataset split + + Returns: + sample generator + + """ + _prepare_lambada_data(tmp_dir, data_dir, self.vocab_size, + self.vocab_filename) + files = get_dataset_split(tmp_dir, dataset_split, self.use_control_set) + + def _generate_samples(): + """sample generator. + + Yields: + A dict. + + """ + for filepath in files: + with tf.gfile.GFile(filepath, "r") as f: + for line in f: + input_target = line.split() + yield { + "inputs": " ".join(input_target[:-1]), + "label": input_target[-1] + } + + return _generate_samples() + + def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): + """A generator that generates samples that are encoded. + + Args: + data_dir: data directory + tmp_dir: temp directory + dataset_split: dataset split + + Yields: + A dict. + + """ + generator = self.generate_samples(data_dir, tmp_dir, dataset_split) + txt_encoder = self.get_or_create_vocab(data_dir, tmp_dir) + label_encoder = self.get_labels_encoder(data_dir) + for sample in generator: + inputs = txt_encoder.encode(sample["inputs"]) + inputs.append(text_encoder.EOS_ID) + targets = label_encoder.encode(sample["label"]) + yield {"inputs": inputs, "targets": targets} + + def feature_encoders(self, data_dir): + """Return a dict for encoding and decoding inference input/output. + + Args: + data_dir: data directory + + Returns: + A dict mapping feature names to encoders. + + """ + txt_encoder = self.get_or_create_vocab(data_dir, None, force_get=True) + label_encoder = self.get_labels_encoder(data_dir) + return {"inputs": txt_encoder, "targets": label_encoder} + + def hparams(self, defaults, unused_model_hparams): + """Returns problem_hparams. 
+ + Args: + defaults: default hyperparameters + unused_model_hparams: model hyperparameters + + """ + + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + num_classes = self._encoders["targets"].vocab_size + p.input_modality = { + "inputs": (registry.Modalities.SYMBOL, source_vocab_size) + } + p.target_modality = (registry.Modalities.CLASS_LABEL, num_classes) + + +@registry.register_problem +class LambadaRcControl(LambadaRc): + """Lambada as reading comprehension task on control dataset.""" + + @property + def control_set(self): + """If test on control set.""" + return True diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py index 7f72d5afd..12f31667c 100644 --- a/tensor2tensor/data_generators/speech_recognition.py +++ b/tensor2tensor/data_generators/speech_recognition.py @@ -251,6 +251,7 @@ def hparams(self, defaults, model_hparams): p.add_hparam("audio_upper_edge_hertz", 8000.0) p.add_hparam("audio_num_mel_bins", 80) p.add_hparam("audio_add_delta_deltas", True) + p.add_hparam("num_zeropad_frames", 250) p = defaults # p.stop_at_eos = int(False) @@ -319,8 +320,9 @@ def preprocess_example(self, example, mode, hparams): # Later models like to flatten the two spatial dims. Instead, we add a # unit spatial dim and flatten the frequencies and channels. 
- example["inputs"] = tf.reshape( - mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]]) + example["inputs"] = tf.concat([ + tf.reshape(mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]]), + tf.zeros((p.num_zeropad_frames, fbank_size[2], fbank_size[3]))], 0) if not p.audio_keep_example_waveforms: del example["waveforms"] diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index e94b470de..2b0ad8854 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -1383,6 +1383,7 @@ def dot_product_attention(q, # [batch, num_heads, query_length, memory_length] logits = tf.matmul(q, k, transpose_b=True) if bias is not None: + bias = tf.cast(bias, logits.dtype) logits += bias weights = tf.nn.softmax(logits, name="attention_weights") if save_weights_to is not None: @@ -1800,6 +1801,7 @@ def local(x, depth): good_part = common_layers.ones_matrix_band_part(block_length, local_length, -1, block_length) mask = (1.0 - good_part) * -1e9 + mask = tf.cast(mask, attention.dtype) attention += tf.reshape(mask, [1, 1, 1, block_length, local_length]) attention = tf.nn.softmax(attention) # TODO(noam): figure out how to show a summary for the remaining blocks. 
diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index ec844b611..c28f4040c 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -610,6 +610,7 @@ def prepare_decoder(targets, hparams): x = tf.reshape(x, [targets_shape[0], x_shape[1], x_shape[2], hparams.hidden_size]) x = add_pos_signals(x, hparams, "dec_pos") + x = tf.cast(x, targets.dtype) return x, x_shape[1], x_shape[2] diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 46ef92091..ac91b6f65 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -546,7 +546,7 @@ def loss(self, logits, targets): logits, targets, self._model_hparams.label_smoothing, - cutoff=0.001, + cutoff=0.02, weights_fn=self.targets_weights_fn) diff --git a/tensor2tensor/models/basic.py b/tensor2tensor/models/basic.py index 1541d8cfb..bd4ca3306 100644 --- a/tensor2tensor/models/basic.py +++ b/tensor2tensor/models/basic.py @@ -30,6 +30,7 @@ @registry.register_model class BasicFcRelu(t2t_model.T2TModel): + """Basic fully-connected + ReLU model.""" def body(self, features): hparams = self.hparams @@ -49,6 +50,7 @@ class BasicAutoencoder(t2t_model.T2TModel): def __init__(self, *args, **kwargs): super(BasicAutoencoder, self).__init__(*args, **kwargs) + self.cur_bottleneck_tensor = None self.is1d = None def bottleneck(self, x): @@ -120,6 +122,7 @@ def body(self, features): x = self.encoder(x) # Bottleneck (mix during early training, not too important but stable). b = self.bottleneck(x) + self.cur_bottleneck_tensor = b b_loss = self.bottleneck_loss(b) b = self.unbottleneck(b, common_layers.shape_list(x)[-1]) b = common_layers.mix(b, x, hparams.bottleneck_warmup_steps, is_training) @@ -153,8 +156,13 @@ def sample(self): # Sample in [-1, 1] as the bottleneck is under tanh. 
return 2.0 * tf.random_uniform(size) - 1.0 - def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, - alpha=0.0): + def encode(self, x, *args, **kwargs): + """Auto-encode x and return the bottleneck.""" + features = {"targets": x} + self(features) # pylint: disable=not-callable + return self.cur_bottleneck_tensor + + def infer(self, features, *args, **kwargs): """Produce predictions from the model by sampling.""" # Inputs and features preparation needed to handle edge cases. if not features: diff --git a/tensor2tensor/models/image_transformer.py b/tensor2tensor/models/image_transformer.py index ed4a25692..b2a7d7ba9 100644 --- a/tensor2tensor/models/image_transformer.py +++ b/tensor2tensor/models/image_transformer.py @@ -46,7 +46,7 @@ def body(self, features): targets = features["targets"] if not (tf.get_variable_scope().reuse or hparams.mode == tf.contrib.learn.ModeKeys.INFER): - tf.summary.image("targets", targets, max_outputs=1) + tf.summary.image("targets", tf.to_float(targets), max_outputs=1) # Extra losses list if we want to use moe. 
losses = [] @@ -667,14 +667,16 @@ def update_hparams_for_tpu(hparams): @registry.register_hparams def imagetransformer_base_tpu(): - hparams = imagetransformer_base() + """Transformer base params for cifar-10.""" + hparams = imagetransformer_bas8l_8h_big_uncond_dr03_imgnet() update_hparams_for_tpu(hparams) hparams.batch_size = 4 hparams.num_heads = 4 # heads are expensive on tpu - hparams.hidden_size = 256 - hparams.filter_size = 512 - hparams.num_hidden_layers = 8 - hparams.sampling_method = "random" + hparams.num_decoder_layers = 12 + hparams.block_length = 128 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.layer_prepostprocess_dropout = 0.3 return hparams @@ -691,11 +693,16 @@ def imagetransformer_sep_channels_8l_tpu(): @registry.register_hparams def imagetransformer_b10l_4h_big_uncond_dr03_tpu(): + """Small model for tpu cifar 10.""" hparams = imagetransformer_bas8l_8h_big_uncond_dr03_imgnet() update_hparams_for_tpu(hparams) hparams.batch_size = 4 hparams.num_heads = 4 # heads are expensive on tpu hparams.num_decoder_layers = 10 + hparams.block_length = 128 + hparams.hidden_size = 256 + hparams.filter_size = 1024 + hparams.learning_rate = 0.2 hparams.layer_preprocess_sequence = "none" hparams.layer_postprocess_sequence = "dan" return hparams @@ -740,6 +747,8 @@ def imagetransformer_b12l_4h_big_uncond_dr03_tpu(): hparams.num_heads = 4 # heads are expensive on tpu hparams.num_decoder_layers = 12 hparams.block_length = 128 + hparams.hidden_size = 512 + hparams.filter_size = 1024 hparams.layer_preprocess_sequence = "none" hparams.layer_postprocess_sequence = "dan" hparams.layer_prepostprocess_dropout = 0.3 diff --git a/tensor2tensor/models/research/autoencoders.py b/tensor2tensor/models/research/autoencoders.py index aee732e42..f8cd0ee0b 100644 --- a/tensor2tensor/models/research/autoencoders.py +++ b/tensor2tensor/models/research/autoencoders.py @@ -20,6 +20,7 @@ # Dependency imports +from tensor2tensor.layers 
import common_attention from tensor2tensor.layers import common_layers from tensor2tensor.layers import discretization from tensor2tensor.models import basic @@ -226,6 +227,7 @@ def decoder(self, x): name="residual_%d" % r) x += tf.nn.dropout(y, 1.0 - hparams.residual_dropout) x = common_layers.layer_norm(x) + x = common_attention.add_timing_signal_nd(x) return x @@ -297,6 +299,9 @@ def sample(self): class AutoencoderOrderedDiscrete(AutoencoderResidualDiscrete): """Ordered discrete autoencoder.""" + def bottleneck_loss(self, unused_b): + return 0.0 + def bottleneck(self, x): hparams = self.hparams noise = hparams.bottleneck_noise @@ -418,7 +423,7 @@ def autoencoder_autoregressive(): """Autoregressive autoencoder model.""" hparams = basic.basic_autoencoder() hparams.add_hparam("autoregressive_forget_base", False) - hparams.add_hparam("autoregressive_mode", "conv3") + hparams.add_hparam("autoregressive_mode", "none") hparams.add_hparam("autoregressive_dropout", 0.4) hparams.add_hparam("autoregressive_decode_steps", 0) hparams.add_hparam("autoregressive_eval_pure_autoencoder", False) @@ -429,10 +434,10 @@ def autoencoder_autoregressive(): def autoencoder_residual(): """Residual autoencoder model.""" hparams = autoencoder_autoregressive() - hparams.optimizer = "Adam" - hparams.learning_rate_constant = 0.0001 + hparams.optimizer = "Adafactor" + hparams.learning_rate_constant = 0.2 hparams.learning_rate_warmup_steps = 500 - hparams.learning_rate_schedule = "constant * linear_warmup" + hparams.learning_rate_schedule = "constant * linear_warmup * rsqrt_decay" hparams.dropout = 0.05 hparams.num_hidden_layers = 5 hparams.hidden_size = 64 @@ -494,6 +499,17 @@ def autoencoder_ordered_discrete(): return hparams +@registry.register_hparams +def autoencoder_discrete_pong(): + """Discrete autoencoder model for compressing pong frames.""" + hparams = autoencoder_ordered_discrete() + hparams.bottleneck_size = 24 + hparams.dropout = 0.2 + hparams.batch_size = 2 + 
hparams.bottleneck_noise = 0.4 + return hparams + + @registry.register_hparams def autoencoder_stacked(): """Stacked autoencoder model.""" diff --git a/tensor2tensor/models/research/basic_conv_gen.py b/tensor2tensor/models/research/basic_conv_gen.py index 77729f0ca..91a137744 100644 --- a/tensor2tensor/models/research/basic_conv_gen.py +++ b/tensor2tensor/models/research/basic_conv_gen.py @@ -22,6 +22,7 @@ import six +from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry @@ -106,6 +107,7 @@ def body(self, features): shape = common_layers.shape_list(y) x = x[:, :shape[1], :shape[2], :] x = common_layers.layer_norm(x + y) + x = common_attention.add_timing_signal_nd(x) # Cut down to original size. x = x[:, :inputs_shape[1], :inputs_shape[2], :] @@ -167,14 +169,14 @@ def basic_conv(): hparams.batch_size = 8 hparams.num_hidden_layers = 2 hparams.optimizer = "Adafactor" - hparams.learning_rate_constant = 0.5 + hparams.learning_rate_constant = 1.5 hparams.learning_rate_warmup_steps = 1500 hparams.learning_rate_schedule = "linear_warmup * constant * rsqrt_decay" hparams.label_smoothing = 0.0 hparams.initializer = "uniform_unit_scaling" hparams.initializer_gain = 1.0 hparams.weight_decay = 0.0 - hparams.dropout = 0.2 + hparams.dropout = 0.5 hparams.add_hparam("num_compress_steps", 6) hparams.add_hparam("filter_double_steps", 5) return hparams diff --git a/tensor2tensor/models/research/r_transformer.py b/tensor2tensor/models/research/r_transformer.py index 4631a1441..4498afad0 100644 --- a/tensor2tensor/models/research/r_transformer.py +++ b/tensor2tensor/models/research/r_transformer.py @@ -212,6 +212,55 @@ def body(self, features): return decoder_output + def _greedy_infer(self, features, decode_length): + """Fast version of greedy decoding. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. 
How many additional timesteps to decode. + + Returns: + A dict of decoding results { + "outputs": integer `Tensor` of decoded ids of shape + [batch_size, <= decode_length] if beam_size == 1 or + [batch_size, top_beams, <= decode_length] + "scores": decoding log probs from the beam search, + None if using greedy decoding (beam_size=1) + } + + Raises: + NotImplementedError: If there are multiple data shards. + """ + with tf.variable_scope(self.name): + # TODO(dehghani): Support fast decoding for r-transofmer (needs caching) + return self._slow_greedy_infer(features, decode_length) + + def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha): + """Beam search decoding. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + beam_size: number of beams. + top_beams: an integer. How many of the beams to return. + alpha: Float that controls the length penalty. larger the alpha, stronger + the preference for longer translations. + + Returns: + A dict of decoding results { + "outputs": integer `Tensor` of decoded ids of shape + [batch_size, <= decode_length] if beam_size == 1 or + [batch_size, top_beams, <= decode_length] + "scores": decoding log probs from the beam search, + None if using greedy decoding (beam_size=1) + } + """ + with tf.variable_scope(self.name): + # Caching is not ebabled in r-transformer + # TODO(dehghani): Support fast decoding for r-transofmer(needs caching) + return self._beam_decode_slow(features, decode_length, beam_size, + top_beams, alpha) + @registry.register_model class RTransformerEncoder(transformer.Transformer): @@ -299,8 +348,11 @@ def update_hparams_for_r_transformer(hparams): hparams with default values for R-Transformers hyper-parameters """ + # If true, mixes vanilla transfomer with r-transformer. + hparams.add_hparam("mix_with_transformer", False) + # Type of recurrency: - # None(no-recurrency) basic, highway, skip, dwa, act, rnn, gru, lstm. 
+ # basic, highway, skip, dwa, act, rnn, gru, lstm. hparams.add_hparam("recurrence_type", "basic") # Number of steps (which is equivalent to num layer in transformer). @@ -850,3 +902,33 @@ def r_transformer_step_position_timing_base(): hparams.pos = None hparams.add_step_timing_signal = True return hparams + + +@registry.register_hparams +def r_mix_transformer_base(): + hparams = r_transformer_base() + hparams.mix_with_transformer = True + return hparams + + +@registry.register_hparams +def r_mix_transformer_act_step_position_timing_base(): + hparams = r_transformer_base() + hparams.mix_with_transformer = True + hparams.recurrence_type = "act" + hparams.add_position_timing_signal = True + hparams.pos = None + hparams.add_step_timing_signal = True + return hparams + + +@registry.register_hparams +def r_mix_transformer_act_step_position_random_timing_base(): + hparams = r_transformer_base() + hparams.mix_with_transformer = True + hparams.recurrence_type = "act" + hparams.add_position_timing_signal = True + hparams.pos = None + hparams.position_start_index = "random" + hparams.add_step_timing_signal = True + return hparams diff --git a/tensor2tensor/models/research/r_transformer_util.py b/tensor2tensor/models/research/r_transformer_util.py index bb6028414..d4363d7f2 100644 --- a/tensor2tensor/models/research/r_transformer_util.py +++ b/tensor2tensor/models/research/r_transformer_util.py @@ -201,6 +201,18 @@ def r_transformer_layer(x, hparams, ffn_unit, attention_unit, pad_remover=None): """ with tf.variable_scope("r_transformer_%s" % hparams.recurrence_type): + if hparams.mix_with_transformer: + + if hparams.add_position_timing_signal: + # In case of add_position_timing_signal=true, we set hparams.pos=None + # and add position timing signal at the beginning of each step, so for + # the vanilla transformer part, we need to add timing signal here. 
+ x = common_attention.add_timing_signal_1d(x) + + for layer in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % layer): + x = ffn_unit(attention_unit(x)) + if hparams.recurrence_type == "act": return r_transformer_act(x, hparams, ffn_unit, attention_unit) diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index b1cca4792..1943528c2 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -18,7 +18,6 @@ from __future__ import division from __future__ import print_function -import copy import functools import math @@ -150,10 +149,6 @@ def decode_transformer(encoder_output, causal=True): """Original Transformer decoder.""" orig_hparams = hparams - if name == "extra": - hparams = hparams.ex - targets = tf.layers.dense( - targets, hparams.hidden_size, name="extra_tgt_embed") with tf.variable_scope(name): if task is None: task = hparams.task @@ -331,17 +326,6 @@ def ae_transformer_internal(inputs, if hparams.do_refine: _DO_SUMMARIES = False - # Change hyperparameters for the latent prediction model. - hparams_ex = copy.copy(hparams) - hparams_ex.filter_size *= 2 - hparams_ex.hidden_size *= 2 - hparams_ex.dropout = 0.0 - hparams_ex.relu_dropout = 0.0 - hparams_ex.z_dropout = 0.0 - hparams_ex.layer_prepostprocess_dropout = 0.0 - hparams_ex.symbol_dropout = 0.0 - hparams.ex = hparams_ex - # Prepare. if inputs is not None: batch_size = common_layers.shape_list(inputs)[0] @@ -352,10 +336,8 @@ def ae_transformer_internal(inputs, # Encoder. 
if inputs is not None: inputs = common_layers.flatten4d3d(inputs) - inputs_ex = tf.layers.dense( - tf.stop_gradient(inputs), hparams_ex.hidden_size, name="extra_embed") inputs, ed = encode(inputs, target_space, hparams, "input_enc") - inputs_ex, ed_ex = encode(inputs_ex, target_space, hparams_ex, "extra_ienc") + inputs_ex, ed_ex = inputs, ed else: ed, inputs_ex, ed_ex = None, None, None @@ -394,7 +376,7 @@ def ae_transformer_internal(inputs, if hparams.bottleneck_kind not in ["dense", "vae"]: latents_pred = decode_transformer( inputs_ex, ed_ex, - tf.stop_gradient(embed(latents_discrete)), hparams, "extra", + embed(latents_discrete), hparams, "extra", task="translate") _, latent_pred_loss = ae_latent_softmax( latents_pred, tf.stop_gradient(latents_discrete), hparams) @@ -440,6 +422,14 @@ def bn_inputs(): latents_dense = tf.pad(latents_dense, [[0, 0], [1, 0], [0, 0], [0, 0]]) + pos + # decompressing the dense latents + for i in range(hparams.num_compress_steps): + j = hparams.num_compress_steps - i - 1 + d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j) + if hparams.do_attend_decompress: + d = attend(d, inputs, hparams, "decompress_attend_%d" % j) + d = decompress_step(d, hparams, i > 0, False, "decompress_%d" % j) + # Masking. 
if hparams.do_mask: masking = common_layers.inverse_lin_decay(hparams.mask_startup_steps) @@ -455,12 +445,7 @@ def bn_inputs(): mask = tf.less(masking, tf.random_uniform( common_layers.shape_list(targets)[:-1])) mask = tf.expand_dims(tf.to_float(mask), 3) - for i in range(hparams.num_compress_steps): - j = hparams.num_compress_steps - i - 1 - d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j) - if hparams.do_attend_decompress: - d = attend(d, inputs, hparams, "decompress_attend_%d" % j) - d = decompress_step(d, hparams, i > 0, False, "decompress_%d" % j) + # targets is always [batch, length, 1, depth] targets = mask * targets + (1.0 - mask) * d # reshape back to 4d here @@ -487,10 +472,7 @@ def refine_res(): nonlatent_steps = hparams.mask_startup_steps latent_time = tf.less(nonlatent_steps, tf.to_int32(tf.train.get_global_step())) - # Learning rate warmup for the latent model for 20K steps. - latent_warmup = tf.to_float(tf.train.get_global_step()) - nonlatent_steps - latent_warmup = tf.maximum(0.0, tf.minimum(1.0, latent_warmup / 20000.0)) - losses["latent_pred"] *= tf.to_float(latent_time) * latent_warmup + losses["latent_pred"] *= tf.to_float(latent_time) return res, losses, cache diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 137829195..57b7477c6 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -290,7 +290,8 @@ def _fast_decode(self, if target_modality.is_class_modality: decode_length = 1 else: - decode_length = common_layers.shape_list(inputs)[1] + decode_length + decode_length = (common_layers.shape_list(inputs)[1] + + features.get("decode_length", decode_length)) # TODO(llion): Clean up this reshaping logic. 
inputs = tf.expand_dims(inputs, axis=1) @@ -327,7 +328,8 @@ def _fast_decode(self, partial_targets = tf.to_int64(partial_targets) partial_targets_shape = common_layers.shape_list(partial_targets) partial_targets_length = partial_targets_shape[1] - decode_length += partial_targets_length + decode_length = (partial_targets_length + + features.get("decode_length", decode_length)) batch_size = partial_targets_shape[0] if hparams.pos == "timing": @@ -412,7 +414,8 @@ def forced_logits(): beam_size=beam_size, top_beams=top_beams, alpha=alpha, - batch_size=batch_size) + batch_size=batch_size, + force_decode_length=self._decode_hparams.force_decode_length) if partial_targets is not None: if beam_size <= 1 or top_beams <= 1: ret["outputs"] = ret["outputs"][:, partial_targets_length:] @@ -431,7 +434,8 @@ def fast_decode(encoder_output, top_beams=1, alpha=1.0, eos_id=beam_search.EOS_ID, - batch_size=None): + batch_size=None, + force_decode_length=False): """Given encoder output and a symbols to logits function, does fast decoding. Implements both greedy and beam search decoding, uses beam search iff @@ -452,6 +456,8 @@ def fast_decode(encoder_output, the preference for longer translations. eos_id: End-of-sequence symbol in beam search. batch_size: an integer scalar - must be passed if there is no input + force_decode_length: bool, whether to force the full decode length, or if + False, stop when all beams hit eos_id. 
Returns: A dict of decoding results { @@ -505,14 +511,14 @@ def fast_decode(encoder_output, scores = scores[:, :top_beams] else: # Greedy - def inner_loop(i, finished, next_id, decoded_ids, cache, log_prob): + def inner_loop(i, hit_eos, next_id, decoded_ids, cache, log_prob): """One step of greedy decoding.""" logits, cache = symbols_to_logits_fn(next_id, i, cache) log_probs = common_layers.log_prob_from_logits(logits) temperature = (0.0 if hparams.sampling_method == "argmax" else hparams.sampling_temp) next_id = common_layers.sample_with_temperature(logits, temperature) - finished |= tf.equal(next_id, eos_id) + hit_eos |= tf.equal(next_id, eos_id) log_prob_indices = tf.stack( [tf.range(tf.to_int64(batch_size)), next_id], axis=1) @@ -520,19 +526,22 @@ def inner_loop(i, finished, next_id, decoded_ids, cache, log_prob): next_id = tf.expand_dims(next_id, axis=1) decoded_ids = tf.concat([decoded_ids, next_id], axis=1) - return i + 1, finished, next_id, decoded_ids, cache, log_prob + return i + 1, hit_eos, next_id, decoded_ids, cache, log_prob - def is_not_finished(i, finished, *_): - return (i < decode_length) & tf.logical_not(tf.reduce_all(finished)) + def is_not_finished(i, hit_eos, *_): + finished = i >= decode_length + if not force_decode_length: + finished |= tf.reduce_all(hit_eos) + return tf.logical_not(finished) decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64) - finished = tf.fill([batch_size], False) + hit_eos = tf.fill([batch_size], False) next_id = tf.zeros([batch_size, 1], dtype=tf.int64) initial_log_prob = tf.zeros([batch_size], dtype=tf.float32) _, _, _, decoded_ids, _, log_prob = tf.while_loop( is_not_finished, inner_loop, [ - tf.constant(0), finished, next_id, decoded_ids, cache, + tf.constant(0), hit_eos, next_id, decoded_ids, cache, initial_log_prob ], shape_invariants=[ @@ -1094,6 +1103,7 @@ def transformer_base_v2(): @registry.register_hparams def transformer_base(): + """Base parameters for Transformer model.""" # Update parameters here, 
then occasionally cut a versioned set, e.g. # transformer_base_v2. hparams = transformer_base_v2() @@ -1607,8 +1617,8 @@ def transformer_lm_tpu_1(): @registry.register_hparams -def transformer_librispeech(): - """HParams for training ASR model on Librispeech.""" +def transformer_librispeech_v1(): + """HParams for training ASR model on LibriSpeech V1.""" hparams = transformer_base() hparams.num_heads = 4 @@ -1624,16 +1634,64 @@ def transformer_librispeech(): @registry.register_hparams -def transformer_librispeech_tpu(): - """HParams for training ASR model on Librispeech on TPU.""" - hparams = transformer_librispeech() +def transformer_librispeech_v2(): + """HParams for training ASR model on LibriSpeech V2.""" + hparams = transformer_base() + + hparams.max_length = 1240000 + hparams.max_input_seq_length = 1550 + hparams.max_target_seq_length = 350 + hparams.batch_size = 16 + hparams.num_decoder_layers = 4 + hparams.num_encoder_layers = 6 + hparams.hidden_size = 384 + hparams.learning_rate = 0.15 + hparams.daisy_chain_variables = False + hparams.filter_size = 1536 + hparams.num_heads = 2 + hparams.ffn_layer = "conv_relu_conv" + hparams.conv_first_kernel = 9 + hparams.weight_decay = 0 + hparams.layer_prepostprocess_dropout = 0.2 + hparams.relu_dropout = 0.2 + + return hparams + + +@registry.register_hparams +def transformer_librispeech_tpu_v1(): + """HParams for training ASR model on Librispeech on TPU v1.""" + hparams = transformer_librispeech_v1() update_hparams_for_tpu(hparams) - hparams.batch_size = 32 + hparams.batch_size = 16 librispeech.set_librispeech_length_hparams(hparams) return hparams +@registry.register_hparams +def transformer_librispeech_tpu_v2(): + """HParams for training ASR model on Librispeech on TPU v2.""" + hparams = transformer_librispeech_v2() + update_hparams_for_tpu(hparams) + + hparams.batch_size = 16 + librispeech.set_librispeech_length_hparams(hparams) + return hparams + + +@registry.register_hparams +def transformer_librispeech(): + 
"""HParams for training ASR model on Librispeech.""" + return transformer_librispeech_v2() + + +@registry.register_hparams +def transformer_librispeech_tpu(): + """HParams for training ASR model on Librispeech on TPU.""" + return transformer_librispeech_tpu_v2() + + @registry.register_hparams def transformer_supervised_attention(): """HParams for supervised attention problems.""" diff --git a/tensor2tensor/notebooks/asr_transformer.ipynb b/tensor2tensor/notebooks/asr_transformer.ipynb new file mode 100644 index 000000000..b53653abe --- /dev/null +++ b/tensor2tensor/notebooks/asr_transformer.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "7tB9m_fw9Xkl" + }, + "outputs": [], + "source": [ + "!pip install -qq tensorflow\n", + "!pip install -qq tensor2tensor\n", + "!pip install -qq pydub\n", + "!apt-get -qq update\n", + "!apt-get -qq install -y ffmpeg\n", + "!apt-get -qq install -y sox" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "hF_ZmvGjEyJd" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "import collections\n", + "import base64\n", + "import cStringIO\n", + "import pydub\n", + "import shutil\n", + "from scipy.io import wavfile\n", + "\n", + "import IPython\n", + "import google.colab\n", + "\n", + "from tensor2tensor import models\n", + "from tensor2tensor import problems\n", + "from tensor2tensor.layers import common_layers\n", + "from tensor2tensor.utils import trainer_lib\n", + "from tensor2tensor.utils import t2t_model\n", + "from tensor2tensor.utils import registry\n", + "from tensor2tensor.utils import metrics\n", + "\n", + "# Enable TF Eager 
execution\n", + "from tensorflow.contrib.eager.python import tfe\n", + "tfe.enable_eager_execution()\n", + "\n", + "# Other setup\n", + "Modes = tf.estimator.ModeKeys\n", + "\n", + "# Setup some directories\n", + "data_dir = os.path.expanduser(\"~/t2t/data\")\n", + "tmp_dir = os.path.expanduser(\"~/t2t/tmp\")\n", + "train_dir = os.path.expanduser(\"~/t2t/train\")\n", + "checkpoint_dir = os.path.expanduser(\"~/t2t/checkpoints\")\n", + "tf.gfile.MakeDirs(data_dir)\n", + "tf.gfile.MakeDirs(tmp_dir)\n", + "tf.gfile.MakeDirs(train_dir)\n", + "tf.gfile.MakeDirs(checkpoint_dir)\n", + "\n", + "gs_ckpt_dir = \"gs://tensor2tensor-checkpoints/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LwPvdJJ4xN6y" + }, + "source": [ + "\n", + "### Define problem, hparams, model, encoder and decoder" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "hH0FEHhDIGjM" + }, + "outputs": [], + "source": [ + "problem_name = \"librispeech_clean\"\n", + "asr_problem = problems.problem(problem_name)\n", + "encoders = asr_problem.feature_encoders(None)\n", + "\n", + "model_name = \"transformer\"\n", + "hparams_set = \"transformer_librispeech_tpu\"\n", + "\n", + "hparams = trainer_lib.create_hparams(hparams_set,data_dir=data_dir, problem_name=problem_name)\n", + "asr_model = registry.model(model_name)(hparams, Modes.PREDICT)\n", + "\n", + "def encode(x):\n", + " waveforms = encoders[\"waveforms\"].encode(x)\n", + " encoded_dict = asr_problem.preprocess_example({\"waveforms\":waveforms, \"targets\":[]}, Modes.PREDICT, hparams)\n", + " \n", + " return {\"inputs\" : tf.expand_dims(encoded_dict[\"inputs\"], 0), \"targets\" : tf.expand_dims(encoded_dict[\"targets\"], 0)}\n", + "\n", + "def decode(integers):\n", + " integers = list(np.squeeze(integers))\n", + " if 1 in integers:\n", + " integets = 
integers[:integers.index(1)]\n", + " return encoders[\"targets\"].decode(np.squeeze(integers))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pGhUGptixYBd" + }, + "source": [ + "### Define path to checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "p9D8OJdFezsJ" + }, + "outputs": [], + "source": [ + "# Copy the pretrained checkpoint locally\n", + "ckpt_name = \"transformer_asr_180214\"\n", + "gs_ckpt = os.path.join(gs_ckpt_dir, ckpt_name)\n", + "print(gs_ckpt)\n", + "!gsutil cp -R {gs_ckpt} {checkpoint_dir} \n", + "ckpt_path = tf.train.latest_checkpoint(os.path.join(checkpoint_dir, ckpt_name))\n", + "ckpt_path" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "arS1sXFPxvde" + }, + "source": [ + "### Define transcribe function" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "od7ZPT3wfkZs" + }, + "outputs": [], + "source": [ + "# Restore and transcribe!\n", + "def transcribe(inputs):\n", + " encoded_inputs = encode(inputs)\n", + " with tfe.restore_variables_on_create(ckpt_path): \n", + " model_output = asr_model.infer(encoded_inputs, beam_size=2, alpha=0.6, decode_length=1)[\"outputs\"]\n", + " return decode(model_output)\n", + "\n", + "def play_and_transcribe(inputs):\n", + " waveforms = encoders[\"waveforms\"].encode(inputs)\n", + " IPython.display.display(IPython.display.Audio(data=waveforms, rate=16000))\n", + " return transcribe(inputs) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Qz5u2O5LvShm" + }, + "source": [ + "# Decoding prerecorded examples\n", + "\n", + "You can upload any .wav files. 
They will be transcribed if frame rate matches Librispeeche's frame rate (16000)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "xAstJTeyvXMf" + }, + "outputs": [], + "source": [ + "uploaded = google.colab.files.upload()\n", + "prerecorded_messages = []\n", + "\n", + "for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + " mem_file = cStringIO.StringIO(uploaded[fn])\n", + " \n", + " save_filename = os.path.join(tmp_dir, fn)\n", + " with open(save_filename, 'w') as fd:\n", + " mem_file.seek(0)\n", + " shutil.copyfileobj(mem_file, fd)\n", + " prerecorded_messages.append(save_filename)\n", + " \n", + " \n", + "for inputs in prerecorded_messages:\n", + " outputs = play_and_transcribe(inputs)\n", + "\n", + " print(\"Inputs: %s\" % inputs)\n", + " print(\"Outputs: %s\" % outputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mJvRjlHUrr65" + }, + "source": [ + "# Recording your own examples" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "oirqsdqVoElk" + }, + "outputs": [], + "source": [ + "# Records webm file and converts\n", + "def RecordNewAudioSample(filename=None, webm_filename=None):\n", + " \"\"\"Args:\n", + " filename - string, path for storing wav file\n", + " webm_filename - string, path for storing webm file\n", + " Returns:\n", + " string - path where wav file was saved. 
(=filename if specified)\n", + " \n", + " \"\"\"\n", + " # Create default filenames in tmp_dir if not specified.\n", + " if not filename:\n", + " filename = os.path.join(tmp_dir, \"recording.wav\")\n", + " if not webm_filename:\n", + " webm_filename = os.path.join(tmp_dir, \"recording.webm\")\n", + " \n", + " # Record webm file form colab.\n", + " \n", + " audio = google.colab._message.blocking_request('user_media', {\"audio\":True, \"video\":False, \"duration\":-1}, timeout_sec=600)\n", + " #audio = frontend.RecordMedia(True, False)\n", + " \n", + " # Convert the recording into in_memory file.\n", + " music_mem_file = cStringIO.StringIO(\n", + " base64.decodestring(audio[audio.index(',')+1:]))\n", + " \n", + " # Store webm recording in webm_filename. Storing is necessary for conversion.\n", + " with open(webm_filename, 'w') as fd:\n", + " music_mem_file.seek(0)\n", + " shutil.copyfileobj(music_mem_file, fd)\n", + " \n", + " # Open stored file and save it as wav with sample_rate=16000.\n", + " pydub.AudioSegment.from_file(webm_filename, codec=\"opus\"\n", + " ).set_frame_rate(16000).export(out_f=filename,\n", + " format=\"wav\")\n", + " return filename" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "90BjliFTCQm9" + }, + "outputs": [], + "source": [ + "# Record the sample\n", + "my_sample_filename = RecordNewAudioSample()\n", + "print my_sample_filename" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "PdBfEik0-pMv" + }, + "outputs": [], + "source": [ + "print play_and_transcribe(my_sample_filename)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "default_view": {}, + "name": "ASR with Transformer example notebook", + "provenance": [ + { + 
"file_id": "notebooks/SR_with_Transformer_example_notebook.ipynb", + "timestamp": 1525703542020 + }, + { + "file_id": "1hEMwW8LgaQPLngfka0tbobYB-ZTVqy34", + "timestamp": 1525702247248 + }, + { + "file_id": "1Pp4aSAceJRNpxtSrTevUKpHKudMxHyBF", + "timestamp": 1518630927690 + } + ], + "version": "0.3.2", + "views": {} + }, + "kernelspec": { + "display_name": "Python 2", + "name": "python2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensor2tensor/rl/envs/simulated_batch_env.py b/tensor2tensor/rl/envs/simulated_batch_env.py index aecf2be20..75d38acc1 100644 --- a/tensor2tensor/rl/envs/simulated_batch_env.py +++ b/tensor2tensor/rl/envs/simulated_batch_env.py @@ -21,18 +21,13 @@ from __future__ import division from __future__ import print_function -import os - # Dependency imports -import gym - from tensor2tensor.rl.envs.in_graph_batch_env import InGraphBatchEnv from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib import tensorflow as tf -import numpy as np flags = tf.flags @@ -61,14 +56,19 @@ def __init__(self, environment_lambda, length): self.action_shape = list(initalization_env.action_space.shape) self.action_dtype = tf.int32 - obs_1 = initalization_env.reset() + initalization_env.reset() + skip_frames = 20 + for _ in range(skip_frames): + initalization_env.step(0) + obs_1 = initalization_env.step(0)[0] obs_2 = initalization_env.step(0)[0] self.frame_1 = tf.expand_dims(tf.cast(obs_1, tf.float32), 0) self.frame_2 = tf.expand_dims(tf.cast(obs_2, tf.float32), 0) shape = (self.length,) + initalization_env.observation_space.shape - # TODO(blazej0) - make more generic - make higher number of previous observations possible. + # TODO(blazej0) - make more generic - make higher number of + # previous observations possible. 
self._observ = tf.Variable(tf.zeros(shape, tf.float32), trainable=False) self._prev_observ = tf.Variable(tf.zeros(shape, tf.float32), trainable=False) @@ -91,6 +91,7 @@ def simulate(self, action): model_output = self._model.infer(inputs) observ = model_output["targets"] observ = tf.cast(observ[:, 0, :, :, :], tf.float32) + # TODO(lukaszkaiser): instead of -1 use min_reward in the line below. reward = model_output["target_reward"][:, 0, 0, 0] - 1 reward = tf.cast(reward, tf.float32) done = tf.constant(False, tf.bool, shape=(self.length,)) diff --git a/tensor2tensor/rl/envs/tf_atari_wrappers.py b/tensor2tensor/rl/envs/tf_atari_wrappers.py index 83b9a9ae7..2e88fb8b4 100644 --- a/tensor2tensor/rl/envs/tf_atari_wrappers.py +++ b/tensor2tensor/rl/envs/tf_atari_wrappers.py @@ -172,16 +172,16 @@ def __init__(self, batch_env): assert self._length == 1, "We support only one environment" infinity = 10000000 self.speculum = tf.FIFOQueue(infinity, dtypes=[ - tf.string, tf.float32, tf.int32, tf.bool]) + tf.string, tf.float32, tf.int32, tf.bool, tf.uint8]) self._observ = self._batch_env.observ def simulate(self, action): with tf.name_scope("environment/simulate"): # Do we need this? 
reward, done = self._batch_env.simulate(action) - encoded_image = tf.image.encode_png( - tf.cast(self._batch_env.observ[0, ...], tf.uint8)) + image = tf.cast(self._batch_env.observ[0, ...], tf.uint8) + encoded_image = tf.image.encode_png(image) with tf.control_dependencies([reward, done]): enqueue_op = self.speculum.enqueue( - [encoded_image, reward, action, done]) + [encoded_image, reward, action, done, image]) with tf.control_dependencies([enqueue_op]): return tf.identity(reward), tf.identity(done) diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py index 08e2d07fc..396ed88ad 100644 --- a/tensor2tensor/rl/envs/utils.py +++ b/tensor2tensor/rl/envs/utils.py @@ -275,7 +275,7 @@ def _worker(self, constructor, conn): break raise KeyError("Received message of unknown type {}".format(message)) except Exception: # pylint: disable=broad-except - stacktrace = "".join(traceback.format_exception(*sys.exc_info())) + stacktrace = "".join(traceback.format_exception(*sys.exc_info())) # pylint: disable=no-value-for-parameter tf.logging.error("Error in environment process: {}".format(stacktrace)) conn.send((self._EXCEPTION, stacktrace)) conn.close() @@ -307,5 +307,6 @@ def define_batch_env(constructor, num_agents, xvfb=False): def define_simulated_batch_env(environment_lambda, num_agents): - cur_batch_env = simulated_batch_env.SimulatedBatchEnv(environment_lambda, num_agents) + cur_batch_env = simulated_batch_env.SimulatedBatchEnv( + environment_lambda, num_agents) return cur_batch_env diff --git a/tensor2tensor/rl/model_rl_experiment.py b/tensor2tensor/rl/model_rl_experiment.py index bfc5075d3..067d9d420 100644 --- a/tensor2tensor/rl/model_rl_experiment.py +++ b/tensor2tensor/rl/model_rl_experiment.py @@ -51,7 +51,7 @@ def train(hparams, output_dir): time_delta = time.time() - start_time print(line+"Step {}.1. - generate data from policy. 
" "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - FLAGS.problem = "gym_discrete_problem_with_agent_on_{}".format(hparams.game) + FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game FLAGS.agent_policy_path = last_model gym_problem = registry.problem(FLAGS.problem) gym_problem.settable_num_steps = hparams.true_env_generator_num_steps @@ -76,7 +76,7 @@ def train(hparams, output_dir): print(line+"Step {}.3. - evalue env model. " "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) gym_simulated_problem = registry.problem( - "gym_simulated_discrete_problem_with_agent_on_{}".format(hparams.game)) + "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game) sim_steps = hparams.simulated_env_generator_num_steps gym_simulated_problem.settable_num_steps = sim_steps gym_simulated_problem.generate_data(iter_data_dir, tmp_dir) @@ -108,12 +108,12 @@ def train(hparams, output_dir): def main(_): hparams = tf.contrib.training.HParams( epochs=10, - true_env_generator_num_steps=10000, + true_env_generator_num_steps=50000, generative_model="basic_conv_gen", generative_model_params="basic_conv", - model_train_steps=25000, + model_train_steps=50000, simulated_env_generator_num_steps=300, - ppo_epochs_num=200, + ppo_epochs_num=2000, ppo_epoch_length=300, game="pong", ) diff --git a/tensor2tensor/rl/rl_trainer_lib.py b/tensor2tensor/rl/rl_trainer_lib.py index 4ede6438e..f2286dc37 100644 --- a/tensor2tensor/rl/rl_trainer_lib.py +++ b/tensor2tensor/rl/rl_trainer_lib.py @@ -34,6 +34,8 @@ import tensorflow as tf + + def define_train(hparams, environment_spec, event_dir): """Define the training setup.""" policy_lambda = hparams.network @@ -42,7 +44,7 @@ def define_train(hparams, environment_spec, event_dir): environment_spec = lambda: gym.make("PongNoFrameskip-v4") wrappers = hparams.in_graph_wrappers if hasattr( hparams, "in_graph_wrappers") else [] - wrappers.append((tf_atari_wrappers.MaxAndSkipEnv, {"skip": 4})) + 
wrappers.append((tf_atari_wrappers.MaxAndSkipWrapper, {"skip": 4})) hparams.in_graph_wrappers = wrappers if isinstance(environment_spec, str): env_lambda = lambda: gym.make(environment_spec) @@ -56,10 +58,11 @@ def define_train(hparams, environment_spec, event_dir): "network", functools.partial(policy_lambda, batch_env.action_space, hparams)) - memory, collect_summary = collect.define_collect( - policy_factory, batch_env, hparams, eval_phase=False) - ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) - summary = tf.summary.merge([collect_summary, ppo_summary]) + with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): + memory, collect_summary = collect.define_collect( + policy_factory, batch_env, hparams, eval_phase=False) + ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) + summary = tf.summary.merge([collect_summary, ppo_summary]) with tf.variable_scope("eval", reuse=tf.AUTO_REUSE): eval_env_lambda = env_lambda diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 2856c76ad..ce917b0cd 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -51,8 +51,9 @@ def decode_hparams(overrides=""): max_input_size=-1, identity_output=False, num_samples=-1, - delimiter="\n") - hp = hp.parse(overrides) + delimiter="\n", + force_decode_length=False) + hp.parse(overrides) return hp @@ -378,6 +379,7 @@ def input_fn(): def _decode_batch_input_fn(num_decode_batches, sorted_inputs, vocabulary, batch_size, max_input_size): + """Generator to produce batches of inputs.""" tf.logging.info(" batch %d" % num_decode_batches) # First reverse all the input sentences so that if you're going to get OOMs, # you'll see it in the first batch diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 54d4aef0b..2a6f4fd5a 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -280,10 +280,13 @@ def bottom(self, features): 
target_modality = self._problem_hparams.target_modality if isinstance(target_modality, dict): for k, v in six.iteritems(target_modality): - with tf.variable_scope( - "%s/%s" % (v.name, k)): # TODO(aidangomez): share variables? - log_info("Transforming '%s' with %s.targets_bottom", k, v.name) - transformed_features[k] = v.targets_bottom(features[k]) + if k in features: + with tf.variable_scope( + "%s/%s" % (v.name, k)): # TODO(aidangomez): share variables? + log_info("Transforming '%s' with %s.targets_bottom", k, v.name) + transformed_features[k] = v.targets_bottom(features[k]) + else: + tf.logging.warn("Modality not found in features: %s", k) else: with tf.variable_scope(target_modality.name): if "targets" in features: diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index ddc2da83e..4d19518d9 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -180,6 +180,7 @@ def create_estimator(model_name, schedule="train_and_evaluate", decode_hparams=None, use_tpu=False): + """Create a T2T Estimator.""" model_fn = t2t_model.T2TModel.make_estimator_model_fn( model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) diff --git a/tensor2tensor/visualization/visualization_test.py b/tensor2tensor/visualization/visualization_test.py index c40204ed2..b3b0b7ce4 100644 --- a/tensor2tensor/visualization/visualization_test.py +++ b/tensor2tensor/visualization/visualization_test.py @@ -43,7 +43,7 @@ def get_data_dir(): problem_name = 'translate_ende_wmt32k' model_name = 'transformer' -hparams_set = 'transformer_base_single_gpu' +hparams_set = 'transformer_tiny' class VisualizationTest(tf.test.TestCase):