From 739d3f295310c8b05aed5f325e67a7f8ad32300e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 19 Feb 2020 22:52:00 +0100 Subject: [PATCH 01/25] Added per-field custom datatype support --- takepod/datasets/iterator.py | 96 +++++++++++++---------------------- takepod/storage/field.py | 50 +++++++++++++++--- test/storage/test_field.py | 20 ++++---- test/storage/test_iterator.py | 67 ++++++++++-------------- 4 files changed, 115 insertions(+), 118 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index 7c2a215d..ddafed76 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -26,7 +26,6 @@ class Iterator: def __init__(self, dataset=None, batch_size=32, - batch_to_matrix=True, sort_key=None, shuffle=False, seed=1, @@ -84,7 +83,6 @@ def __init__(self, """ self.batch_size = batch_size - self.batch_to_matrix = batch_to_matrix self.shuffle = shuffle @@ -238,71 +236,51 @@ def set_internal_random_state(self, state): def _create_batch(self, examples): - if self.batch_to_matrix: - return self._create_matrix_batch(examples) - - else: - return self._create_list_batch(examples) - - def _create_matrix_batch(self, examples): - # dicts that will be used to create the InputBatch and TargetBatch # objects input_batch_dict, target_batch_dict = {}, {} for field in self._dataset.fields: - # the length to which all the rows are padded (or truncated) - pad_length = Iterator._get_pad_length(field, examples) - - # the last batch can have < batch_size examples - n_rows = min(self.batch_size, len(examples)) - - # empty matrix to be filled with numericalized fields - matrix = None # np.empty(shape=(n_rows, pad_length)) - - # non-sequential fields all have length = 1, no padding necessary - should_pad = True if field.sequential else False + if field.is_numericalizable: + # the length to which all the rows are padded (or truncated) + pad_length = Iterator._get_pad_length(field, examples) - for i, example in enumerate(examples): + # the last batch can have < batch_size examples + n_rows = min(self.batch_size, len(examples)) - # Get cached value - row = field.get_numericalization_for_example(example) + # empty matrix to be filled with numericalized fields + matrix = None # np.empty(shape=(n_rows, pad_length)) - if matrix is None: - # Create matrix of the correct dtype - matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) + # non-sequential fields all have length = 1, no padding necessary + should_pad = True if field.is_sequential else False - if should_pad: - row = field.pad_to_length(row, pad_length) + for i, example in enumerate(examples): - # set the matrix row to the numericalized, padded array - matrix[i] = row + # Get cached value + row = field.get_numericalization_for_example(example) - if field.is_target: - target_batch_dict[field.name] = matrix - else: - input_batch_dict[field.name] = matrix + if matrix is None: + # Create matrix of the correct dtype + matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) - input_batch = self.input_batch_class(**input_batch_dict) - target_batch = self.target_batch_class(**target_batch_dict) + if should_pad: + row = field.pad_to_length(row, pad_length) - return input_batch, target_batch + # set the matrix row to the numericalized, padded array + matrix[i] = row - def _create_list_batch(self, examples): - # dicts that will be used to create the InputBatch and TargetBatch - # objects - input_batch_dict, target_batch_dict = {}, {} - for field in self._dataset.fields: + 
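+                # every row is now numericalized and padded to the same
+                # length, so the batch can be exposed as one dense matrix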
batch_feature = matrix - vectors = [field.get_numericalization_for_example(ex) - for ex - in examples] + else: + # if the field is not representable as a matrix return a list of + # "tokens", which can be any data structure + batch_feature = [field.get_numericalization_for_example(example) + for example in examples] if field.is_target: - target_batch_dict[field.name] = vectors - + target_batch_dict[field.name] = batch_feature else: - input_batch_dict[field.name] = vectors + input_batch_dict[field.name] = batch_feature input_batch = self.input_batch_class(**input_batch_dict) target_batch = self.target_batch_class(**target_batch_dict) @@ -311,7 +289,7 @@ def _create_list_batch(self, examples): @staticmethod def _get_pad_length(field, examples): - if not field.sequential: + if not field.is_sequential: return 1 # the fixed_length attribute of Field has priority over the max length @@ -360,8 +338,7 @@ class SingleBatchIterator(Iterator): def __init__( self, - dataset: Dataset = None, - batch_to_matrix: bool = True): + dataset: Dataset = None): """Creates an Iterator that creates one batch per epoch containing all examples in the dataset. @@ -375,8 +352,7 @@ def __init__( returned as a list of numpy vectors or a matrix where each row is a padded vector. """ - super().__init__(dataset=dataset, - batch_to_matrix=batch_to_matrix) + super().__init__(dataset=dataset) def set_dataset(self, dataset: Dataset): super().set_dataset(dataset) @@ -405,7 +381,6 @@ def __init__( self, dataset, batch_size, - batch_to_matrix=True, sort_key=None, shuffle=True, seed=42, @@ -441,14 +416,13 @@ def __init__( """ if sort_key is None and bucket_sort_key is None: - error_msg = "For BucketIterator to work, either sort_key or "\ + error_msg = "For BucketIterator to work, either sort_key or " \ "bucket_sort_key must be != None." _LOGGER.error(error_msg) raise ValueError(error_msg) super().__init__(dataset, batch_size, - batch_to_matrix=batch_to_matrix, sort_key=sort_key, shuffle=shuffle, seed=seed) @@ -574,14 +548,14 @@ def __init__( """ if context_max_length is not None and context_max_length < 1: - error_msg = "'context_max_length' must not be less than 1. "\ - "If you don't want context, try flattening the dataset. "\ + error_msg = "'context_max_length' must not be less than 1. " \ + "If you don't want context, try flattening the dataset. " \ "'context_max_length' : {})".format(context_max_length) _LOGGER.error(error_msg) raise ValueError(error_msg) if context_max_depth is not None and context_max_depth < 0: - error_msg = "'context_max_depth' must not be negative. "\ + error_msg = "'context_max_depth' must not be negative. 
" \ "'context_max_depth' : {}".format(context_max_depth) _LOGGER.error(error_msg) raise ValueError(error_msg) @@ -664,7 +638,7 @@ def _create_batch(self, nodes): matrix = np.empty(shape=(n_rows, pad_length)) # non-sequential fields all have length = 1, no padding necessary - should_pad = True if field.sequential else False + should_pad = True if field.is_sequential else False for i, example in enumerate(node_context_examples): # Get cached value diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 36f67577..4f24474d 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -189,6 +189,7 @@ def __init__(self, store_as_raw=True, store_as_tokenized=False, eager=True, + is_numericalizable=True, custom_numericalize=None, is_target=False, fixed_length=None, @@ -238,6 +239,20 @@ def __init__(self, eager : bool Whether to build the vocabulary online, each time the field preprocesses raw data. + is_numericalizable : bool + Whether the output of tokenizer can be numericalized. + + If true, the output of the tokenizer is presumed to be a list of tokens and + will be numericalized using the provided Vocab or custom_numericalize. + For numericalizable fields, Iterator will generate batch fields containing + numpy matrices. + + If false, the out of the tokenizer is presumed to be a custom datatype. + Posttokenization hooks aren't allowed to be added as they can't be called + on custom datatypes. For non-numericalizable fields, Iterator will generate + batch fields containing lists of these custom data type instances returned + by the tokenizer. + custom_numericalize : callable The numericalization function that will be called if the field doesn't use a vocabulary. @@ -268,6 +283,7 @@ def __init__(self, self.name = name self.language = language self._tokenizer_arg = tokenizer + self.is_numericalizable = is_numericalizable if store_as_tokenized and tokenize: error_msg = "Store_as_tokenized' and 'tokenize' both set to True." \ @@ -290,7 +306,15 @@ def __init__(self, _LOGGER.error(error_msg) raise ValueError(error_msg) - self.sequential = store_as_tokenized or tokenize + if not is_numericalizable \ + and (custom_numericalize is not None or vocab is not None): + error_msg = "Field that is not numericalizable can't have " \ + "custom_numericalize or vocab." + + _LOGGER.error(error_msg) + raise ValueError(error_msg) + + self.is_sequential = (store_as_tokenized or tokenize) and is_numericalizable self.store_as_raw = store_as_raw self.tokenize = tokenize self.store_as_tokenized = store_as_tokenized @@ -373,6 +397,11 @@ def add_posttokenize_hook(self, hook): hook : callable The post-tokenization hook that we want to add to the field. """ + if not self.is_numericalizable: + error_msg = "Field is declared as non numericalizable. Posttokenization " \ + "hooks aren't used in such fields." + _LOGGER.error(error_msg) + raise ValueError(error_msg) self.posttokenize_pipeline.add_hook(hook) @@ -517,7 +546,8 @@ def _process_tokens(self, data, tokens): the data and tokens processed by posttokenization hooks. """ - data, tokens = self._run_posttokenization_hooks(data, tokens) + if self.is_numericalizable: + data, tokens = self._run_posttokenization_hooks(data, tokens) if self.eager and self.use_vocab and not self.vocab.finalized: self.update_vocab(data, tokens) @@ -558,7 +588,7 @@ def get_default_value(self): empty numpy array if the field is sequential or numpy array with one None value otherwise. 
""" - if self.sequential: + if self.is_sequential: return np.empty(0) return np.array([np.nan]) @@ -595,7 +625,11 @@ def numericalize(self, data): # raw data is just a string, so we need to wrap it into an iterable tokens = tokenized if self.tokenize or self.store_as_tokenized else [raw] - return self._numericalize_tokens(tokens) + if self.is_numericalizable: + return self._numericalize_tokens(tokens) + + else: + return tokens def pad_to_length(self, row, length, custom_pad_symbol=None, pad_left=False, truncate_left=False): @@ -717,8 +751,8 @@ def __setstate__(self, state): self.tokenizer = get_tokenizer(self._tokenizer_arg, self.language) def __str__(self): - return "{}[name: {}, sequential: {}, is_target: {}]".format( - self.__class__.__name__, self.name, self.sequential, self.is_target) + return "{}[name: {}, is_sequential: {}, is_target: {}]".format( + self.__class__.__name__, self.name, self.is_sequential, self.is_target) def get_output_fields(self): """Returns an Iterable of the contained output fields. @@ -839,8 +873,8 @@ def finalize(self): if self.use_vocab and len(self.vocab) > self.num_of_classes: error_msg = "Number of classes in data is greater than the declared number " \ - "of classes. Declared: {}, Actual: {}".format( - self.num_of_classes, len(self.vocab)) + "of classes. Declared: {}, Actual: {}"\ + .format(self.num_of_classes, len(self.vocab)) _LOGGER.error(error_msg) raise ValueError(error_msg) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index 68fa1db9..c19d3ca7 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -73,7 +73,7 @@ def test_field_preprocess_eager(): @pytest.mark.parametrize( - "value, store_raw, sequential, expected_raw_value, " + "value, store_raw, is_sequential, expected_raw_value, " "expected_tokenized_value", [ ("some text", True, True, "some text", ["some", "text"]), @@ -81,10 +81,10 @@ def test_field_preprocess_eager(): ("some text", False, True, None, ["some", "text"]), ] ) -def test_field_preprocess_raw_sequential(value, store_raw, sequential, +def test_field_preprocess_raw_sequential(value, store_raw, is_sequential, expected_raw_value, expected_tokenized_value): - f = Field(name="F", store_as_raw=store_raw, tokenize=sequential) + f = Field(name="F", store_as_raw=store_raw, tokenize=is_sequential) (_, (received_raw_value, received_tokenized_value)), = f.preprocess(value) @@ -93,7 +93,7 @@ def test_field_preprocess_raw_sequential(value, store_raw, sequential, @pytest.mark.parametrize( - "value, store_raw, sequential, expected_raw_value, " + "value, store_raw, is_sequential, expected_raw_value, " "expected_tokenized_value", [ ("some text", True, True, "some text", ["some", "text"]), @@ -101,10 +101,10 @@ def test_field_preprocess_raw_sequential(value, store_raw, sequential, ("some text", False, True, None, ["some", "text"]), ] ) -def test_field_pickle_tokenized(value, store_raw, sequential, +def test_field_pickle_tokenized(value, store_raw, is_sequential, expected_raw_value, expected_tokenized_value, tmpdir): - fld = Field(name="F", store_as_raw=store_raw, tokenize=sequential) + fld = Field(name="F", store_as_raw=store_raw, tokenize=is_sequential) (_, (received_raw_value, received_tokenized_value)), = fld.preprocess(value) @@ -124,7 +124,7 @@ def test_field_pickle_tokenized(value, store_raw, sequential, assert tokenized_value == expected_tokenized_value assert loaded_fld.name == "F" assert loaded_fld.store_as_raw == store_raw - assert loaded_fld.sequential == sequential + assert loaded_fld.is_sequential 
== is_sequential @pytest.mark.parametrize( @@ -141,7 +141,7 @@ def test_field_use_vocab(vocab, expected_value): @pytest.mark.parametrize( - "use_vocab, sequential, expected_vocab_values", + "use_vocab, is_sequential, expected_vocab_values", [ (False, False, []), (False, True, []), @@ -149,10 +149,10 @@ def test_field_use_vocab(vocab, expected_value): (True, True, ["some", "text"]), ] ) -def test_field_update_vocab(use_vocab, sequential, expected_vocab_values): +def test_field_update_vocab(use_vocab, is_sequential, expected_vocab_values): vocab = MockVocab() f = Field(name="F", vocab=vocab if use_vocab else None, - tokenize=sequential) + tokenize=is_sequential) raw_value = "some text" tokenized_value = ["some", "text"] diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 86fb0ba4..7a174e99 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -1,5 +1,4 @@ import random -import copy from test.storage.conftest import ( create_tabular_dataset_from_json, tabular_dataset_fields, TABULAR_TEXT) @@ -115,6 +114,34 @@ def test_create_batch(tabular_dataset): assert y_batch.rating.shape[0] == batch_size +@pytest.mark.usefixtures("json_file_path") +def test_not_numericalizable_field(json_file_path): + class MockCustomDataClass: + + def __init__(self, data): + self.data = data + + def custom_datatype_tokenizer(data): + return MockCustomDataClass(data) + + fields = tabular_dataset_fields() + text_field = fields['text'] + non_numericalizable_field = Field("non_numericalizable_field", + tokenizer=custom_datatype_tokenizer, + is_numericalizable=False) + + fields['text'] = (text_field, non_numericalizable_field) + + dataset = create_tabular_dataset_from_json(fields, json_file_path) + dataset.finalize_fields() + + for x_batch, _ in Iterator(dataset, batch_size=len(dataset)): + assert isinstance(x_batch.non_numericalizable_field, (list, tuple)) + for batch_data, real_data in zip(x_batch.non_numericalizable_field, TABULAR_TEXT): + assert isinstance(batch_data, MockCustomDataClass) + assert batch_data.data == real_data + + @pytest.mark.usefixtures("tabular_dataset") def test_lazy_numericalization_caching(tabular_dataset): tabular_dataset.finalize_fields() @@ -339,44 +366,6 @@ def np_arrays_equal(arr_1, arr_2): return arrs_equal -@pytest.mark.usefixtures("tabular_dataset") -def test_batch_as_vector_list(tabular_dataset): - tabular_dataset.finalize_fields() - text_vocab = tabular_dataset.field_dict["text"].vocab - - # case where we have both input and target fields - iterator = Iterator(tabular_dataset, batch_size=3, batch_to_matrix=False) - - example_index = 0 - for x_batch, y_batch in iterator: - assert isinstance(x_batch.text, list) - assert isinstance(y_batch.rating, list) - - for x, y in zip(x_batch.text, y_batch.rating): - example = tabular_dataset[example_index] - assert all(x == text_vocab.numericalize(example.text[1])) - assert y == [example.rating[0]] - example_index += 1 - - # case where we have only input fields - tabular_dataset = copy.deepcopy(tabular_dataset) - tabular_dataset.field_dict["rating"].is_target = False - - iterator = Iterator(tabular_dataset, batch_size=3, batch_to_matrix=False) - - example_index = 0 - for x_batch, y_batch in iterator: - assert isinstance(x_batch.text, list) - assert isinstance(x_batch.rating, list) - assert not y_batch - - for example_text, example_rating in zip(x_batch.text, x_batch.rating): - example = tabular_dataset[example_index] - assert all(example_text == text_vocab.numericalize(example.text[1])) - assert 
example_rating == [example.rating[0]] - example_index += 1 - - @pytest.fixture() def hierarchical_dataset_fields(): name_field = Field(name="name", store_as_raw=True, tokenize=False, vocab=Vocab()) From 7c6740c63ec1f99f887df4b60203f333036300ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Thu, 20 Feb 2020 18:59:30 +0100 Subject: [PATCH 02/25] WIP: TfIdfVectorizer update pending --- takepod/datasets/iterator.py | 25 ++++++++++++++---- takepod/examples/ner_example.py | 2 +- takepod/storage/field.py | 35 ++++++++++++++++++------- takepod/storage/vocab.py | 9 +++---- test/storage/conftest.py | 7 ++--- test/storage/test_field.py | 46 ++++++++++++++++++++++++++------- test/storage/test_iterator.py | 33 +++++++++++++++++------ test/storage/test_vocab.py | 4 +-- 8 files changed, 119 insertions(+), 42 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index ddafed76..eb924593 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -242,6 +242,8 @@ def _create_batch(self, examples): for field in self._dataset.fields: if field.is_numericalizable: + # If this field is numericalizable, generate a possibly padded matrix + # the length to which all the rows are padded (or truncated) pad_length = Iterator._get_pad_length(field, examples) @@ -257,15 +259,24 @@ def _create_batch(self, examples): for i, example in enumerate(examples): # Get cached value - row = field.get_numericalization_for_example(example) + data = field.get_numericalization_for_example(example) + + if data is None: + # If data is missing, fill row with missing data symbol indexes + missing_data_symbol_index = field.get_default_value() + # TODO cache missing data + # row for batch to avoid multiple instantiations? + row = np.full(pad_length, missing_data_symbol_index) + + else: + row = data + if should_pad: + row = field.pad_to_length(row, pad_length) if matrix is None: # Create matrix of the correct dtype matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) - if should_pad: - row = field.pad_to_length(row, pad_length) - # set the matrix row to the numericalized, padded array matrix[i] = row @@ -301,7 +312,11 @@ def _get_pad_length(field, examples): # examples in the batch def length_of_field(example): _, tokens = getattr(example, field.name) - return len(tokens) + if tokens is None: + # missing data + return 1 + else: + return len(tokens) return max(map(length_of_field, examples)) diff --git a/takepod/examples/ner_example.py b/takepod/examples/ner_example.py index d71143bc..eb80ab52 100644 --- a/takepod/examples/ner_example.py +++ b/takepod/examples/ner_example.py @@ -127,7 +127,7 @@ def ner_croatian_blcc_example(fields, dataset, batch_transform_function): x_test, y_test = batch_transform_function(*next(test_iter.__iter__())) prediction = model.predict(X=x_test)[BLCCModel.PREDICTION_KEY] - pad_symbol = fields['labels'].vocab.pad_symbol() + pad_symbol = fields['labels'].vocab.pad_symbol_index() prediction_filtered, y_test_filtered = filter_out_padding( pad_symbol, prediction, diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 4f24474d..b92d16ef 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -6,6 +6,7 @@ import numpy as np from takepod.preproc.tokenizers import get_tokenizer +from takepod.storage.vocab import SpecialVocabSymbols _LOGGER = logging.getLogger(__name__) @@ -584,14 +585,25 @@ def get_default_value(self): Returns ------- - missing : iterable - empty numpy array if the field is sequential or 
numpy array with one + missing_symbol index or None + The index of the missing data token, if this field is numericalizable. None value otherwise. + + Raises + ------ + ValueError + If missing data is not allowed in this field. """ - if self.is_sequential: - return np.empty(0) + if not self.allow_missing_data: + error_msg = "Missing data not allowed in field {}".format(self.name) + _LOGGER.error(error_msg) + raise ValueError(error_msg) - return np.array([np.nan]) + if self.is_numericalizable: + return -1 + + else: + return None def numericalize(self, data): """Numericalize the already preprocessed data point based either on @@ -608,8 +620,13 @@ def numericalize(self, data): Returns ------- numpy array - Array of stoi indexes of the tokens. + Array of stoi indexes of the tokens, if data exists. + None, if data is missing and missing data is allowed. + Raises + ------ + ValueError + If data is None and missing data is not allowed in this field. """ raw, tokenized = data @@ -620,7 +637,7 @@ def numericalize(self, data): raise ValueError(error_msg) else: - return self.get_default_value() + return None # raw data is just a string, so we need to wrap it into an iterable tokens = tokenized if self.tokenize or self.store_as_tokenized else [raw] @@ -673,7 +690,7 @@ def pad_to_length(self, row, length, custom_pad_symbol=None, # padding if self.use_vocab: - pad_symbol = self.vocab.pad_symbol() + pad_symbol = self.vocab.pad_symbol_index() else: pad_symbol = custom_pad_symbol @@ -873,7 +890,7 @@ def finalize(self): if self.use_vocab and len(self.vocab) > self.num_of_classes: error_msg = "Number of classes in data is greater than the declared number " \ - "of classes. Declared: {}, Actual: {}"\ + "of classes. Declared: {}, Actual: {}" \ .format(self.num_of_classes, len(self.vocab)) _LOGGER.error(error_msg) raise ValueError(error_msg) diff --git a/takepod/storage/vocab.py b/takepod/storage/vocab.py index 58baf35c..f699abf4 100644 --- a/takepod/storage/vocab.py +++ b/takepod/storage/vocab.py @@ -174,22 +174,21 @@ def get_freqs(self): raise RuntimeError(error_msg) return self._freqs - def pad_symbol(self): + def pad_symbol_index(self): """Method returns padding symbol index. Returns ------- pad_symbol_index : int - padding symbol index in the vocabullary + padding symbol index in the vocabulary Raises ------ ValueError - if the padding symbol is not pressent in the vocabulary + if the padding symbol is not present in the vocabulary. """ if SpecialVocabSymbols.PAD not in self.stoi: - error_msg = "Padding symbol is not in the vocabulary so" \ - " pad_symbol function raises exception." + error_msg = "Padding symbol is not in the vocabulary." 
_LOGGER.error(error_msg) raise ValueError(error_msg) return self.stoi[SpecialVocabSymbols.PAD] diff --git a/test/storage/conftest.py b/test/storage/conftest.py index fb02ea3b..c9ede150 100644 --- a/test/storage/conftest.py +++ b/test/storage/conftest.py @@ -60,7 +60,8 @@ def tabular_dataset(json_file_path): @pytest.fixture() def tabular_dataset_fields(fixed_length=None): - text = Field('text', eager=True, vocab=Vocab(), fixed_length=fixed_length) + text = Field('text', eager=True, vocab=Vocab(), + fixed_length=fixed_length, allow_missing_data=True) rating = Field('rating', tokenize=False, eager=False, is_target=True, custom_numericalize=float) @@ -73,10 +74,10 @@ def tabular_dataset_fields(fixed_length=None): "a b c", "a", "a b c d", - "a", + None, "d b", "d c g", - "b b b b b b", + "b b b b b b" ) TABULAR_RATINGS = (2.5, 3.2, 1.1, 2.1, 5.4, 2.8, 1.9) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index c19d3ca7..9db153b2 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -34,7 +34,7 @@ def __init__(self): self.finalized = False self.numericalized = False - def pad_symbol(self): + def pad_symbol_index(self): return PAD_NUM def __add__(self, values): @@ -618,12 +618,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): @@ -688,8 +688,8 @@ def test_missing_values_default_sequential(): assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")])) -def test_missing_values_default_not_sequential(): - fld = Field(name="bla", +def test_missing_values_custom_numericalize(): + fld = Field(name="test_field", store_as_raw=True, tokenize=False, custom_numericalize=int, @@ -703,11 +703,39 @@ def test_missing_values_default_not_sequential(): fld.finalize() - assert np.allclose(fld.numericalize(data_missing), np.array([np.nan]), - equal_nan=True) + assert fld.numericalize(data_missing) is None assert np.all(fld.numericalize(data_exists) == np.array([404])) +def test_missing_symbol_index_vocab(): + vocab = Vocab() + fld = Field(name="test_field", + tokenizer='split', + store_as_raw=False, + tokenize=True, + vocab=vocab, + allow_missing_data=True) + + fld.preprocess("a b c d") + ((_, data),) = fld.preprocess(None) + assert data == (None, None) + + fld.finalize() + assert fld.numericalize((None, None)) is None + assert fld.get_default_value() == -1 + + +def test_missing_symbol_index_custom_numericalize(): + fld = Field(name="test_field", + store_as_raw=True, + tokenize=False, + custom_numericalize=int, + allow_missing_data=True) + + fld.finalize() + assert fld.get_default_value() == -1 + + def test_missing_values_fail(): fld = Field(name="bla", store_as_raw=True, diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 7a174e99..622a6dca 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -29,7 +29,7 @@ def test_len(batch_size, expected_len, tabular_dataset): iterator = Iterator(dataset=tabular_dataset, batch_size=batch_size) - assert expected_len == len(iterator) + assert len(iterator) == expected_len @pytest.mark.parametrize( @@ -57,9 +57,13 @@ def test_padding(fixed_length, expected_shape, json_file_path): assert 
input_batch.text.shape == expected_shape - pad_symbol = fields["text"].vocab.pad_symbol() + pad_symbol = fields["text"].vocab.pad_symbol_index() for i, row in enumerate(input_batch.text): + if TABULAR_TEXT[i] is None: + # if missing data + continue + n_el = len(TABULAR_TEXT[i].split()) assert (row[:n_el].astype(np.int32) != pad_symbol).all() @@ -128,7 +132,8 @@ def custom_datatype_tokenizer(data): text_field = fields['text'] non_numericalizable_field = Field("non_numericalizable_field", tokenizer=custom_datatype_tokenizer, - is_numericalizable=False) + is_numericalizable=False, + allow_missing_data=True) fields['text'] = (text_field, non_numericalizable_field) @@ -137,9 +142,12 @@ def custom_datatype_tokenizer(data): for x_batch, _ in Iterator(dataset, batch_size=len(dataset)): assert isinstance(x_batch.non_numericalizable_field, (list, tuple)) - for batch_data, real_data in zip(x_batch.non_numericalizable_field, TABULAR_TEXT): - assert isinstance(batch_data, MockCustomDataClass) - assert batch_data.data == real_data + for i, batch_data, real_data in zip(range(len(dataset)), x_batch.non_numericalizable_field, TABULAR_TEXT): + if i == 3: + assert batch_data is None + else: + assert isinstance(batch_data, MockCustomDataClass) + assert batch_data.data == real_data @pytest.mark.usefixtures("tabular_dataset") @@ -171,7 +179,10 @@ def test_sort_key(tabular_dataset): def text_len_sort_key(example): tokens = example.text[1] - return len(tokens) + if tokens is None: + return 0 + else: + return len(tokens) iterator = Iterator(dataset=tabular_dataset, batch_size=2, sort_key=text_len_sort_key, shuffle=False) @@ -283,8 +294,14 @@ def test_shuffle_random_state_exception(tabular_dataset): def text_len_key(example): - return len(example.text[1]) + if example.text[1] is None: + return 0 + else: + return len(example.text[1]) + +def test_iterator_missing_data_in_batch(): + pass @pytest.mark.parametrize( "look_ahead_multiplier, expected_row_lengths, bucket_sort_key, sort_key", diff --git a/test/storage/test_vocab.py b/test/storage/test_vocab.py index a6c18213..c8c74f34 100644 --- a/test/storage/test_vocab.py +++ b/test/storage/test_vocab.py @@ -100,7 +100,7 @@ def test_empty_specials_get_pad_symbol(): voc = vocab.Vocab(specials=[]) voc.finalize() with pytest.raises(ValueError): - voc.pad_symbol() + voc.pad_symbol_index() def test_empty_specials_stoi(): @@ -116,7 +116,7 @@ def test_specials_get_pad_symbol(): voc = vocab.Vocab(specials=(vocab.SpecialVocabSymbols.PAD,)) data = ["tree", "plant", "grass"] voc = (voc + set(data)) - assert voc.pad_symbol() == 0 + assert voc.pad_symbol_index() == 0 voc.finalize() assert voc.itos[0] == vocab.SpecialVocabSymbols.PAD From 553220630c4f452c97342ee9357546987a08b49b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 24 Feb 2020 12:22:22 +0100 Subject: [PATCH 03/25] Added option to define custom missing data symbol --- takepod/datasets/tabular_dataset.py | 1 + takepod/storage/field.py | 14 +++++++++----- test/storage/test_field.py | 2 +- test/storage/test_iterator.py | 19 ++++++++++++++++--- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/takepod/datasets/tabular_dataset.py b/takepod/datasets/tabular_dataset.py index 2a58d922..3013fa03 100644 --- a/takepod/datasets/tabular_dataset.py +++ b/takepod/datasets/tabular_dataset.py @@ -85,6 +85,7 @@ def __init__(self, path, format, fields, skip_header=False, # create a Dataset with lists of examples and fields super(TabularDataset, self).__init__(examples, fields, **kwargs) + 
self.finalize_fields() def create_examples(reader, format, fields, skip_header): diff --git a/takepod/storage/field.py b/takepod/storage/field.py index b92d16ef..1b078341 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -194,7 +194,8 @@ def __init__(self, custom_numericalize=None, is_target=False, fixed_length=None, - allow_missing_data=False + allow_missing_data=False, + missing_data_token=-1 ): """Create a Field from arguments. @@ -269,10 +270,12 @@ def __init__(self, is false and None is sent to be preprocessed, an ValueError will be raised. If 'allow_missing_data' is True, if a None is sent to be preprocessed, it will be stored and later numericalized properly. - If the field is sequential the numericalization of a missing data field will - be an empty numpy Array, else the numericalization will be a numpy Array - containing a single np.Nan ([np.Nan]) Default: False + missing_data_token : number + Token to use to mark batch rows as missing. If data for a field is missing, + its matrix row will be filled with this value. For non numericalizable fields, + this parameter is ignored and the value will be None. + Default: -1 Raises ------ @@ -332,6 +335,7 @@ def __init__(self, self.pretokenize_pipeline = PretokenizationPipeline() self.posttokenize_pipeline = PosttokenizationPipeline() self.allow_missing_data = allow_missing_data + self.missing_data_token = missing_data_token @property def use_vocab(self): @@ -600,7 +604,7 @@ def get_default_value(self): raise ValueError(error_msg) if self.is_numericalizable: - return -1 + return self.missing_data_token else: return None diff --git a/test/storage/test_field.py b/test/storage/test_field.py index 9db153b2..af51233b 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -684,7 +684,7 @@ def test_missing_values_default_sequential(): assert data_exists == (None, ["data_string"]) fld.finalize() - assert np.all(fld.numericalize(data_missing) == np.empty(0)) + assert fld.numericalize(data_missing) is None assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")])) diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 622a6dca..2139207c 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -299,9 +299,22 @@ def text_len_key(example): else: return len(example.text[1]) - -def test_iterator_missing_data_in_batch(): - pass +@pytest.mark.usefixtures("json_file_path") +def test_iterator_missing_data_in_batch(json_file_path): + missing_data_default_value = -99 + fields = tabular_dataset_fields() + missing_value_field = Field("non_numericalizable_field", + tokenizer="split", + vocab=Vocab(), + allow_missing_data=True, + missing_data_token=missing_data_default_value) + fields['text'] = missing_value_field + ds = create_tabular_dataset_from_json(fields, json_file_path) + + for x_batch, _ in Iterator(ds, batch_size=len(ds)): + # test if the value we know is missing is correctly filled out + missing_value_row = x_batch.non_numericalizable_field[3] + assert np.all(missing_value_row == missing_data_default_value) @pytest.mark.parametrize( "look_ahead_multiplier, expected_row_lengths, bucket_sort_key, sort_key", From 280cc43ffd74ae26cf0eb9ff8d0d2ae502fae079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 24 Feb 2020 12:47:23 +0100 Subject: [PATCH 04/25] Optimized handling od missing value rows in Iterator --- takepod/datasets/iterator.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git 
a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index eb924593..f057cba0 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -266,19 +266,26 @@ def _create_batch(self, examples): missing_data_symbol_index = field.get_default_value() # TODO cache missing data # row for batch to avoid multiple instantiations? - row = np.full(pad_length, missing_data_symbol_index) + + if matrix is None: + # Create matrix of the correct dtype + matrix = np.empty(shape=(n_rows, pad_length), + dtype=type(missing_data_symbol_index)) + + matrix[i] = missing_data_symbol_index else: row = data if should_pad: row = field.pad_to_length(row, pad_length) - if matrix is None: - # Create matrix of the correct dtype - matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) + if matrix is None: + # Create matrix of the correct dtype + matrix = np.empty(shape=(n_rows, pad_length), + dtype=row.dtype) - # set the matrix row to the numericalized, padded array - matrix[i] = row + # set the matrix row to the numericalized, padded array + matrix[i] = row batch_feature = matrix From 2f505b27c85a5baa4bd686ef1043343a285d2b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Thu, 27 Feb 2020 17:22:19 +0100 Subject: [PATCH 05/25] Made TfIdf vectorizer not support fields with missing data --- takepod/storage/field.py | 1 - takepod/storage/vectorizers/tfidf.py | 8 ++++++++ test/storage/conftest.py | 19 ++++++++++++++++--- test/storage/test_field.py | 8 ++++---- test/storage/test_iterator.py | 20 ++++++++++++-------- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 1b078341..20c48686 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -6,7 +6,6 @@ import numpy as np from takepod.preproc.tokenizers import get_tokenizer -from takepod.storage.vocab import SpecialVocabSymbols _LOGGER = logging.getLogger(__name__) diff --git a/takepod/storage/vectorizers/tfidf.py b/takepod/storage/vectorizers/tfidf.py index e1838fde..6b42d782 100644 --- a/takepod/storage/vectorizers/tfidf.py +++ b/takepod/storage/vectorizers/tfidf.py @@ -153,6 +153,14 @@ def fit(self, dataset, field): "or by providing field with a non-empty vocab property." 
_LOGGER.error(error_msg) raise ValueError(error_msg) + + if field and field.allow_missing_data: + error_msg = "CountVectorizer doesn't support fields that " \ + "contain missing data: " \ + "{}, field: {}".format(str(dataset), str(field)) + _LOGGER.error(error_msg) + raise ValueError(error_msg) + self._vocab = field.vocab if self._vocab is None else self._vocab self._init_special_indexes() self._fitted = True diff --git a/test/storage/conftest.py b/test/storage/conftest.py index c9ede150..e612d523 100644 --- a/test/storage/conftest.py +++ b/test/storage/conftest.py @@ -61,16 +61,18 @@ def tabular_dataset(json_file_path): @pytest.fixture() def tabular_dataset_fields(fixed_length=None): text = Field('text', eager=True, vocab=Vocab(), - fixed_length=fixed_length, allow_missing_data=True) + fixed_length=fixed_length, allow_missing_data=False) + text_missing = Field('text_with_missing_data', eager=True, vocab=Vocab(), + fixed_length=fixed_length, allow_missing_data=True) rating = Field('rating', tokenize=False, eager=False, is_target=True, custom_numericalize=float) - fields = {"text": text, "rating": rating} + fields = {"text": text, "text_with_missing_data": text_missing, "rating": rating} return fields -TABULAR_TEXT = ( +TABULAR_TEXT_WITH_MISSING = ( "a b c", "a", "a b c d", @@ -80,6 +82,16 @@ def tabular_dataset_fields(fixed_length=None): "b b b b b b" ) +TABULAR_TEXT = ( + "a b c", + "a", + "a b c d", + "a", + "d b", + "d c g", + "b b b b b b" +) + TABULAR_RATINGS = (2.5, 3.2, 1.1, 2.1, 5.4, 2.8, 1.9) @@ -87,6 +99,7 @@ def tabular_dataset_fields(fixed_length=None): def tabular_data(): return { "text": TABULAR_TEXT, + "text_with_missing_data": TABULAR_TEXT_WITH_MISSING, "rating": TABULAR_RATINGS, } diff --git a/test/storage/test_field.py b/test/storage/test_field.py index af51233b..761475e5 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -618,12 +618,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 2139207c..6785ca4c 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -129,20 +129,22 @@ def custom_datatype_tokenizer(data): return MockCustomDataClass(data) fields = tabular_dataset_fields() - text_field = fields['text'] + text_field = fields['text_with_missing_data'] non_numericalizable_field = Field("non_numericalizable_field", tokenizer=custom_datatype_tokenizer, is_numericalizable=False, allow_missing_data=True) - fields['text'] = (text_field, non_numericalizable_field) + fields['text_with_missing_data'] = (text_field, non_numericalizable_field) dataset = create_tabular_dataset_from_json(fields, json_file_path) dataset.finalize_fields() for x_batch, _ in Iterator(dataset, batch_size=len(dataset)): assert isinstance(x_batch.non_numericalizable_field, (list, tuple)) - for i, batch_data, real_data in zip(range(len(dataset)), x_batch.non_numericalizable_field, TABULAR_TEXT): + for i, batch_data, real_data in zip( + range(len(dataset)), x_batch.non_numericalizable_field, TABULAR_TEXT + ): if i == 3: assert batch_data is None else: @@ -299,16 +301,17 @@ def text_len_key(example): 
else: return len(example.text[1]) + @pytest.mark.usefixtures("json_file_path") def test_iterator_missing_data_in_batch(json_file_path): missing_data_default_value = -99 fields = tabular_dataset_fields() missing_value_field = Field("non_numericalizable_field", - tokenizer="split", - vocab=Vocab(), - allow_missing_data=True, - missing_data_token=missing_data_default_value) - fields['text'] = missing_value_field + tokenizer="split", + vocab=Vocab(), + allow_missing_data=True, + missing_data_token=missing_data_default_value) + fields['text_with_missing_data'] = missing_value_field ds = create_tabular_dataset_from_json(fields, json_file_path) for x_batch, _ in Iterator(ds, batch_size=len(ds)): @@ -316,6 +319,7 @@ def test_iterator_missing_data_in_batch(json_file_path): missing_value_row = x_batch.non_numericalizable_field[3] assert np.all(missing_value_row == missing_data_default_value) + @pytest.mark.parametrize( "look_ahead_multiplier, expected_row_lengths, bucket_sort_key, sort_key", [ From cb833dcd676c720f0cdd8025089dea8e27b45b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Fri, 28 Feb 2020 12:53:30 +0100 Subject: [PATCH 06/25] Added missing data support to subclasses of Field --- takepod/storage/field.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 692289db..74eb672d 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -796,7 +796,8 @@ def __init__(self, vocab=None, eager=True, custom_numericalize=None, - allow_missing_data=False + allow_missing_data=False, + missing_data_token=-1 ): if vocab is not None and vocab.has_specials: error_msg = "Vocab contains special symbols." \ @@ -816,9 +817,8 @@ def __init__(self, custom_numericalize=custom_numericalize, is_target=True, fixed_length=1, - allow_missing_data=allow_missing_data - # TODO add default missing value token when merged - # with missing value branch + allow_missing_data=allow_missing_data, + missing_data_token=missing_data_token ) @@ -835,7 +835,8 @@ def __init__(self, custom_numericalize=None, is_target=False, fixed_length=None, - allow_missing_data=False): + allow_missing_data=False, + missing_data_token=-1): super().__init__( name=name, vocab=vocab, @@ -846,7 +847,8 @@ def __init__(self, custom_numericalize=custom_numericalize, is_target=is_target, fixed_length=fixed_length, - allow_missing_data=allow_missing_data + allow_missing_data=allow_missing_data, + missing_data_token=missing_data_token ) @@ -860,8 +862,9 @@ def __init__(self, num_of_classes=None, vocab=None, eager=True, + custom_numericalize=None, allow_missing_data=False, - custom_numericalize=None): + missing_data_token=-1): """Create a MultilabelField from arguments. 
Parameters @@ -921,7 +924,8 @@ def __init__(self, custom_numericalize=custom_numericalize, is_target=True, fixed_length=num_of_classes, - allow_missing_data=allow_missing_data) + allow_missing_data=allow_missing_data, + missing_data_token=missing_data_token) def finalize(self): super().finalize() From 00dc2fca5babf4f432b6b8cf5150e0b7532d3607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 19:29:34 +0100 Subject: [PATCH 07/25] Fixed a test --- test/storage/test_iterator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 09d30eb8..89ae8b2e 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -306,7 +306,7 @@ def text_len_key(example): def test_iterator_missing_data_in_batch(json_file_path): missing_data_default_value = -99 fields = tabular_dataset_fields() - missing_value_field = Field("non_numericalizable_field", + missing_value_field = Field("missing_value_field", tokenizer="split", vocab=Vocab(), allow_missing_data=True, @@ -314,9 +314,9 @@ def test_iterator_missing_data_in_batch(json_file_path): fields['text_with_missing_data'] = missing_value_field ds = create_tabular_dataset_from_json(fields, json_file_path) - for x_batch, _ in Iterator(ds, batch_size=len(ds)): + for x_batch, _ in Iterator(ds, batch_size=len(ds), shuffle=False): # test if the value we know is missing is correctly filled out - missing_value_row = x_batch.non_numericalizable_field[3] + missing_value_row = x_batch.missing_value_field[3] assert np.all(missing_value_row == missing_data_default_value) From c4a11849571c4c6355cb322b300ece3c5ae56e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 20:42:50 +0100 Subject: [PATCH 08/25] Added custom padding token to field for use with custom_numericalize --- takepod/storage/field.py | 13 ++++++++++++- test/storage/test_field.py | 21 +++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 74eb672d..003a872f 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -191,6 +191,7 @@ def __init__(self, eager=True, is_numericalizable=True, custom_numericalize=None, + custom_numericalize_padding_token=-1, is_target=False, fixed_length=None, allow_missing_data=False, @@ -253,10 +254,13 @@ def __init__(self, on custom datatypes. For non-numericalizable fields, Iterator will generate batch fields containing lists of these custom data type instances returned by the tokenizer. - custom_numericalize : callable The numericalization function that will be called if the field doesn't use a vocabulary. + custom_numericalize_padding_token : int + If custom_numericalize is provided and padding the batch matrix is needed, + this token is used to pad the end of the matrix row. + If custom_numericalize is None, this is ignored. is_target : bool Whether this field is a target variable. Affects iteration over batches. Default: False. 
@@ -331,6 +335,7 @@ def __init__(self, self.tokenizer = None self.custom_numericalize = custom_numericalize + self.custom_numericalize_padding_token = custom_numericalize_padding_token self.is_target = is_target self.fixed_length = fixed_length @@ -698,6 +703,10 @@ def pad_to_length(self, row, length, custom_pad_symbol=None, if self.use_vocab: pad_symbol = self.vocab.pad_symbol_index() + + elif self.custom_numericalize: + pad_symbol = self.custom_numericalize_padding_token + else: pad_symbol = custom_pad_symbol @@ -833,6 +842,7 @@ def __init__(self, vocab=None, eager=True, custom_numericalize=None, + custom_numericalize_padding_token=-1, is_target=False, fixed_length=None, allow_missing_data=False, @@ -845,6 +855,7 @@ def __init__(self, store_as_tokenized=True, eager=eager, custom_numericalize=custom_numericalize, + custom_numericalize_padding_token=custom_numericalize_padding_token, is_target=is_target, fixed_length=fixed_length, allow_missing_data=allow_missing_data, diff --git a/test/storage/test_field.py b/test/storage/test_field.py index fdca68bb..a0cc6ea9 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -226,6 +226,19 @@ def test_field_pad_to_length(row, length, expected_row, pad_left, assert received_row.tolist() == expected_row +def test_field_pad_custom_numericalize(): + custom_padding_token = -999 + f = Field("test_field", + custom_numericalize=int, + custom_numericalize_padding_token=custom_padding_token, + tokenizer='split') + mock_numericalization = np.array([1, 2, 3, 4]) + expected_numericalization = np.array([1, 2, 3, 4] + [custom_padding_token] * 6) + + padded = f.pad_to_length(mock_numericalization, 10, pad_left=False) + assert np.all(padded == expected_numericalization) + + @pytest.mark.parametrize( "row, length, expected_row", [ @@ -618,12 +631,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): From a51b3b48a6366be8fa3c5cc322f07edc3ed2bbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 20:43:59 +0100 Subject: [PATCH 09/25] flake8 --- test/storage/test_field.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index a0cc6ea9..e0a818bc 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -631,12 +631,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): From 5bff1b08fc1a482b93967e38c62e1b3ac9376d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 22:04:18 +0100 Subject: [PATCH 10/25] WIP, testing --- takepod/datasets/iterator.py | 8 +++---- takepod/storage/__init__.py | 4 ++-- takepod/storage/field.py | 22 +++++++++++++++++- test/storage/test_field.py | 45 ++++++++++++++++++------------------ 4 files changed, 49 
insertions(+), 30 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index 82333b7c..0af2bb75 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -268,7 +268,7 @@ def _create_batch(self, examples): matrix = None # np.empty(shape=(n_rows, pad_length)) # non-sequential fields all have length = 1, no padding necessary - should_pad = True if field.is_sequential else False + should_pad = field.is_sequential for i, example in enumerate(examples): @@ -321,14 +321,14 @@ def _create_batch(self, examples): @staticmethod def _get_pad_length(field, examples): - if not field.is_sequential: - return 1 - # the fixed_length attribute of Field has priority over the max length # of all the examples in the batch if field.fixed_length is not None: return field.fixed_length + if not field.is_sequential: + return 1 + # if fixed_length is None, then return the maximum length of all the # examples in the batch def length_of_field(example): diff --git a/takepod/storage/__init__.py b/takepod/storage/__init__.py index ee192886..44a1c04f 100644 --- a/takepod/storage/__init__.py +++ b/takepod/storage/__init__.py @@ -2,7 +2,7 @@ from .example_factory import ExampleFactory, ExampleFormat from .field import Field, TokenizedField, MultilabelField, MultioutputField, \ - unpack_fields, LabelField + unpack_fields, LabelField, SentenceEmbeddingField from .resources.downloader import (BaseDownloader, SCPDownloader, HttpDownloader, SimpleHttpDownloader) from .resources.large_resource import LargeResource, SCPLargeResource @@ -21,6 +21,6 @@ __all__ = ["BaseDownloader", "SCPDownloader", "HttpDownloader", "SimpleHttpDownloader", "Field", "TokenizedField", "LabelField", "MultilabelField", "MultioutputField", - "unpack_fields", "LargeResource", "SCPLargeResource", + "unpack_fields", "LargeResource", "SCPLargeResource", "SentenceEmbeddingField", "VectorStorage", "BasicVectorStorage", "SpecialVocabSymbols", "Vocab", "ExampleFactory", "ExampleFormat", "TfIdfVectorizer"] diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 003a872f..b179fe0f 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -2,6 +2,7 @@ import logging import itertools from collections import deque +from typing import Callable import numpy as np @@ -648,7 +649,7 @@ def numericalize(self, data): _LOGGER.error(error_msg) raise ValueError(error_msg) - else: + elif not self.custom_numericalize: return None # raw data is just a string, so we need to wrap it into an iterable @@ -960,6 +961,25 @@ def _numericalize_tokens(self, tokens): return numericalize_multihot(tokens, token_numericalize, self.num_of_classes) +class SentenceEmbeddingField(Field): + """Field used for sentence-level multidimensional embeddings.""" + + def __init__(self, + name: str, + embedding_fn: Callable[[str], np.array], + embedding_size: int): + super().__init__(name, + custom_numericalize=embedding_fn, + tokenizer=None, + language=None, + vocab=None, + tokenize=False, + store_as_raw=True, + store_as_tokenized=False, + is_target=False, + fixed_length=embedding_size, + allow_missing_data=True) + def numericalize_multihot(tokens, token_indexer, num_of_classes): active_classes = list(map(token_indexer, tokens)) multihot_encoding = np.zeros(num_of_classes, dtype=np.bool) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index e0a818bc..486b2d7b 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -5,7 +5,7 @@ from mock import patch from takepod.storage import Field, 
TokenizedField, MultilabelField, \ - Vocab, SpecialVocabSymbols, MultioutputField, LabelField + Vocab, SpecialVocabSymbols, MultioutputField, LabelField, SentenceEmbeddingField ONE_TO_FIVE = [1, 2, 3, 4, 5] @@ -690,36 +690,14 @@ def test_missing_values_default_sequential(): custom_numericalize=lambda x: hash(x), allow_missing_data=True) - _, data_missing = fld.preprocess(None)[0] _, data_exists = fld.preprocess("data_string")[0] - assert data_missing == (None, None) assert data_exists == (None, ["data_string"]) fld.finalize() - assert fld.numericalize(data_missing) is None assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")])) -def test_missing_values_custom_numericalize(): - fld = Field(name="test_field", - store_as_raw=True, - tokenize=False, - custom_numericalize=int, - allow_missing_data=True) - - _, data_missing = fld.preprocess(None)[0] - _, data_exists = fld.preprocess("404")[0] - - assert data_missing == (None, None) - assert data_exists == ("404", None) - - fld.finalize() - - assert fld.numericalize(data_missing) is None - assert np.all(fld.numericalize(data_exists) == np.array([404])) - - def test_missing_symbol_index_vocab(): vocab = Vocab() fld = Field(name="test_field", @@ -875,3 +853,24 @@ def test_label_field(): _, example = x[0] raw, _ = example assert label_field.numericalize(example) == vocab.stoi[raw] + + +def test_sentence_embedding_field(): + def mock_embedding_fn(sentence): + if sentence == "test_sentence": + return np.array([1, 2, 3, 4]) + + if sentence is None: + return np.zeros(4) + + field = SentenceEmbeddingField("test_field", + embedding_fn=mock_embedding_fn, + embedding_size=4) + + (_, data), = field.preprocess("test_sentence") + numericalization_1 = field.numericalize(data) + assert np.all(numericalization_1 == np.array([1, 2, 3, 4])) + + (_, data), = field.preprocess(None) + numericalization_2 = field.numericalize(data) + assert np.all(numericalization_2 == np.zeros(4)) From feeac4e5ead47c4aee908fc8d91107b67e5b18eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 22:10:22 +0100 Subject: [PATCH 11/25] flake8 --- takepod/storage/field.py | 1 + 1 file changed, 1 insertion(+) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index b179fe0f..7d65432b 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -980,6 +980,7 @@ def __init__(self, fixed_length=embedding_size, allow_missing_data=True) + def numericalize_multihot(tokens, token_indexer, num_of_classes): active_classes = list(map(token_indexer, tokens)) multihot_encoding = np.zeros(num_of_classes, dtype=np.bool) From c1196256ac466c77acab2d53bc46916df589956f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 22:22:23 +0100 Subject: [PATCH 12/25] Added documentation --- takepod/storage/field.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 7d65432b..c4ba5e83 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -968,6 +968,18 @@ def __init__(self, name: str, embedding_fn: Callable[[str], np.array], embedding_size: int): + """ + Field used for sentence-level multidimensional embeddings. + + Parameters + ---------- + name: str + Field name, used for referencing data in the dataset. + embedding_fn: Callable[[str], np.array] + Callable that takes a string and returns a fixed-width embedding. + embedding_size: int + Width of the embedding. 
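+
+        Note that this field allows missing data, so embedding_fn should
+        also accept None and return a vector of width embedding_size for
+        examples whose data is missing.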
+ """ super().__init__(name, custom_numericalize=embedding_fn, tokenizer=None, From c5fa93e4eca31394ba24ba3e50c484962afa993e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 19 Feb 2020 22:52:00 +0100 Subject: [PATCH 13/25] Added per-field custom datatype support --- takepod/datasets/iterator.py | 1 - takepod/storage/field.py | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index 061cd194..fecac261 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -257,7 +257,6 @@ def _create_batch(self, examples): for field in self._dataset.fields: if field.is_numericalizable: # If this field is numericalizable, generate a possibly padded matrix - # the length to which all the rows are padded (or truncated) pad_length = Iterator._get_pad_length(field, examples) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 0c854baf..26d0a615 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -248,13 +248,14 @@ def __init__(self, If true, the output of the tokenizer is presumed to be a list of tokens and will be numericalized using the provided Vocab or custom_numericalize. For numericalizable fields, Iterator will generate batch fields containing - numpy matrices. + numpy matrices. + + If false, the out of the tokenizer is presumed to be a custom datatype. + Posttokenization hooks aren't allowed to be added as they can't be called + on custom datatypes. For non-numericalizable fields, Iterator will generate + batch fields containing lists of these custom data type instances returned + by the tokenizer. - If false, the out of the tokenizer is presumed to be a custom datatype. - Posttokenization hooks aren't allowed to be added as they can't be called - on custom datatypes. For non-numericalizable fields, Iterator will generate - batch fields containing lists of these custom data type instances returned - by the tokenizer. custom_numericalize : callable The numericalization function that will be called if the field doesn't use a vocabulary. 
From 3a7efc3ff420c3569379afadd8fe5b55da0b862 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 20 Feb 2020 18:59:30 +0100
Subject: [PATCH 14/25] WIP: TfIdfVectorizer update pending

---
 takepod/storage/field.py   | 2 +-
 test/storage/conftest.py   | 2 +-
 test/storage/test_field.py | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 26d0a615..cd1cce5e 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from takepod.preproc.tokenizers import get_tokenizer
-from takepod.storage.vocab import Vocab
+from takepod.storage.vocab import Vocab, SpecialVocabSymbols
 
 _LOGGER = logging.getLogger(__name__)
 
diff --git a/test/storage/conftest.py b/test/storage/conftest.py
index e612d523..82ef9ac0 100644
--- a/test/storage/conftest.py
+++ b/test/storage/conftest.py
@@ -86,7 +86,7 @@ def tabular_dataset_fields(fixed_length=None):
         "a b c",
         "a",
         "a b c d",
-        "a",
+        None,
         "d b",
         "d c g",
         "b b b b b b"
diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index 0baf53b0..f4759641 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-        ["class1", "class2", "class3", "class4"],
-        np.array([1, 1, 1, 1, 0, 0])
+            ["class1", "class2", "class3", "class4"],
+            np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-        [],
-        np.array([0, 0, 0, 0, 0, 0])
+            [],
+            np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):

From 318a36bc3aae1c7c436c9f07ba11bbe169bd44d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Mon, 24 Feb 2020 12:22:22 +0100
Subject: [PATCH 15/25] Added option to define custom missing data symbol

---
 test/storage/test_iterator.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py
index 308546e5..c335f37e 100644
--- a/test/storage/test_iterator.py
+++ b/test/storage/test_iterator.py
@@ -301,6 +301,17 @@ def text_len_key(example):
     else:
         return len(example.text[1])
 
+@pytest.mark.usefixtures("json_file_path")
+def test_iterator_missing_data_in_batch(json_file_path):
+    missing_data_default_value = -99
+    fields = tabular_dataset_fields()
+    missing_value_field = Field("non_numericalizable_field",
+                               tokenizer="split",
+                               vocab=Vocab(),
+                               allow_missing_data=True,
+                               missing_data_token=missing_data_default_value)
+    fields['text'] = missing_value_field
+    ds = create_tabular_dataset_from_json(fields, json_file_path)
 
 @pytest.mark.usefixtures("json_file_path")
 def test_iterator_missing_data_in_batch(json_file_path):
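For reference, the option being tested above reads like this in use. Judging by the test and the commit message, rows where this column is None should numericalize to the custom token instead of raising, so batches keep a rectangular shape; this is a hedged reading, since the Field-side diff is not shown in this part of the series:

    field = Field("maybe_missing",
                  tokenizer="split",
                  vocab=Vocab(),
                  allow_missing_data=True,
                  missing_data_token=-99)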
From 41b1aa7e0c1b6c4753b4337cfc115ea05b41a596 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 27 Feb 2020 17:22:19 +0100
Subject: [PATCH 16/25] Made TfIdf vectorizer not support fields with missing data

---
 test/storage/conftest.py      |  2 +-
 test/storage/test_field.py    |  8 ++++----
 test/storage/test_iterator.py | 11 ++++++-----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/test/storage/conftest.py b/test/storage/conftest.py
index 82ef9ac0..e612d523 100644
--- a/test/storage/conftest.py
+++ b/test/storage/conftest.py
@@ -86,7 +86,7 @@ def tabular_dataset_fields(fixed_length=None):
         "a b c",
         "a",
         "a b c d",
-        None,
+        "a",
         "d b",
         "d c g",
         "b b b b b b"
diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index f4759641..0baf53b0 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-            ["class1", "class2", "class3", "class4"],
-            np.array([1, 1, 1, 1, 0, 0])
+        ["class1", "class2", "class3", "class4"],
+        np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-            [],
-            np.array([0, 0, 0, 0, 0, 0])
+        [],
+        np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):
diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py
index c335f37e..b44417bf 100644
--- a/test/storage/test_iterator.py
+++ b/test/storage/test_iterator.py
@@ -301,16 +301,17 @@ def text_len_key(example):
     else:
         return len(example.text[1])
 
+
 @pytest.mark.usefixtures("json_file_path")
 def test_iterator_missing_data_in_batch(json_file_path):
     missing_data_default_value = -99
     fields = tabular_dataset_fields()
     missing_value_field = Field("non_numericalizable_field",
-                               tokenizer="split",
-                               vocab=Vocab(),
-                               allow_missing_data=True,
-                               missing_data_token=missing_data_default_value)
-    fields['text'] = missing_value_field
+                                tokenizer="split",
+                                vocab=Vocab(),
+                                allow_missing_data=True,
+                                missing_data_token=missing_data_default_value)
+    fields['text_with_missing_data'] = missing_value_field
     ds = create_tabular_dataset_from_json(fields, json_file_path)
 
 @pytest.mark.usefixtures("json_file_path")

From dcaba7c9a3a133a9e679b0a05bea176d8e0b4e1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 19:29:34 +0100
Subject: [PATCH 17/25] Fixed a test

---
 test/storage/test_iterator.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py
index b44417bf..308546e5 100644
--- a/test/storage/test_iterator.py
+++ b/test/storage/test_iterator.py
@@ -302,18 +302,6 @@ def text_len_key(example):
         return len(example.text[1])
 
 
-@pytest.mark.usefixtures("json_file_path")
-def test_iterator_missing_data_in_batch(json_file_path):
-    missing_data_default_value = -99
-    fields = tabular_dataset_fields()
-    missing_value_field = Field("non_numericalizable_field",
-                                tokenizer="split",
-                                vocab=Vocab(),
-                                allow_missing_data=True,
-                                missing_data_token=missing_data_default_value)
-    fields['text_with_missing_data'] = missing_value_field
-    ds = create_tabular_dataset_from_json(fields, json_file_path)
-
 @pytest.mark.usefixtures("json_file_path")
 def test_iterator_missing_data_in_batch(json_file_path):
     missing_data_default_value = -99

From 09cbfb696debd23dec35a1bb0868aec01d55b90c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 20:42:50 +0100
Subject: [PATCH 18/25] Added custom padding token to field for use with custom_numericalize

---
 test/storage/test_field.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index 0baf53b0..f4759641 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-        ["class1", "class2", "class3", "class4"],
-        np.array([1, 1, 1, 1, 0, 0])
+            ["class1", "class2", "class3", "class4"],
+            np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-        [],
-        np.array([0, 0, 0, 0, 0, 0])
+            [],
+            np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):
From 39f322f876d965e28a94469010b19266313d2745 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 20:43:59 +0100
Subject: [PATCH 19/25] flake8

---
 test/storage/test_field.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index f4759641..0baf53b0 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-            ["class1", "class2", "class3", "class4"],
-            np.array([1, 1, 1, 1, 0, 0])
+        ["class1", "class2", "class3", "class4"],
+        np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-            [],
-            np.array([0, 0, 0, 0, 0, 0])
+        [],
+        np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):
From 56fb576bb3f4fb92edf8f6ae4e68971ad06f7afb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 22:04:18 +0100
Subject: [PATCH 20/25] WIP, testing

---
 takepod/datasets/iterator.py |  8 +++----
 takepod/storage/__init__.py  |  4 ++--
 takepod/storage/field.py     | 22 +++++++++++++++++-
 test/storage/test_field.py   | 45 ++++++++++++++++++------------------
 4 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py
index fecac261..9c0e686f 100644
--- a/takepod/datasets/iterator.py
+++ b/takepod/datasets/iterator.py
@@ -267,7 +267,7 @@ def _create_batch(self, examples):
                 matrix = None  # np.empty(shape=(n_rows, pad_length))
 
                 # non-sequential fields all have length = 1, no padding necessary
-                should_pad = True if field.is_sequential else False
+                should_pad = field.is_sequential
 
                 for i, example in enumerate(examples):
 
@@ -320,14 +320,14 @@ def _create_batch(self, examples):
 
     @staticmethod
     def _get_pad_length(field, examples):
-        if not field.is_sequential:
-            return 1
-
         # the fixed_length attribute of Field has priority over the max length
         # of all the examples in the batch
         if field.fixed_length is not None:
             return field.fixed_length
 
+        if not field.is_sequential:
+            return 1
+
         # if fixed_length is None, then return the maximum length of all the
         # examples in the batch
         def length_of_field(example):
diff --git a/takepod/storage/__init__.py b/takepod/storage/__init__.py
index ee192886..44a1c04f 100644
--- a/takepod/storage/__init__.py
+++ b/takepod/storage/__init__.py
@@ -2,7 +2,7 @@
 from .example_factory import ExampleFactory, ExampleFormat
 from .field import Field, TokenizedField, MultilabelField, MultioutputField, \
-    unpack_fields, LabelField
+    unpack_fields, LabelField, SentenceEmbeddingField
 from .resources.downloader import (BaseDownloader, SCPDownloader, HttpDownloader,
                                    SimpleHttpDownloader)
 from .resources.large_resource import LargeResource, SCPLargeResource
@@ -21,6 +21,6 @@
 __all__ = ["BaseDownloader", "SCPDownloader", "HttpDownloader", "SimpleHttpDownloader",
            "Field", "TokenizedField", "LabelField", "MultilabelField", "MultioutputField",
-           "unpack_fields", "LargeResource", "SCPLargeResource",
+           "unpack_fields", "LargeResource", "SCPLargeResource", "SentenceEmbeddingField",
            "VectorStorage", "BasicVectorStorage", "SpecialVocabSymbols", "Vocab",
            "ExampleFactory", "ExampleFormat", "TfIdfVectorizer"]
diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index cd1cce5e..97413f5d 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -2,6 +2,7 @@
 import logging
 import itertools
 from collections import deque
+from typing import Callable
 
 import numpy as np
 
@@ -655,7 +656,7 @@ def numericalize(self, data):
                 _LOGGER.error(error_msg)
                 raise ValueError(error_msg)
 
-            else:
+            elif not self.custom_numericalize:
                 return None
 
         # raw data is just a string, so we need to wrap it into an iterable
@@ -970,6 +971,25 @@ def _numericalize_tokens(self, tokens):
         return numericalize_multihot(tokens, token_numericalize, self.num_of_classes)
 
 
+class SentenceEmbeddingField(Field):
+    """Field used for sentence-level multidimensional embeddings."""
+
+    def __init__(self,
+                 name: str,
+                 embedding_fn: Callable[[str], np.array],
+                 embedding_size: int):
+        super().__init__(name,
+                         custom_numericalize=embedding_fn,
+                         tokenizer=None,
+                         language=None,
+                         vocab=None,
+                         tokenize=False,
+                         store_as_raw=True,
+                         store_as_tokenized=False,
+                         is_target=False,
+                         fixed_length=embedding_size,
+                         allow_missing_data=True)
+
 def numericalize_multihot(tokens, token_indexer, num_of_classes):
     active_classes = list(map(token_indexer, tokens))
     multihot_encoding = np.zeros(num_of_classes, dtype=np.bool)
diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index 0baf53b0..d44e138b 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -5,7 +5,7 @@
 from mock import patch
 
 from takepod.storage import Field, TokenizedField, MultilabelField, \
-    Vocab, SpecialVocabSymbols, MultioutputField, LabelField
+    Vocab, SpecialVocabSymbols, MultioutputField, LabelField, SentenceEmbeddingField
 
 ONE_TO_FIVE = [1, 2, 3, 4, 5]
 
@@ -690,36 +690,14 @@ def test_missing_values_default_sequential():
                 custom_numericalize=lambda x: hash(x),
                 allow_missing_data=True)
 
-    _, data_missing = fld.preprocess(None)[0]
     _, data_exists = fld.preprocess("data_string")[0]
 
-    assert data_missing == (None, None)
     assert data_exists == (None, ["data_string"])
 
     fld.finalize()
 
-    assert fld.numericalize(data_missing) is None
    assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")]))
 
 
-def test_missing_values_custom_numericalize():
-    fld = Field(name="test_field",
-                store_as_raw=True,
-                tokenize=False,
-                custom_numericalize=int,
-                allow_missing_data=True)
-
-    _, data_missing = fld.preprocess(None)[0]
-    _, data_exists = fld.preprocess("404")[0]
-
-    assert data_missing == (None, None)
-    assert data_exists == ("404", None)
-
-    fld.finalize()
-
-    assert fld.numericalize(data_missing) is None
-    assert np.all(fld.numericalize(data_exists) == np.array([404]))
-
-
 def test_missing_symbol_index_vocab():
     vocab = Vocab()
     fld = Field(name="test_field",
@@ -875,3 +853,24 @@ def test_label_field():
         _, example = x[0]
         raw, _ = example
         assert label_field.numericalize(example) == vocab.stoi[raw]
+
+
+def test_sentence_embedding_field():
+    def mock_embedding_fn(sentence):
+        if sentence == "test_sentence":
+            return np.array([1, 2, 3, 4])
+
+        if sentence is None:
+            return np.zeros(4)
+
+    field = SentenceEmbeddingField("test_field",
+                                   embedding_fn=mock_embedding_fn,
+                                   embedding_size=4)
+
+    (_, data), = field.preprocess("test_sentence")
+    numericalization_1 = field.numericalize(data)
+    assert np.all(numericalization_1 == np.array([1, 2, 3, 4]))
+
+    (_, data), = field.preprocess(None)
+    numericalization_2 = field.numericalize(data)
+    assert np.all(numericalization_2 == np.zeros(4))

From f914f8f14ed82db141c418aab119aef24300f9f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 22:10:22 +0100
Subject: [PATCH 21/25] flake8

---
 takepod/storage/field.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 97413f5d..78cf0ca0 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -990,6 +990,7 @@ def __init__(self,
                          fixed_length=embedding_size,
                          allow_missing_data=True)
 
+
 def numericalize_multihot(tokens, token_indexer, num_of_classes):
     active_classes = list(map(token_indexer, tokens))
     multihot_encoding = np.zeros(num_of_classes, dtype=np.bool)
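A note on why the _get_pad_length reordering in the WIP patch above matters: SentenceEmbeddingField passes tokenize=False, so it is presumably non-sequential, yet it sets fixed_length=embedding_size and needs a full-width row in the batch matrix. A small sketch of the effect, with f standing in for any embedding callable:

    emb = SentenceEmbeddingField("emb", embedding_fn=f, embedding_size=300)

    # Old order of checks: `if not field.is_sequential: return 1` ran
    # first, so the pad length for this field collapsed to one column.
    # New order: fixed_length (= embedding_size) wins, so each example
    # keeps its full 300-wide embedding row.
    Iterator._get_pad_length(emb, examples)  # 300 after this patch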
From 37c0ed58c4f84705b6d64ff1e09d2855145f3d8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 22:22:23 +0100
Subject: [PATCH 22/25] Added documentation

---
 takepod/storage/field.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 78cf0ca0..db7b2972 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -978,6 +978,18 @@ def __init__(self,
                  name: str,
                  embedding_fn: Callable[[str], np.array],
                  embedding_size: int):
+        """
+        Field used for sentence-level multidimensional embeddings.
+
+        Parameters
+        ----------
+        name: str
+            Field name, used for referencing data in the dataset.
+        embedding_fn: Callable[[str], np.array]
+            Callable that takes a string and returns a fixed-width embedding.
+        embedding_size: int
+            Width of the embedding.
+        """
         super().__init__(name,
                          custom_numericalize=embedding_fn,
                          tokenizer=None,

From 19141e0377645cc7becd57db086155d4262b5c4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 2 Apr 2020 22:33:22 +0200
Subject: [PATCH 23/25] rebased to master

---
 takepod/storage/field.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index db7b2972..7e1dfef7 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from takepod.preproc.tokenizers import get_tokenizer
-from takepod.storage.vocab import Vocab, SpecialVocabSymbols
+from takepod.storage.vocab import Vocab
 
 _LOGGER = logging.getLogger(__name__)
""" super().__init__(name, custom_numericalize=embedding_fn, tokenizer=None, - language=None, - vocab=None, + language=language, + vocab=vocab, tokenize=False, store_as_raw=True, store_as_tokenized=False, - is_target=False, + is_target=is_target, fixed_length=embedding_size, - allow_missing_data=True) + allow_missing_data=allow_missing_data) def numericalize_multihot(tokens, token_indexer, num_of_classes): diff --git a/test/storage/test_field.py b/test/storage/test_field.py index d44e138b..5f7a6ea9 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -865,7 +865,8 @@ def mock_embedding_fn(sentence): field = SentenceEmbeddingField("test_field", embedding_fn=mock_embedding_fn, - embedding_size=4) + embedding_size=4, + allow_missing_data=True) (_, data), = field.preprocess("test_sentence") numericalization_1 = field.numericalize(data) From 17980ba12505168ca22fb4418e30f240f181cbbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Fri, 17 Apr 2020 15:05:46 +0200 Subject: [PATCH 25/25] merged master --- test/storage/test_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index 35701994..8b777e78 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -5,7 +5,7 @@ from mock import patch from podium.storage import Field, TokenizedField, MultilabelField, \ - Vocab, SpecialVocabSymbols, MultioutputField, LabelField + Vocab, SpecialVocabSymbols, MultioutputField, LabelField, SentenceEmbeddingField ONE_TO_FIVE = [1, 2, 3, 4, 5]