From 739d3f295310c8b05aed5f325e67a7f8ad32300e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 19 Feb 2020 22:52:00 +0100 Subject: [PATCH 01/25] Added per-field custom datatype support --- takepod/datasets/iterator.py | 96 +++++++++++++---------------------- takepod/storage/field.py | 50 +++++++++++++++--- test/storage/test_field.py | 20 ++++---- test/storage/test_iterator.py | 67 ++++++++++-------------- 4 files changed, 115 insertions(+), 118 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index 7c2a215d..ddafed76 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -26,7 +26,6 @@ class Iterator: def __init__(self, dataset=None, batch_size=32, - batch_to_matrix=True, sort_key=None, shuffle=False, seed=1, @@ -84,7 +83,6 @@ def __init__(self, """ self.batch_size = batch_size - self.batch_to_matrix = batch_to_matrix self.shuffle = shuffle @@ -238,71 +236,51 @@ def set_internal_random_state(self, state): def _create_batch(self, examples): - if self.batch_to_matrix: - return self._create_matrix_batch(examples) - - else: - return self._create_list_batch(examples) - - def _create_matrix_batch(self, examples): - # dicts that will be used to create the InputBatch and TargetBatch # objects input_batch_dict, target_batch_dict = {}, {} for field in self._dataset.fields: - # the length to which all the rows are padded (or truncated) - pad_length = Iterator._get_pad_length(field, examples) - - # the last batch can have < batch_size examples - n_rows = min(self.batch_size, len(examples)) - - # empty matrix to be filled with numericalized fields - matrix = None # np.empty(shape=(n_rows, pad_length)) - - # non-sequential fields all have length = 1, no padding necessary - should_pad = True if field.sequential else False + if field.is_numericalizable: + # the length to which all the rows are padded (or truncated) + pad_length = Iterator._get_pad_length(field, examples) - for i, example in enumerate(examples): + # the last batch can have < batch_size examples + n_rows = min(self.batch_size, len(examples)) - # Get cached value - row = field.get_numericalization_for_example(example) + # empty matrix to be filled with numericalized fields + matrix = None # np.empty(shape=(n_rows, pad_length)) - if matrix is None: - # Create matrix of the correct dtype - matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) + # non-sequential fields all have length = 1, no padding necessary + should_pad = True if field.is_sequential else False - if should_pad: - row = field.pad_to_length(row, pad_length) + for i, example in enumerate(examples): - # set the matrix row to the numericalized, padded array - matrix[i] = row + # Get cached value + row = field.get_numericalization_for_example(example) - if field.is_target: - target_batch_dict[field.name] = matrix - else: - input_batch_dict[field.name] = matrix + if matrix is None: + # Create matrix of the correct dtype + matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) - input_batch = self.input_batch_class(**input_batch_dict) - target_batch = self.target_batch_class(**target_batch_dict) + if should_pad: + row = field.pad_to_length(row, pad_length) - return input_batch, target_batch + # set the matrix row to the numericalized, padded array + matrix[i] = row - def _create_list_batch(self, examples): - # dicts that will be used to create the InputBatch and TargetBatch - # objects - input_batch_dict, target_batch_dict = {}, {} - for field in self._dataset.fields: + 
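+                # every row is now numericalized and padded to the same
+                # length, so the batch can be exposed as one dense matrix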
batch_feature = matrix - vectors = [field.get_numericalization_for_example(ex) - for ex - in examples] + else: + # if the field is not representable as a matrix return a list of + # "tokens", which can be any data structure + batch_feature = [field.get_numericalization_for_example(example) + for example in examples] if field.is_target: - target_batch_dict[field.name] = vectors - + target_batch_dict[field.name] = batch_feature else: - input_batch_dict[field.name] = vectors + input_batch_dict[field.name] = batch_feature input_batch = self.input_batch_class(**input_batch_dict) target_batch = self.target_batch_class(**target_batch_dict) @@ -311,7 +289,7 @@ def _create_list_batch(self, examples): @staticmethod def _get_pad_length(field, examples): - if not field.sequential: + if not field.is_sequential: return 1 # the fixed_length attribute of Field has priority over the max length @@ -360,8 +338,7 @@ class SingleBatchIterator(Iterator): def __init__( self, - dataset: Dataset = None, - batch_to_matrix: bool = True): + dataset: Dataset = None): """Creates an Iterator that creates one batch per epoch containing all examples in the dataset. @@ -375,8 +352,7 @@ def __init__( returned as a list of numpy vectors or a matrix where each row is a padded vector. """ - super().__init__(dataset=dataset, - batch_to_matrix=batch_to_matrix) + super().__init__(dataset=dataset) def set_dataset(self, dataset: Dataset): super().set_dataset(dataset) @@ -405,7 +381,6 @@ def __init__( self, dataset, batch_size, - batch_to_matrix=True, sort_key=None, shuffle=True, seed=42, @@ -441,14 +416,13 @@ def __init__( """ if sort_key is None and bucket_sort_key is None: - error_msg = "For BucketIterator to work, either sort_key or "\ + error_msg = "For BucketIterator to work, either sort_key or " \ "bucket_sort_key must be != None." _LOGGER.error(error_msg) raise ValueError(error_msg) super().__init__(dataset, batch_size, - batch_to_matrix=batch_to_matrix, sort_key=sort_key, shuffle=shuffle, seed=seed) @@ -574,14 +548,14 @@ def __init__( """ if context_max_length is not None and context_max_length < 1: - error_msg = "'context_max_length' must not be less than 1. "\ - "If you don't want context, try flattening the dataset. "\ + error_msg = "'context_max_length' must not be less than 1. " \ + "If you don't want context, try flattening the dataset. " \ "'context_max_length' : {})".format(context_max_length) _LOGGER.error(error_msg) raise ValueError(error_msg) if context_max_depth is not None and context_max_depth < 0: - error_msg = "'context_max_depth' must not be negative. "\ + error_msg = "'context_max_depth' must not be negative. 
" \ "'context_max_depth' : {}".format(context_max_depth) _LOGGER.error(error_msg) raise ValueError(error_msg) @@ -664,7 +638,7 @@ def _create_batch(self, nodes): matrix = np.empty(shape=(n_rows, pad_length)) # non-sequential fields all have length = 1, no padding necessary - should_pad = True if field.sequential else False + should_pad = True if field.is_sequential else False for i, example in enumerate(node_context_examples): # Get cached value diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 36f67577..4f24474d 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -189,6 +189,7 @@ def __init__(self, store_as_raw=True, store_as_tokenized=False, eager=True, + is_numericalizable=True, custom_numericalize=None, is_target=False, fixed_length=None, @@ -238,6 +239,20 @@ def __init__(self, eager : bool Whether to build the vocabulary online, each time the field preprocesses raw data. + is_numericalizable : bool + Whether the output of tokenizer can be numericalized. + + If true, the output of the tokenizer is presumed to be a list of tokens and + will be numericalized using the provided Vocab or custom_numericalize. + For numericalizable fields, Iterator will generate batch fields containing + numpy matrices. + + If false, the out of the tokenizer is presumed to be a custom datatype. + Posttokenization hooks aren't allowed to be added as they can't be called + on custom datatypes. For non-numericalizable fields, Iterator will generate + batch fields containing lists of these custom data type instances returned + by the tokenizer. + custom_numericalize : callable The numericalization function that will be called if the field doesn't use a vocabulary. @@ -268,6 +283,7 @@ def __init__(self, self.name = name self.language = language self._tokenizer_arg = tokenizer + self.is_numericalizable = is_numericalizable if store_as_tokenized and tokenize: error_msg = "Store_as_tokenized' and 'tokenize' both set to True." \ @@ -290,7 +306,15 @@ def __init__(self, _LOGGER.error(error_msg) raise ValueError(error_msg) - self.sequential = store_as_tokenized or tokenize + if not is_numericalizable \ + and (custom_numericalize is not None or vocab is not None): + error_msg = "Field that is not numericalizable can't have " \ + "custom_numericalize or vocab." + + _LOGGER.error(error_msg) + raise ValueError(error_msg) + + self.is_sequential = (store_as_tokenized or tokenize) and is_numericalizable self.store_as_raw = store_as_raw self.tokenize = tokenize self.store_as_tokenized = store_as_tokenized @@ -373,6 +397,11 @@ def add_posttokenize_hook(self, hook): hook : callable The post-tokenization hook that we want to add to the field. """ + if not self.is_numericalizable: + error_msg = "Field is declared as non numericalizable. Posttokenization " \ + "hooks aren't used in such fields." + _LOGGER.error(error_msg) + raise ValueError(error_msg) self.posttokenize_pipeline.add_hook(hook) @@ -517,7 +546,8 @@ def _process_tokens(self, data, tokens): the data and tokens processed by posttokenization hooks. """ - data, tokens = self._run_posttokenization_hooks(data, tokens) + if self.is_numericalizable: + data, tokens = self._run_posttokenization_hooks(data, tokens) if self.eager and self.use_vocab and not self.vocab.finalized: self.update_vocab(data, tokens) @@ -558,7 +588,7 @@ def get_default_value(self): empty numpy array if the field is sequential or numpy array with one None value otherwise. 
""" - if self.sequential: + if self.is_sequential: return np.empty(0) return np.array([np.nan]) @@ -595,7 +625,11 @@ def numericalize(self, data): # raw data is just a string, so we need to wrap it into an iterable tokens = tokenized if self.tokenize or self.store_as_tokenized else [raw] - return self._numericalize_tokens(tokens) + if self.is_numericalizable: + return self._numericalize_tokens(tokens) + + else: + return tokens def pad_to_length(self, row, length, custom_pad_symbol=None, pad_left=False, truncate_left=False): @@ -717,8 +751,8 @@ def __setstate__(self, state): self.tokenizer = get_tokenizer(self._tokenizer_arg, self.language) def __str__(self): - return "{}[name: {}, sequential: {}, is_target: {}]".format( - self.__class__.__name__, self.name, self.sequential, self.is_target) + return "{}[name: {}, is_sequential: {}, is_target: {}]".format( + self.__class__.__name__, self.name, self.is_sequential, self.is_target) def get_output_fields(self): """Returns an Iterable of the contained output fields. @@ -839,8 +873,8 @@ def finalize(self): if self.use_vocab and len(self.vocab) > self.num_of_classes: error_msg = "Number of classes in data is greater than the declared number " \ - "of classes. Declared: {}, Actual: {}".format( - self.num_of_classes, len(self.vocab)) + "of classes. Declared: {}, Actual: {}"\ + .format(self.num_of_classes, len(self.vocab)) _LOGGER.error(error_msg) raise ValueError(error_msg) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index 68fa1db9..c19d3ca7 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -73,7 +73,7 @@ def test_field_preprocess_eager(): @pytest.mark.parametrize( - "value, store_raw, sequential, expected_raw_value, " + "value, store_raw, is_sequential, expected_raw_value, " "expected_tokenized_value", [ ("some text", True, True, "some text", ["some", "text"]), @@ -81,10 +81,10 @@ def test_field_preprocess_eager(): ("some text", False, True, None, ["some", "text"]), ] ) -def test_field_preprocess_raw_sequential(value, store_raw, sequential, +def test_field_preprocess_raw_sequential(value, store_raw, is_sequential, expected_raw_value, expected_tokenized_value): - f = Field(name="F", store_as_raw=store_raw, tokenize=sequential) + f = Field(name="F", store_as_raw=store_raw, tokenize=is_sequential) (_, (received_raw_value, received_tokenized_value)), = f.preprocess(value) @@ -93,7 +93,7 @@ def test_field_preprocess_raw_sequential(value, store_raw, sequential, @pytest.mark.parametrize( - "value, store_raw, sequential, expected_raw_value, " + "value, store_raw, is_sequential, expected_raw_value, " "expected_tokenized_value", [ ("some text", True, True, "some text", ["some", "text"]), @@ -101,10 +101,10 @@ def test_field_preprocess_raw_sequential(value, store_raw, sequential, ("some text", False, True, None, ["some", "text"]), ] ) -def test_field_pickle_tokenized(value, store_raw, sequential, +def test_field_pickle_tokenized(value, store_raw, is_sequential, expected_raw_value, expected_tokenized_value, tmpdir): - fld = Field(name="F", store_as_raw=store_raw, tokenize=sequential) + fld = Field(name="F", store_as_raw=store_raw, tokenize=is_sequential) (_, (received_raw_value, received_tokenized_value)), = fld.preprocess(value) @@ -124,7 +124,7 @@ def test_field_pickle_tokenized(value, store_raw, sequential, assert tokenized_value == expected_tokenized_value assert loaded_fld.name == "F" assert loaded_fld.store_as_raw == store_raw - assert loaded_fld.sequential == sequential + assert loaded_fld.is_sequential 
== is_sequential @pytest.mark.parametrize( @@ -141,7 +141,7 @@ def test_field_use_vocab(vocab, expected_value): @pytest.mark.parametrize( - "use_vocab, sequential, expected_vocab_values", + "use_vocab, is_sequential, expected_vocab_values", [ (False, False, []), (False, True, []), @@ -149,10 +149,10 @@ def test_field_use_vocab(vocab, expected_value): (True, True, ["some", "text"]), ] ) -def test_field_update_vocab(use_vocab, sequential, expected_vocab_values): +def test_field_update_vocab(use_vocab, is_sequential, expected_vocab_values): vocab = MockVocab() f = Field(name="F", vocab=vocab if use_vocab else None, - tokenize=sequential) + tokenize=is_sequential) raw_value = "some text" tokenized_value = ["some", "text"] diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 86fb0ba4..7a174e99 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -1,5 +1,4 @@ import random -import copy from test.storage.conftest import ( create_tabular_dataset_from_json, tabular_dataset_fields, TABULAR_TEXT) @@ -115,6 +114,34 @@ def test_create_batch(tabular_dataset): assert y_batch.rating.shape[0] == batch_size +@pytest.mark.usefixtures("json_file_path") +def test_not_numericalizable_field(json_file_path): + class MockCustomDataClass: + + def __init__(self, data): + self.data = data + + def custom_datatype_tokenizer(data): + return MockCustomDataClass(data) + + fields = tabular_dataset_fields() + text_field = fields['text'] + non_numericalizable_field = Field("non_numericalizable_field", + tokenizer=custom_datatype_tokenizer, + is_numericalizable=False) + + fields['text'] = (text_field, non_numericalizable_field) + + dataset = create_tabular_dataset_from_json(fields, json_file_path) + dataset.finalize_fields() + + for x_batch, _ in Iterator(dataset, batch_size=len(dataset)): + assert isinstance(x_batch.non_numericalizable_field, (list, tuple)) + for batch_data, real_data in zip(x_batch.non_numericalizable_field, TABULAR_TEXT): + assert isinstance(batch_data, MockCustomDataClass) + assert batch_data.data == real_data + + @pytest.mark.usefixtures("tabular_dataset") def test_lazy_numericalization_caching(tabular_dataset): tabular_dataset.finalize_fields() @@ -339,44 +366,6 @@ def np_arrays_equal(arr_1, arr_2): return arrs_equal -@pytest.mark.usefixtures("tabular_dataset") -def test_batch_as_vector_list(tabular_dataset): - tabular_dataset.finalize_fields() - text_vocab = tabular_dataset.field_dict["text"].vocab - - # case where we have both input and target fields - iterator = Iterator(tabular_dataset, batch_size=3, batch_to_matrix=False) - - example_index = 0 - for x_batch, y_batch in iterator: - assert isinstance(x_batch.text, list) - assert isinstance(y_batch.rating, list) - - for x, y in zip(x_batch.text, y_batch.rating): - example = tabular_dataset[example_index] - assert all(x == text_vocab.numericalize(example.text[1])) - assert y == [example.rating[0]] - example_index += 1 - - # case where we have only input fields - tabular_dataset = copy.deepcopy(tabular_dataset) - tabular_dataset.field_dict["rating"].is_target = False - - iterator = Iterator(tabular_dataset, batch_size=3, batch_to_matrix=False) - - example_index = 0 - for x_batch, y_batch in iterator: - assert isinstance(x_batch.text, list) - assert isinstance(x_batch.rating, list) - assert not y_batch - - for example_text, example_rating in zip(x_batch.text, x_batch.rating): - example = tabular_dataset[example_index] - assert all(example_text == text_vocab.numericalize(example.text[1])) - assert 
example_rating == [example.rating[0]] - example_index += 1 - - @pytest.fixture() def hierarchical_dataset_fields(): name_field = Field(name="name", store_as_raw=True, tokenize=False, vocab=Vocab()) From 7c6740c63ec1f99f887df4b60203f333036300ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Thu, 20 Feb 2020 18:59:30 +0100 Subject: [PATCH 02/25] WIP: TfIdfVectorizer update pending --- takepod/datasets/iterator.py | 25 ++++++++++++++---- takepod/examples/ner_example.py | 2 +- takepod/storage/field.py | 35 ++++++++++++++++++------- takepod/storage/vocab.py | 9 +++---- test/storage/conftest.py | 7 ++--- test/storage/test_field.py | 46 ++++++++++++++++++++++++++------- test/storage/test_iterator.py | 33 +++++++++++++++++------ test/storage/test_vocab.py | 4 +-- 8 files changed, 119 insertions(+), 42 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index ddafed76..eb924593 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -242,6 +242,8 @@ def _create_batch(self, examples): for field in self._dataset.fields: if field.is_numericalizable: + # If this field is numericalizable, generate a possibly padded matrix + # the length to which all the rows are padded (or truncated) pad_length = Iterator._get_pad_length(field, examples) @@ -257,15 +259,24 @@ def _create_batch(self, examples): for i, example in enumerate(examples): # Get cached value - row = field.get_numericalization_for_example(example) + data = field.get_numericalization_for_example(example) + + if data is None: + # If data is missing, fill row with missing data symbol indexes + missing_data_symbol_index = field.get_default_value() + # TODO cache missing data + # row for batch to avoid multiple instantiations? + row = np.full(pad_length, missing_data_symbol_index) + + else: + row = data + if should_pad: + row = field.pad_to_length(row, pad_length) if matrix is None: # Create matrix of the correct dtype matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) - if should_pad: - row = field.pad_to_length(row, pad_length) - # set the matrix row to the numericalized, padded array matrix[i] = row @@ -301,7 +312,11 @@ def _get_pad_length(field, examples): # examples in the batch def length_of_field(example): _, tokens = getattr(example, field.name) - return len(tokens) + if tokens is None: + # missing data + return 1 + else: + return len(tokens) return max(map(length_of_field, examples)) diff --git a/takepod/examples/ner_example.py b/takepod/examples/ner_example.py index d71143bc..eb80ab52 100644 --- a/takepod/examples/ner_example.py +++ b/takepod/examples/ner_example.py @@ -127,7 +127,7 @@ def ner_croatian_blcc_example(fields, dataset, batch_transform_function): x_test, y_test = batch_transform_function(*next(test_iter.__iter__())) prediction = model.predict(X=x_test)[BLCCModel.PREDICTION_KEY] - pad_symbol = fields['labels'].vocab.pad_symbol() + pad_symbol = fields['labels'].vocab.pad_symbol_index() prediction_filtered, y_test_filtered = filter_out_padding( pad_symbol, prediction, diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 4f24474d..b92d16ef 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -6,6 +6,7 @@ import numpy as np from takepod.preproc.tokenizers import get_tokenizer +from takepod.storage.vocab import SpecialVocabSymbols _LOGGER = logging.getLogger(__name__) @@ -584,14 +585,25 @@ def get_default_value(self): Returns ------- - missing : iterable - empty numpy array if the field is sequential or 
numpy array with one + missing_symbol index or None + The index of the missing data token, if this field is numericalizable. None value otherwise. + + Raises + ------ + ValueError + If missing data is not allowed in this field. """ - if self.is_sequential: - return np.empty(0) + if not self.allow_missing_data: + error_msg = "Missing data not allowed in field {}".format(self.name) + _LOGGER.error(error_msg) + raise ValueError(error_msg) - return np.array([np.nan]) + if self.is_numericalizable: + return -1 + + else: + return None def numericalize(self, data): """Numericalize the already preprocessed data point based either on @@ -608,8 +620,13 @@ def numericalize(self, data): Returns ------- numpy array - Array of stoi indexes of the tokens. + Array of stoi indexes of the tokens, if data exists. + None, if data is missing and missing data is allowed. + Raises + ------ + ValueError + If data is None and missing data is not allowed in this field. """ raw, tokenized = data @@ -620,7 +637,7 @@ def numericalize(self, data): raise ValueError(error_msg) else: - return self.get_default_value() + return None # raw data is just a string, so we need to wrap it into an iterable tokens = tokenized if self.tokenize or self.store_as_tokenized else [raw] @@ -673,7 +690,7 @@ def pad_to_length(self, row, length, custom_pad_symbol=None, # padding if self.use_vocab: - pad_symbol = self.vocab.pad_symbol() + pad_symbol = self.vocab.pad_symbol_index() else: pad_symbol = custom_pad_symbol @@ -873,7 +890,7 @@ def finalize(self): if self.use_vocab and len(self.vocab) > self.num_of_classes: error_msg = "Number of classes in data is greater than the declared number " \ - "of classes. Declared: {}, Actual: {}"\ + "of classes. Declared: {}, Actual: {}" \ .format(self.num_of_classes, len(self.vocab)) _LOGGER.error(error_msg) raise ValueError(error_msg) diff --git a/takepod/storage/vocab.py b/takepod/storage/vocab.py index 58baf35c..f699abf4 100644 --- a/takepod/storage/vocab.py +++ b/takepod/storage/vocab.py @@ -174,22 +174,21 @@ def get_freqs(self): raise RuntimeError(error_msg) return self._freqs - def pad_symbol(self): + def pad_symbol_index(self): """Method returns padding symbol index. Returns ------- pad_symbol_index : int - padding symbol index in the vocabullary + padding symbol index in the vocabulary Raises ------ ValueError - if the padding symbol is not pressent in the vocabulary + if the padding symbol is not present in the vocabulary. """ if SpecialVocabSymbols.PAD not in self.stoi: - error_msg = "Padding symbol is not in the vocabulary so" \ - " pad_symbol function raises exception." + error_msg = "Padding symbol is not in the vocabulary." 
_LOGGER.error(error_msg) raise ValueError(error_msg) return self.stoi[SpecialVocabSymbols.PAD] diff --git a/test/storage/conftest.py b/test/storage/conftest.py index fb02ea3b..c9ede150 100644 --- a/test/storage/conftest.py +++ b/test/storage/conftest.py @@ -60,7 +60,8 @@ def tabular_dataset(json_file_path): @pytest.fixture() def tabular_dataset_fields(fixed_length=None): - text = Field('text', eager=True, vocab=Vocab(), fixed_length=fixed_length) + text = Field('text', eager=True, vocab=Vocab(), + fixed_length=fixed_length, allow_missing_data=True) rating = Field('rating', tokenize=False, eager=False, is_target=True, custom_numericalize=float) @@ -73,10 +74,10 @@ def tabular_dataset_fields(fixed_length=None): "a b c", "a", "a b c d", - "a", + None, "d b", "d c g", - "b b b b b b", + "b b b b b b" ) TABULAR_RATINGS = (2.5, 3.2, 1.1, 2.1, 5.4, 2.8, 1.9) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index c19d3ca7..9db153b2 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -34,7 +34,7 @@ def __init__(self): self.finalized = False self.numericalized = False - def pad_symbol(self): + def pad_symbol_index(self): return PAD_NUM def __add__(self, values): @@ -618,12 +618,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): @@ -688,8 +688,8 @@ def test_missing_values_default_sequential(): assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")])) -def test_missing_values_default_not_sequential(): - fld = Field(name="bla", +def test_missing_values_custom_numericalize(): + fld = Field(name="test_field", store_as_raw=True, tokenize=False, custom_numericalize=int, @@ -703,11 +703,39 @@ def test_missing_values_default_not_sequential(): fld.finalize() - assert np.allclose(fld.numericalize(data_missing), np.array([np.nan]), - equal_nan=True) + assert fld.numericalize(data_missing) is None assert np.all(fld.numericalize(data_exists) == np.array([404])) +def test_missing_symbol_index_vocab(): + vocab = Vocab() + fld = Field(name="test_field", + tokenizer='split', + store_as_raw=False, + tokenize=True, + vocab=vocab, + allow_missing_data=True) + + fld.preprocess("a b c d") + ((_, data),) = fld.preprocess(None) + assert data == (None, None) + + fld.finalize() + assert fld.numericalize((None, None)) is None + assert fld.get_default_value() == -1 + + +def test_missing_symbol_index_custom_numericalize(): + fld = Field(name="test_field", + store_as_raw=True, + tokenize=False, + custom_numericalize=int, + allow_missing_data=True) + + fld.finalize() + assert fld.get_default_value() == -1 + + def test_missing_values_fail(): fld = Field(name="bla", store_as_raw=True, diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 7a174e99..622a6dca 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -29,7 +29,7 @@ def test_len(batch_size, expected_len, tabular_dataset): iterator = Iterator(dataset=tabular_dataset, batch_size=batch_size) - assert expected_len == len(iterator) + assert len(iterator) == expected_len @pytest.mark.parametrize( @@ -57,9 +57,13 @@ def test_padding(fixed_length, expected_shape, json_file_path): assert 
input_batch.text.shape == expected_shape - pad_symbol = fields["text"].vocab.pad_symbol() + pad_symbol = fields["text"].vocab.pad_symbol_index() for i, row in enumerate(input_batch.text): + if TABULAR_TEXT[i] is None: + # if missing data + continue + n_el = len(TABULAR_TEXT[i].split()) assert (row[:n_el].astype(np.int32) != pad_symbol).all() @@ -128,7 +132,8 @@ def custom_datatype_tokenizer(data): text_field = fields['text'] non_numericalizable_field = Field("non_numericalizable_field", tokenizer=custom_datatype_tokenizer, - is_numericalizable=False) + is_numericalizable=False, + allow_missing_data=True) fields['text'] = (text_field, non_numericalizable_field) @@ -137,9 +142,12 @@ def custom_datatype_tokenizer(data): for x_batch, _ in Iterator(dataset, batch_size=len(dataset)): assert isinstance(x_batch.non_numericalizable_field, (list, tuple)) - for batch_data, real_data in zip(x_batch.non_numericalizable_field, TABULAR_TEXT): - assert isinstance(batch_data, MockCustomDataClass) - assert batch_data.data == real_data + for i, batch_data, real_data in zip(range(len(dataset)), x_batch.non_numericalizable_field, TABULAR_TEXT): + if i == 3: + assert batch_data is None + else: + assert isinstance(batch_data, MockCustomDataClass) + assert batch_data.data == real_data @pytest.mark.usefixtures("tabular_dataset") @@ -171,7 +179,10 @@ def test_sort_key(tabular_dataset): def text_len_sort_key(example): tokens = example.text[1] - return len(tokens) + if tokens is None: + return 0 + else: + return len(tokens) iterator = Iterator(dataset=tabular_dataset, batch_size=2, sort_key=text_len_sort_key, shuffle=False) @@ -283,8 +294,14 @@ def test_shuffle_random_state_exception(tabular_dataset): def text_len_key(example): - return len(example.text[1]) + if example.text[1] is None: + return 0 + else: + return len(example.text[1]) + +def test_iterator_missing_data_in_batch(): + pass @pytest.mark.parametrize( "look_ahead_multiplier, expected_row_lengths, bucket_sort_key, sort_key", diff --git a/test/storage/test_vocab.py b/test/storage/test_vocab.py index a6c18213..c8c74f34 100644 --- a/test/storage/test_vocab.py +++ b/test/storage/test_vocab.py @@ -100,7 +100,7 @@ def test_empty_specials_get_pad_symbol(): voc = vocab.Vocab(specials=[]) voc.finalize() with pytest.raises(ValueError): - voc.pad_symbol() + voc.pad_symbol_index() def test_empty_specials_stoi(): @@ -116,7 +116,7 @@ def test_specials_get_pad_symbol(): voc = vocab.Vocab(specials=(vocab.SpecialVocabSymbols.PAD,)) data = ["tree", "plant", "grass"] voc = (voc + set(data)) - assert voc.pad_symbol() == 0 + assert voc.pad_symbol_index() == 0 voc.finalize() assert voc.itos[0] == vocab.SpecialVocabSymbols.PAD From 553220630c4f452c97342ee9357546987a08b49b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 24 Feb 2020 12:22:22 +0100 Subject: [PATCH 03/25] Added option to define custom missing data symbol --- takepod/datasets/tabular_dataset.py | 1 + takepod/storage/field.py | 14 +++++++++----- test/storage/test_field.py | 2 +- test/storage/test_iterator.py | 19 ++++++++++++++++--- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/takepod/datasets/tabular_dataset.py b/takepod/datasets/tabular_dataset.py index 2a58d922..3013fa03 100644 --- a/takepod/datasets/tabular_dataset.py +++ b/takepod/datasets/tabular_dataset.py @@ -85,6 +85,7 @@ def __init__(self, path, format, fields, skip_header=False, # create a Dataset with lists of examples and fields super(TabularDataset, self).__init__(examples, fields, **kwargs) + 
self.finalize_fields() def create_examples(reader, format, fields, skip_header): diff --git a/takepod/storage/field.py b/takepod/storage/field.py index b92d16ef..1b078341 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -194,7 +194,8 @@ def __init__(self, custom_numericalize=None, is_target=False, fixed_length=None, - allow_missing_data=False + allow_missing_data=False, + missing_data_token=-1 ): """Create a Field from arguments. @@ -269,10 +270,12 @@ def __init__(self, is false and None is sent to be preprocessed, an ValueError will be raised. If 'allow_missing_data' is True, if a None is sent to be preprocessed, it will be stored and later numericalized properly. - If the field is sequential the numericalization of a missing data field will - be an empty numpy Array, else the numericalization will be a numpy Array - containing a single np.Nan ([np.Nan]) Default: False + missing_data_token : number + Token to use to mark batch rows as missing. If data for a field is missing, + its matrix row will be filled with this value. For non numericalizable fields, + this parameter is ignored and the value will be None. + Default: -1 Raises ------ @@ -332,6 +335,7 @@ def __init__(self, self.pretokenize_pipeline = PretokenizationPipeline() self.posttokenize_pipeline = PosttokenizationPipeline() self.allow_missing_data = allow_missing_data + self.missing_data_token = missing_data_token @property def use_vocab(self): @@ -600,7 +604,7 @@ def get_default_value(self): raise ValueError(error_msg) if self.is_numericalizable: - return -1 + return self.missing_data_token else: return None diff --git a/test/storage/test_field.py b/test/storage/test_field.py index 9db153b2..af51233b 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -684,7 +684,7 @@ def test_missing_values_default_sequential(): assert data_exists == (None, ["data_string"]) fld.finalize() - assert np.all(fld.numericalize(data_missing) == np.empty(0)) + assert fld.numericalize(data_missing) is None assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")])) diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 622a6dca..2139207c 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -299,9 +299,22 @@ def text_len_key(example): else: return len(example.text[1]) - -def test_iterator_missing_data_in_batch(): - pass +@pytest.mark.usefixtures("json_file_path") +def test_iterator_missing_data_in_batch(json_file_path): + missing_data_default_value = -99 + fields = tabular_dataset_fields() + missing_value_field = Field("non_numericalizable_field", + tokenizer="split", + vocab=Vocab(), + allow_missing_data=True, + missing_data_token=missing_data_default_value) + fields['text'] = missing_value_field + ds = create_tabular_dataset_from_json(fields, json_file_path) + + for x_batch, _ in Iterator(ds, batch_size=len(ds)): + # test if the value we know is missing is correctly filled out + missing_value_row = x_batch.non_numericalizable_field[3] + assert np.all(missing_value_row == missing_data_default_value) @pytest.mark.parametrize( "look_ahead_multiplier, expected_row_lengths, bucket_sort_key, sort_key", From 280cc43ffd74ae26cf0eb9ff8d0d2ae502fae079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 24 Feb 2020 12:47:23 +0100 Subject: [PATCH 04/25] Optimized handling od missing value rows in Iterator --- takepod/datasets/iterator.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git 
a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index eb924593..f057cba0 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -266,19 +266,26 @@ def _create_batch(self, examples): missing_data_symbol_index = field.get_default_value() # TODO cache missing data # row for batch to avoid multiple instantiations? - row = np.full(pad_length, missing_data_symbol_index) + + if matrix is None: + # Create matrix of the correct dtype + matrix = np.empty(shape=(n_rows, pad_length), + dtype=type(missing_data_symbol_index)) + + matrix[i] = missing_data_symbol_index else: row = data if should_pad: row = field.pad_to_length(row, pad_length) - if matrix is None: - # Create matrix of the correct dtype - matrix = np.empty(shape=(n_rows, pad_length), dtype=row.dtype) + if matrix is None: + # Create matrix of the correct dtype + matrix = np.empty(shape=(n_rows, pad_length), + dtype=row.dtype) - # set the matrix row to the numericalized, padded array - matrix[i] = row + # set the matrix row to the numericalized, padded array + matrix[i] = row batch_feature = matrix From 2f505b27c85a5baa4bd686ef1043343a285d2b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Thu, 27 Feb 2020 17:22:19 +0100 Subject: [PATCH 05/25] Made TfIdf vectorizer not support fields with missing data --- takepod/storage/field.py | 1 - takepod/storage/vectorizers/tfidf.py | 8 ++++++++ test/storage/conftest.py | 19 ++++++++++++++++--- test/storage/test_field.py | 8 ++++---- test/storage/test_iterator.py | 20 ++++++++++++-------- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 1b078341..20c48686 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -6,7 +6,6 @@ import numpy as np from takepod.preproc.tokenizers import get_tokenizer -from takepod.storage.vocab import SpecialVocabSymbols _LOGGER = logging.getLogger(__name__) diff --git a/takepod/storage/vectorizers/tfidf.py b/takepod/storage/vectorizers/tfidf.py index e1838fde..6b42d782 100644 --- a/takepod/storage/vectorizers/tfidf.py +++ b/takepod/storage/vectorizers/tfidf.py @@ -153,6 +153,14 @@ def fit(self, dataset, field): "or by providing field with a non-empty vocab property." 
_LOGGER.error(error_msg) raise ValueError(error_msg) + + if field and field.allow_missing_data: + error_msg = "CountVectorizer doesn't support fields that " \ + "contain missing data: " \ + "{}, field: {}".format(str(dataset), str(field)) + _LOGGER.error(error_msg) + raise ValueError(error_msg) + self._vocab = field.vocab if self._vocab is None else self._vocab self._init_special_indexes() self._fitted = True diff --git a/test/storage/conftest.py b/test/storage/conftest.py index c9ede150..e612d523 100644 --- a/test/storage/conftest.py +++ b/test/storage/conftest.py @@ -61,16 +61,18 @@ def tabular_dataset(json_file_path): @pytest.fixture() def tabular_dataset_fields(fixed_length=None): text = Field('text', eager=True, vocab=Vocab(), - fixed_length=fixed_length, allow_missing_data=True) + fixed_length=fixed_length, allow_missing_data=False) + text_missing = Field('text_with_missing_data', eager=True, vocab=Vocab(), + fixed_length=fixed_length, allow_missing_data=True) rating = Field('rating', tokenize=False, eager=False, is_target=True, custom_numericalize=float) - fields = {"text": text, "rating": rating} + fields = {"text": text, "text_with_missing_data": text_missing, "rating": rating} return fields -TABULAR_TEXT = ( +TABULAR_TEXT_WITH_MISSING = ( "a b c", "a", "a b c d", @@ -80,6 +82,16 @@ def tabular_dataset_fields(fixed_length=None): "b b b b b b" ) +TABULAR_TEXT = ( + "a b c", + "a", + "a b c d", + "a", + "d b", + "d c g", + "b b b b b b" +) + TABULAR_RATINGS = (2.5, 3.2, 1.1, 2.1, 5.4, 2.8, 1.9) @@ -87,6 +99,7 @@ def tabular_dataset_fields(fixed_length=None): def tabular_data(): return { "text": TABULAR_TEXT, + "text_with_missing_data": TABULAR_TEXT_WITH_MISSING, "rating": TABULAR_RATINGS, } diff --git a/test/storage/test_field.py b/test/storage/test_field.py index af51233b..761475e5 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -618,12 +618,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 2139207c..6785ca4c 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -129,20 +129,22 @@ def custom_datatype_tokenizer(data): return MockCustomDataClass(data) fields = tabular_dataset_fields() - text_field = fields['text'] + text_field = fields['text_with_missing_data'] non_numericalizable_field = Field("non_numericalizable_field", tokenizer=custom_datatype_tokenizer, is_numericalizable=False, allow_missing_data=True) - fields['text'] = (text_field, non_numericalizable_field) + fields['text_with_missing_data'] = (text_field, non_numericalizable_field) dataset = create_tabular_dataset_from_json(fields, json_file_path) dataset.finalize_fields() for x_batch, _ in Iterator(dataset, batch_size=len(dataset)): assert isinstance(x_batch.non_numericalizable_field, (list, tuple)) - for i, batch_data, real_data in zip(range(len(dataset)), x_batch.non_numericalizable_field, TABULAR_TEXT): + for i, batch_data, real_data in zip( + range(len(dataset)), x_batch.non_numericalizable_field, TABULAR_TEXT + ): if i == 3: assert batch_data is None else: @@ -299,16 +301,17 @@ def text_len_key(example): 
else: return len(example.text[1]) + @pytest.mark.usefixtures("json_file_path") def test_iterator_missing_data_in_batch(json_file_path): missing_data_default_value = -99 fields = tabular_dataset_fields() missing_value_field = Field("non_numericalizable_field", - tokenizer="split", - vocab=Vocab(), - allow_missing_data=True, - missing_data_token=missing_data_default_value) - fields['text'] = missing_value_field + tokenizer="split", + vocab=Vocab(), + allow_missing_data=True, + missing_data_token=missing_data_default_value) + fields['text_with_missing_data'] = missing_value_field ds = create_tabular_dataset_from_json(fields, json_file_path) for x_batch, _ in Iterator(ds, batch_size=len(ds)): @@ -316,6 +319,7 @@ def test_iterator_missing_data_in_batch(json_file_path): missing_value_row = x_batch.non_numericalizable_field[3] assert np.all(missing_value_row == missing_data_default_value) + @pytest.mark.parametrize( "look_ahead_multiplier, expected_row_lengths, bucket_sort_key, sort_key", [ From cb833dcd676c720f0cdd8025089dea8e27b45b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Fri, 28 Feb 2020 12:53:30 +0100 Subject: [PATCH 06/25] Added missing data support to subclasses of Field --- takepod/storage/field.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 692289db..74eb672d 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -796,7 +796,8 @@ def __init__(self, vocab=None, eager=True, custom_numericalize=None, - allow_missing_data=False + allow_missing_data=False, + missing_data_token=-1 ): if vocab is not None and vocab.has_specials: error_msg = "Vocab contains special symbols." \ @@ -816,9 +817,8 @@ def __init__(self, custom_numericalize=custom_numericalize, is_target=True, fixed_length=1, - allow_missing_data=allow_missing_data - # TODO add default missing value token when merged - # with missing value branch + allow_missing_data=allow_missing_data, + missing_data_token=missing_data_token ) @@ -835,7 +835,8 @@ def __init__(self, custom_numericalize=None, is_target=False, fixed_length=None, - allow_missing_data=False): + allow_missing_data=False, + missing_data_token=-1): super().__init__( name=name, vocab=vocab, @@ -846,7 +847,8 @@ def __init__(self, custom_numericalize=custom_numericalize, is_target=is_target, fixed_length=fixed_length, - allow_missing_data=allow_missing_data + allow_missing_data=allow_missing_data, + missing_data_token=missing_data_token ) @@ -860,8 +862,9 @@ def __init__(self, num_of_classes=None, vocab=None, eager=True, + custom_numericalize=None, allow_missing_data=False, - custom_numericalize=None): + missing_data_token=-1): """Create a MultilabelField from arguments. 
Parameters @@ -921,7 +924,8 @@ def __init__(self, custom_numericalize=custom_numericalize, is_target=True, fixed_length=num_of_classes, - allow_missing_data=allow_missing_data) + allow_missing_data=allow_missing_data, + missing_data_token=missing_data_token) def finalize(self): super().finalize() From 00dc2fca5babf4f432b6b8cf5150e0b7532d3607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 19:29:34 +0100 Subject: [PATCH 07/25] Fixed a test --- test/storage/test_iterator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py index 09d30eb8..89ae8b2e 100644 --- a/test/storage/test_iterator.py +++ b/test/storage/test_iterator.py @@ -306,7 +306,7 @@ def text_len_key(example): def test_iterator_missing_data_in_batch(json_file_path): missing_data_default_value = -99 fields = tabular_dataset_fields() - missing_value_field = Field("non_numericalizable_field", + missing_value_field = Field("missing_value_field", tokenizer="split", vocab=Vocab(), allow_missing_data=True, @@ -314,9 +314,9 @@ def test_iterator_missing_data_in_batch(json_file_path): fields['text_with_missing_data'] = missing_value_field ds = create_tabular_dataset_from_json(fields, json_file_path) - for x_batch, _ in Iterator(ds, batch_size=len(ds)): + for x_batch, _ in Iterator(ds, batch_size=len(ds), shuffle=False): # test if the value we know is missing is correctly filled out - missing_value_row = x_batch.non_numericalizable_field[3] + missing_value_row = x_batch.missing_value_field[3] assert np.all(missing_value_row == missing_data_default_value) From c4a11849571c4c6355cb322b300ece3c5ae56e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 20:42:50 +0100 Subject: [PATCH 08/25] Added custom padding token to field for use with custom_numericalize --- takepod/storage/field.py | 13 ++++++++++++- test/storage/test_field.py | 21 +++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 74eb672d..003a872f 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -191,6 +191,7 @@ def __init__(self, eager=True, is_numericalizable=True, custom_numericalize=None, + custom_numericalize_padding_token=-1, is_target=False, fixed_length=None, allow_missing_data=False, @@ -253,10 +254,13 @@ def __init__(self, on custom datatypes. For non-numericalizable fields, Iterator will generate batch fields containing lists of these custom data type instances returned by the tokenizer. - custom_numericalize : callable The numericalization function that will be called if the field doesn't use a vocabulary. + custom_numericalize_padding_token : int + If custom_numericalize is provided and padding the batch matrix is needed, + this token is used to pad the end of the matrix row. + If custom_numericalize is None, this is ignored. is_target : bool Whether this field is a target variable. Affects iteration over batches. Default: False. 
@@ -331,6 +335,7 @@ def __init__(self, self.tokenizer = None self.custom_numericalize = custom_numericalize + self.custom_numericalize_padding_token = custom_numericalize_padding_token self.is_target = is_target self.fixed_length = fixed_length @@ -698,6 +703,10 @@ def pad_to_length(self, row, length, custom_pad_symbol=None, if self.use_vocab: pad_symbol = self.vocab.pad_symbol_index() + + elif self.custom_numericalize: + pad_symbol = self.custom_numericalize_padding_token + else: pad_symbol = custom_pad_symbol @@ -833,6 +842,7 @@ def __init__(self, vocab=None, eager=True, custom_numericalize=None, + custom_numericalize_padding_token=-1, is_target=False, fixed_length=None, allow_missing_data=False, @@ -845,6 +855,7 @@ def __init__(self, store_as_tokenized=True, eager=eager, custom_numericalize=custom_numericalize, + custom_numericalize_padding_token=custom_numericalize_padding_token, is_target=is_target, fixed_length=fixed_length, allow_missing_data=allow_missing_data, diff --git a/test/storage/test_field.py b/test/storage/test_field.py index fdca68bb..a0cc6ea9 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -226,6 +226,19 @@ def test_field_pad_to_length(row, length, expected_row, pad_left, assert received_row.tolist() == expected_row +def test_field_pad_custom_numericalize(): + custom_padding_token = -999 + f = Field("test_field", + custom_numericalize=int, + custom_numericalize_padding_token=custom_padding_token, + tokenizer='split') + mock_numericalization = np.array([1, 2, 3, 4]) + expected_numericalization = np.array([1, 2, 3, 4] + [custom_padding_token] * 6) + + padded = f.pad_to_length(mock_numericalization, 10, pad_left=False) + assert np.all(padded == expected_numericalization) + + @pytest.mark.parametrize( "row, length, expected_row", [ @@ -618,12 +631,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): From a51b3b48a6366be8fa3c5cc322f07edc3ed2bbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 20:43:59 +0100 Subject: [PATCH 09/25] flake8 --- test/storage/test_field.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index a0cc6ea9..e0a818bc 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -631,12 +631,12 @@ def test_multilabel_field_class_count(): @pytest.mark.parametrize("tokens, expected_numericalization", [ ( - ["class1", "class2", "class3", "class4"], - np.array([1, 1, 1, 1, 0, 0]) + ["class1", "class2", "class3", "class4"], + np.array([1, 1, 1, 1, 0, 0]) ), ( - [], - np.array([0, 0, 0, 0, 0, 0]) + [], + np.array([0, 0, 0, 0, 0, 0]) ) ]) def test_multilabel_field_custom_numericalization(tokens, expected_numericalization): From 5bff1b08fc1a482b93967e38c62e1b3ac9376d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 22:04:18 +0100 Subject: [PATCH 10/25] WIP, testing --- takepod/datasets/iterator.py | 8 +++---- takepod/storage/__init__.py | 4 ++-- takepod/storage/field.py | 22 +++++++++++++++++- test/storage/test_field.py | 45 ++++++++++++++++++------------------ 4 files changed, 49 
insertions(+), 30 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index 82333b7c..0af2bb75 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -268,7 +268,7 @@ def _create_batch(self, examples): matrix = None # np.empty(shape=(n_rows, pad_length)) # non-sequential fields all have length = 1, no padding necessary - should_pad = True if field.is_sequential else False + should_pad = field.is_sequential for i, example in enumerate(examples): @@ -321,14 +321,14 @@ def _create_batch(self, examples): @staticmethod def _get_pad_length(field, examples): - if not field.is_sequential: - return 1 - # the fixed_length attribute of Field has priority over the max length # of all the examples in the batch if field.fixed_length is not None: return field.fixed_length + if not field.is_sequential: + return 1 + # if fixed_length is None, then return the maximum length of all the # examples in the batch def length_of_field(example): diff --git a/takepod/storage/__init__.py b/takepod/storage/__init__.py index ee192886..44a1c04f 100644 --- a/takepod/storage/__init__.py +++ b/takepod/storage/__init__.py @@ -2,7 +2,7 @@ from .example_factory import ExampleFactory, ExampleFormat from .field import Field, TokenizedField, MultilabelField, MultioutputField, \ - unpack_fields, LabelField + unpack_fields, LabelField, SentenceEmbeddingField from .resources.downloader import (BaseDownloader, SCPDownloader, HttpDownloader, SimpleHttpDownloader) from .resources.large_resource import LargeResource, SCPLargeResource @@ -21,6 +21,6 @@ __all__ = ["BaseDownloader", "SCPDownloader", "HttpDownloader", "SimpleHttpDownloader", "Field", "TokenizedField", "LabelField", "MultilabelField", "MultioutputField", - "unpack_fields", "LargeResource", "SCPLargeResource", + "unpack_fields", "LargeResource", "SCPLargeResource", "SentenceEmbeddingField", "VectorStorage", "BasicVectorStorage", "SpecialVocabSymbols", "Vocab", "ExampleFactory", "ExampleFormat", "TfIdfVectorizer"] diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 003a872f..b179fe0f 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -2,6 +2,7 @@ import logging import itertools from collections import deque +from typing import Callable import numpy as np @@ -648,7 +649,7 @@ def numericalize(self, data): _LOGGER.error(error_msg) raise ValueError(error_msg) - else: + elif not self.custom_numericalize: return None # raw data is just a string, so we need to wrap it into an iterable @@ -960,6 +961,25 @@ def _numericalize_tokens(self, tokens): return numericalize_multihot(tokens, token_numericalize, self.num_of_classes) +class SentenceEmbeddingField(Field): + """Field used for sentence-level multidimensional embeddings.""" + + def __init__(self, + name: str, + embedding_fn: Callable[[str], np.array], + embedding_size: int): + super().__init__(name, + custom_numericalize=embedding_fn, + tokenizer=None, + language=None, + vocab=None, + tokenize=False, + store_as_raw=True, + store_as_tokenized=False, + is_target=False, + fixed_length=embedding_size, + allow_missing_data=True) + def numericalize_multihot(tokens, token_indexer, num_of_classes): active_classes = list(map(token_indexer, tokens)) multihot_encoding = np.zeros(num_of_classes, dtype=np.bool) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index e0a818bc..486b2d7b 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -5,7 +5,7 @@ from mock import patch from takepod.storage import Field, 
TokenizedField, MultilabelField, \ - Vocab, SpecialVocabSymbols, MultioutputField, LabelField + Vocab, SpecialVocabSymbols, MultioutputField, LabelField, SentenceEmbeddingField ONE_TO_FIVE = [1, 2, 3, 4, 5] @@ -690,36 +690,14 @@ def test_missing_values_default_sequential(): custom_numericalize=lambda x: hash(x), allow_missing_data=True) - _, data_missing = fld.preprocess(None)[0] _, data_exists = fld.preprocess("data_string")[0] - assert data_missing == (None, None) assert data_exists == (None, ["data_string"]) fld.finalize() - assert fld.numericalize(data_missing) is None assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")])) -def test_missing_values_custom_numericalize(): - fld = Field(name="test_field", - store_as_raw=True, - tokenize=False, - custom_numericalize=int, - allow_missing_data=True) - - _, data_missing = fld.preprocess(None)[0] - _, data_exists = fld.preprocess("404")[0] - - assert data_missing == (None, None) - assert data_exists == ("404", None) - - fld.finalize() - - assert fld.numericalize(data_missing) is None - assert np.all(fld.numericalize(data_exists) == np.array([404])) - - def test_missing_symbol_index_vocab(): vocab = Vocab() fld = Field(name="test_field", @@ -875,3 +853,24 @@ def test_label_field(): _, example = x[0] raw, _ = example assert label_field.numericalize(example) == vocab.stoi[raw] + + +def test_sentence_embedding_field(): + def mock_embedding_fn(sentence): + if sentence == "test_sentence": + return np.array([1, 2, 3, 4]) + + if sentence is None: + return np.zeros(4) + + field = SentenceEmbeddingField("test_field", + embedding_fn=mock_embedding_fn, + embedding_size=4) + + (_, data), = field.preprocess("test_sentence") + numericalization_1 = field.numericalize(data) + assert np.all(numericalization_1 == np.array([1, 2, 3, 4])) + + (_, data), = field.preprocess(None) + numericalization_2 = field.numericalize(data) + assert np.all(numericalization_2 == np.zeros(4)) From feeac4e5ead47c4aee908fc8d91107b67e5b18eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 22:10:22 +0100 Subject: [PATCH 11/25] flake8 --- takepod/storage/field.py | 1 + 1 file changed, 1 insertion(+) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index b179fe0f..7d65432b 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -980,6 +980,7 @@ def __init__(self, fixed_length=embedding_size, allow_missing_data=True) + def numericalize_multihot(tokens, token_indexer, num_of_classes): active_classes = list(map(token_indexer, tokens)) multihot_encoding = np.zeros(num_of_classes, dtype=np.bool) From c1196256ac466c77acab2d53bc46916df589956f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 18 Mar 2020 22:22:23 +0100 Subject: [PATCH 12/25] Added documentation --- takepod/storage/field.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 7d65432b..c4ba5e83 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -968,6 +968,18 @@ def __init__(self, name: str, embedding_fn: Callable[[str], np.array], embedding_size: int): + """ + Field used for sentence-level multidimensional embeddings. + + Parameters + ---------- + name: str + Field name, used for referencing data in the dataset. + embedding_fn: Callable[[str], np.array] + Callable that takes a string and returns a fixed-width embedding. + embedding_size: int + Width of the embedding. 
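+
+        Note that this field allows missing data, so embedding_fn should
+        also accept None and return a vector of width embedding_size for
+        examples whose data is missing.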
+ """ super().__init__(name, custom_numericalize=embedding_fn, tokenizer=None, From c5fa93e4eca31394ba24ba3e50c484962afa993e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 19 Feb 2020 22:52:00 +0100 Subject: [PATCH 13/25] Added per-field custom datatype support --- takepod/datasets/iterator.py | 1 - takepod/storage/field.py | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py index 061cd194..fecac261 100644 --- a/takepod/datasets/iterator.py +++ b/takepod/datasets/iterator.py @@ -257,7 +257,6 @@ def _create_batch(self, examples): for field in self._dataset.fields: if field.is_numericalizable: # If this field is numericalizable, generate a possibly padded matrix - # the length to which all the rows are padded (or truncated) pad_length = Iterator._get_pad_length(field, examples) diff --git a/takepod/storage/field.py b/takepod/storage/field.py index 0c854baf..26d0a615 100644 --- a/takepod/storage/field.py +++ b/takepod/storage/field.py @@ -248,13 +248,14 @@ def __init__(self, If true, the output of the tokenizer is presumed to be a list of tokens and will be numericalized using the provided Vocab or custom_numericalize. For numericalizable fields, Iterator will generate batch fields containing - numpy matrices. + numpy matrices. + + If false, the out of the tokenizer is presumed to be a custom datatype. + Posttokenization hooks aren't allowed to be added as they can't be called + on custom datatypes. For non-numericalizable fields, Iterator will generate + batch fields containing lists of these custom data type instances returned + by the tokenizer. - If false, the out of the tokenizer is presumed to be a custom datatype. - Posttokenization hooks aren't allowed to be added as they can't be called - on custom datatypes. For non-numericalizable fields, Iterator will generate - batch fields containing lists of these custom data type instances returned - by the tokenizer. custom_numericalize : callable The numericalization function that will be called if the field doesn't use a vocabulary. 
From 3a7efc3ff420c3569379afadd8fe5b55da0b862 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 20 Feb 2020 18:59:30 +0100
Subject: [PATCH 14/25] WIP: TfIdfVectorizer update pending

---
 takepod/storage/field.py   | 2 +-
 test/storage/conftest.py   | 2 +-
 test/storage/test_field.py | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 26d0a615..cd1cce5e 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from takepod.preproc.tokenizers import get_tokenizer
-from takepod.storage.vocab import Vocab
+from takepod.storage.vocab import Vocab, SpecialVocabSymbols
 
 _LOGGER = logging.getLogger(__name__)
 
diff --git a/test/storage/conftest.py b/test/storage/conftest.py
index e612d523..82ef9ac0 100644
--- a/test/storage/conftest.py
+++ b/test/storage/conftest.py
@@ -86,7 +86,7 @@ def tabular_dataset_fields(fixed_length=None):
         "a b c",
         "a",
         "a b c d",
-        "a",
+        None,
         "d b",
         "d c g",
         "b b b b b b"
diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index 0baf53b0..f4759641 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-        ["class1", "class2", "class3", "class4"],
-        np.array([1, 1, 1, 1, 0, 0])
+            ["class1", "class2", "class3", "class4"],
+            np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-        [],
-        np.array([0, 0, 0, 0, 0, 0])
+            [],
+            np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):

From 318a36bc3aae1c7c436c9f07ba11bbe169bd44d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Mon, 24 Feb 2020 12:22:22 +0100
Subject: [PATCH 15/25] Added option to define custom missing data symbol

---
 test/storage/test_iterator.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py
index 308546e5..c335f37e 100644
--- a/test/storage/test_iterator.py
+++ b/test/storage/test_iterator.py
@@ -301,6 +301,17 @@ def text_len_key(example):
     else:
         return len(example.text[1])
 
+@pytest.mark.usefixtures("json_file_path")
+def test_iterator_missing_data_in_batch(json_file_path):
+    missing_data_default_value = -99
+    fields = tabular_dataset_fields()
+    missing_value_field = Field("non_numericalizable_field",
+                               tokenizer="split",
+                               vocab=Vocab(),
+                               allow_missing_data=True,
+                               missing_data_token=missing_data_default_value)
+    fields['text'] = missing_value_field
+    ds = create_tabular_dataset_from_json(fields, json_file_path)
 
 @pytest.mark.usefixtures("json_file_path")
 def test_iterator_missing_data_in_batch(json_file_path):
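For reference, the option being tested above reads like this in use. Judging by the test and the commit message, rows where this column is None should numericalize to the custom token instead of raising, so batches keep a rectangular shape; this is a hedged reading, since the Field-side diff is not shown in this part of the series:

    field = Field("maybe_missing",
                  tokenizer="split",
                  vocab=Vocab(),
                  allow_missing_data=True,
                  missing_data_token=-99)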
From 41b1aa7e0c1b6c4753b4337cfc115ea05b41a596 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 27 Feb 2020 17:22:19 +0100
Subject: [PATCH 16/25] Made TfIdf vectorizer not support fields with missing data

---
 test/storage/conftest.py      |  2 +-
 test/storage/test_field.py    |  8 ++++----
 test/storage/test_iterator.py | 11 ++++++-----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/test/storage/conftest.py b/test/storage/conftest.py
index 82ef9ac0..e612d523 100644
--- a/test/storage/conftest.py
+++ b/test/storage/conftest.py
@@ -86,7 +86,7 @@ def tabular_dataset_fields(fixed_length=None):
         "a b c",
         "a",
         "a b c d",
-        None,
+        "a",
         "d b",
         "d c g",
         "b b b b b b"
diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index f4759641..0baf53b0 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-            ["class1", "class2", "class3", "class4"],
-            np.array([1, 1, 1, 1, 0, 0])
+        ["class1", "class2", "class3", "class4"],
+        np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-            [],
-            np.array([0, 0, 0, 0, 0, 0])
+        [],
+        np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):
diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py
index c335f37e..b44417bf 100644
--- a/test/storage/test_iterator.py
+++ b/test/storage/test_iterator.py
@@ -301,16 +301,17 @@ def text_len_key(example):
     else:
         return len(example.text[1])
 
+
 @pytest.mark.usefixtures("json_file_path")
 def test_iterator_missing_data_in_batch(json_file_path):
     missing_data_default_value = -99
     fields = tabular_dataset_fields()
     missing_value_field = Field("non_numericalizable_field",
-                               tokenizer="split",
-                               vocab=Vocab(),
-                               allow_missing_data=True,
-                               missing_data_token=missing_data_default_value)
-    fields['text'] = missing_value_field
+                                tokenizer="split",
+                                vocab=Vocab(),
+                                allow_missing_data=True,
+                                missing_data_token=missing_data_default_value)
+    fields['text_with_missing_data'] = missing_value_field
     ds = create_tabular_dataset_from_json(fields, json_file_path)
 
 @pytest.mark.usefixtures("json_file_path")

From dcaba7c9a3a133a9e679b0a05bea176d8e0b4e1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 19:29:34 +0100
Subject: [PATCH 17/25] Fixed a test

---
 test/storage/test_iterator.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/test/storage/test_iterator.py b/test/storage/test_iterator.py
index b44417bf..308546e5 100644
--- a/test/storage/test_iterator.py
+++ b/test/storage/test_iterator.py
@@ -302,18 +302,6 @@ def text_len_key(example):
         return len(example.text[1])
 
 
-@pytest.mark.usefixtures("json_file_path")
-def test_iterator_missing_data_in_batch(json_file_path):
-    missing_data_default_value = -99
-    fields = tabular_dataset_fields()
-    missing_value_field = Field("non_numericalizable_field",
-                                tokenizer="split",
-                                vocab=Vocab(),
-                                allow_missing_data=True,
-                                missing_data_token=missing_data_default_value)
-    fields['text_with_missing_data'] = missing_value_field
-    ds = create_tabular_dataset_from_json(fields, json_file_path)
-
 @pytest.mark.usefixtures("json_file_path")
 def test_iterator_missing_data_in_batch(json_file_path):
     missing_data_default_value = -99

From 09cbfb696debd23dec35a1bb0868aec01d55b90c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 20:42:50 +0100
Subject: [PATCH 18/25] Added custom padding token to field for use with custom_numericalize

---
 test/storage/test_field.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index 0baf53b0..f4759641 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-        ["class1", "class2", "class3", "class4"],
-        np.array([1, 1, 1, 1, 0, 0])
+            ["class1", "class2", "class3", "class4"],
+            np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-        [],
-        np.array([0, 0, 0, 0, 0, 0])
+            [],
+            np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):
From 39f322f876d965e28a94469010b19266313d2745 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 20:43:59 +0100
Subject: [PATCH 19/25] flake8

---
 test/storage/test_field.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index f4759641..0baf53b0 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -631,12 +631,12 @@ def test_multilabel_field_class_count():
 @pytest.mark.parametrize("tokens, expected_numericalization", [
     (
-            ["class1", "class2", "class3", "class4"],
-            np.array([1, 1, 1, 1, 0, 0])
+        ["class1", "class2", "class3", "class4"],
+        np.array([1, 1, 1, 1, 0, 0])
     ),
     (
-            [],
-            np.array([0, 0, 0, 0, 0, 0])
+        [],
+        np.array([0, 0, 0, 0, 0, 0])
     )
 ])
 def test_multilabel_field_custom_numericalization(tokens, expected_numericalization):
From 56fb576bb3f4fb92edf8f6ae4e68971ad06f7afb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 22:04:18 +0100
Subject: [PATCH 20/25] WIP, testing

---
 takepod/datasets/iterator.py |  8 +++----
 takepod/storage/__init__.py  |  4 ++--
 takepod/storage/field.py     | 22 +++++++++++++++++-
 test/storage/test_field.py   | 45 ++++++++++++++++++------------------
 4 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/takepod/datasets/iterator.py b/takepod/datasets/iterator.py
index fecac261..9c0e686f 100644
--- a/takepod/datasets/iterator.py
+++ b/takepod/datasets/iterator.py
@@ -267,7 +267,7 @@ def _create_batch(self, examples):
                 matrix = None  # np.empty(shape=(n_rows, pad_length))
 
                 # non-sequential fields all have length = 1, no padding necessary
-                should_pad = True if field.is_sequential else False
+                should_pad = field.is_sequential
 
                 for i, example in enumerate(examples):
 
@@ -320,14 +320,14 @@ def _create_batch(self, examples):
 
     @staticmethod
     def _get_pad_length(field, examples):
-        if not field.is_sequential:
-            return 1
-
         # the fixed_length attribute of Field has priority over the max length
         # of all the examples in the batch
         if field.fixed_length is not None:
             return field.fixed_length
 
+        if not field.is_sequential:
+            return 1
+
         # if fixed_length is None, then return the maximum length of all the
         # examples in the batch
         def length_of_field(example):
diff --git a/takepod/storage/__init__.py b/takepod/storage/__init__.py
index ee192886..44a1c04f 100644
--- a/takepod/storage/__init__.py
+++ b/takepod/storage/__init__.py
@@ -2,7 +2,7 @@
 from .example_factory import ExampleFactory, ExampleFormat
 from .field import Field, TokenizedField, MultilabelField, MultioutputField, \
-    unpack_fields, LabelField
+    unpack_fields, LabelField, SentenceEmbeddingField
 from .resources.downloader import (BaseDownloader, SCPDownloader, HttpDownloader,
                                    SimpleHttpDownloader)
 from .resources.large_resource import LargeResource, SCPLargeResource
@@ -21,6 +21,6 @@
 __all__ = ["BaseDownloader", "SCPDownloader", "HttpDownloader", "SimpleHttpDownloader",
            "Field", "TokenizedField", "LabelField", "MultilabelField", "MultioutputField",
-           "unpack_fields", "LargeResource", "SCPLargeResource",
+           "unpack_fields", "LargeResource", "SCPLargeResource", "SentenceEmbeddingField",
            "VectorStorage", "BasicVectorStorage", "SpecialVocabSymbols", "Vocab",
            "ExampleFactory", "ExampleFormat", "TfIdfVectorizer"]
diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index cd1cce5e..97413f5d 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -2,6 +2,7 @@
 import logging
 import itertools
 from collections import deque
+from typing import Callable
 
 import numpy as np
 
@@ -655,7 +656,7 @@ def numericalize(self, data):
                 _LOGGER.error(error_msg)
                 raise ValueError(error_msg)
 
-            else:
+            elif not self.custom_numericalize:
                 return None
 
         # raw data is just a string, so we need to wrap it into an iterable
@@ -970,6 +971,25 @@ def _numericalize_tokens(self, tokens):
         return numericalize_multihot(tokens, token_numericalize, self.num_of_classes)
 
 
+class SentenceEmbeddingField(Field):
+    """Field used for sentence-level multidimensional embeddings."""
+
+    def __init__(self,
+                 name: str,
+                 embedding_fn: Callable[[str], np.array],
+                 embedding_size: int):
+        super().__init__(name,
+                         custom_numericalize=embedding_fn,
+                         tokenizer=None,
+                         language=None,
+                         vocab=None,
+                         tokenize=False,
+                         store_as_raw=True,
+                         store_as_tokenized=False,
+                         is_target=False,
+                         fixed_length=embedding_size,
+                         allow_missing_data=True)
+
 def numericalize_multihot(tokens, token_indexer, num_of_classes):
     active_classes = list(map(token_indexer, tokens))
     multihot_encoding = np.zeros(num_of_classes, dtype=np.bool)
diff --git a/test/storage/test_field.py b/test/storage/test_field.py
index 0baf53b0..d44e138b 100644
--- a/test/storage/test_field.py
+++ b/test/storage/test_field.py
@@ -5,7 +5,7 @@
 from mock import patch
 
 from takepod.storage import Field, TokenizedField, MultilabelField, \
-    Vocab, SpecialVocabSymbols, MultioutputField, LabelField
+    Vocab, SpecialVocabSymbols, MultioutputField, LabelField, SentenceEmbeddingField
 
 ONE_TO_FIVE = [1, 2, 3, 4, 5]
 
@@ -690,36 +690,14 @@ def test_missing_values_default_sequential():
                 custom_numericalize=lambda x: hash(x),
                 allow_missing_data=True)
 
-    _, data_missing = fld.preprocess(None)[0]
     _, data_exists = fld.preprocess("data_string")[0]
 
-    assert data_missing == (None, None)
     assert data_exists == (None, ["data_string"])
 
     fld.finalize()
 
-    assert fld.numericalize(data_missing) is None
    assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")]))
 
 
-def test_missing_values_custom_numericalize():
-    fld = Field(name="test_field",
-                store_as_raw=True,
-                tokenize=False,
-                custom_numericalize=int,
-                allow_missing_data=True)
-
-    _, data_missing = fld.preprocess(None)[0]
-    _, data_exists = fld.preprocess("404")[0]
-
-    assert data_missing == (None, None)
-    assert data_exists == ("404", None)
-
-    fld.finalize()
-
-    assert fld.numericalize(data_missing) is None
-    assert np.all(fld.numericalize(data_exists) == np.array([404]))
-
-
 def test_missing_symbol_index_vocab():
     vocab = Vocab()
     fld = Field(name="test_field",
@@ -875,3 +853,24 @@ def test_label_field():
         _, example = x[0]
         raw, _ = example
         assert label_field.numericalize(example) == vocab.stoi[raw]
+
+
+def test_sentence_embedding_field():
+    def mock_embedding_fn(sentence):
+        if sentence == "test_sentence":
+            return np.array([1, 2, 3, 4])
+
+        if sentence is None:
+            return np.zeros(4)
+
+    field = SentenceEmbeddingField("test_field",
+                                   embedding_fn=mock_embedding_fn,
+                                   embedding_size=4)
+
+    (_, data), = field.preprocess("test_sentence")
+    numericalization_1 = field.numericalize(data)
+    assert np.all(numericalization_1 == np.array([1, 2, 3, 4]))
+
+    (_, data), = field.preprocess(None)
+    numericalization_2 = field.numericalize(data)
+    assert np.all(numericalization_2 == np.zeros(4))

From f914f8f14ed82db141c418aab119aef24300f9f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 22:10:22 +0100
Subject: [PATCH 21/25] flake8

---
 takepod/storage/field.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 97413f5d..78cf0ca0 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -990,6 +990,7 @@ def __init__(self,
                          fixed_length=embedding_size,
                          allow_missing_data=True)
 
+
 def numericalize_multihot(tokens, token_indexer, num_of_classes):
     active_classes = list(map(token_indexer, tokens))
     multihot_encoding = np.zeros(num_of_classes, dtype=np.bool)
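A note on why the _get_pad_length reordering in the WIP patch above matters: SentenceEmbeddingField passes tokenize=False, so it is presumably non-sequential, yet it sets fixed_length=embedding_size and needs a full-width row in the batch matrix. A small sketch of the effect, with f standing in for any embedding callable:

    emb = SentenceEmbeddingField("emb", embedding_fn=f, embedding_size=300)

    # Old order of checks: `if not field.is_sequential: return 1` ran
    # first, so the pad length for this field collapsed to one column.
    # New order: fixed_length (= embedding_size) wins, so each example
    # keeps its full 300-wide embedding row.
    Iterator._get_pad_length(emb, examples)  # 300 after this patch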
From 37c0ed58c4f84705b6d64ff1e09d2855145f3d8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 18 Mar 2020 22:22:23 +0100
Subject: [PATCH 22/25] Added documentation

---
 takepod/storage/field.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 78cf0ca0..db7b2972 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -978,6 +978,18 @@ def __init__(self,
                  name: str,
                  embedding_fn: Callable[[str], np.array],
                  embedding_size: int):
+        """
+        Field used for sentence-level multidimensional embeddings.
+
+        Parameters
+        ----------
+        name: str
+            Field name, used for referencing data in the dataset.
+        embedding_fn: Callable[[str], np.array]
+            Callable that takes a string and returns a fixed-width embedding.
+        embedding_size: int
+            Width of the embedding.
+        """
         super().__init__(name,
                          custom_numericalize=embedding_fn,
                          tokenizer=None,

From 19141e0377645cc7becd57db086155d4262b5c4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 2 Apr 2020 22:33:22 +0200
Subject: [PATCH 23/25] rebased to master

---
 takepod/storage/field.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index db7b2972..7e1dfef7 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from takepod.preproc.tokenizers import get_tokenizer
-from takepod.storage.vocab import Vocab, SpecialVocabSymbols
+from takepod.storage.vocab import Vocab
 
 _LOGGER = logging.getLogger(__name__)
""" super().__init__(name, custom_numericalize=embedding_fn, tokenizer=None, - language=None, - vocab=None, + language=language, + vocab=vocab, tokenize=False, store_as_raw=True, store_as_tokenized=False, - is_target=False, + is_target=is_target, fixed_length=embedding_size, - allow_missing_data=True) + allow_missing_data=allow_missing_data) def numericalize_multihot(tokens, token_indexer, num_of_classes): diff --git a/test/storage/test_field.py b/test/storage/test_field.py index d44e138b..5f7a6ea9 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -865,7 +865,8 @@ def mock_embedding_fn(sentence): field = SentenceEmbeddingField("test_field", embedding_fn=mock_embedding_fn, - embedding_size=4) + embedding_size=4, + allow_missing_data=True) (_, data), = field.preprocess("test_sentence") numericalization_1 = field.numericalize(data) From 17980ba12505168ca22fb4418e30f240f181cbbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Fri, 17 Apr 2020 15:05:46 +0200 Subject: [PATCH 25/25] merged master --- test/storage/test_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/storage/test_field.py b/test/storage/test_field.py index 35701994..8b777e78 100644 --- a/test/storage/test_field.py +++ b/test/storage/test_field.py @@ -5,7 +5,7 @@ from mock import patch from podium.storage import Field, TokenizedField, MultilabelField, \ - Vocab, SpecialVocabSymbols, MultioutputField, LabelField + Vocab, SpecialVocabSymbols, MultioutputField, LabelField, SentenceEmbeddingField ONE_TO_FIVE = [1, 2, 3, 4, 5]