Merge pull request #1513 from benoit-pierre/pr/translation_avoid_unnecessary_lookups

translation: avoid unnecessary lookups
benoit-pierre authored May 14, 2022
2 parents 38cd7b0 + 51bf53b commit 56f1770
Showing 3 changed files with 233 additions and 34 deletions.
1 change: 1 addition & 0 deletions news.d/feature/1513.core.md
@@ -0,0 +1 @@
Improve translation stage: cut down on unnecessary / duplicate dictionary lookups.
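
For context, here is a minimal standalone sketch of the idea behind the change, using a plain dict and a made-up helper name rather than Plover's actual classes or call signatures: the single-stroke lookup is performed once and its result kept as the fallback, and candidate lookups are skipped entirely once the stroke count exceeds the dictionary's longest key.

# Illustrative sketch only -- not Plover's real API.
def translate_once(definitions, previous_strokes, stroke):
    """Return the longest match ending in `stroke`, reusing the single-stroke lookup."""
    longest_key = max((len(k) for k in definitions), default=0)
    if longest_key == 0:
        return None  # empty dictionary: no lookups at all
    # Single-stroke lookup, done once and kept as the fallback.
    single = definitions.get((stroke,))
    # Multi-stroke candidates, longest first, bounded by longest_key and
    # starting at two strokes so the single stroke is never looked up twice.
    for n in range(min(longest_key, len(previous_strokes) + 1), 1, -1):
        candidate = tuple(previous_strokes[len(previous_strokes) - n + 1:]) + (stroke,)
        match = definitions.get(candidate)
        if match is not None:
            return match
    return single

With `definitions = {('TEFT',): 'test', ('TEFT', '-G'): 'testing'}`, `translate_once(definitions, ['TEFT'], '-G')` costs two lookups: `('-G',)` once, then the two-stroke candidate `('TEFT', '-G')`, which matches.
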
126 changes: 95 additions & 31 deletions plover/translation.py
@@ -292,15 +292,21 @@ def translate_stroke(self, stroke):
stroke -- The Stroke object to process.
"""
mapping = self._lookup_with_prefix(self._state.translations, [stroke])
max_len = self._dictionary.longest_key
mapping = self._lookup_with_prefix(max_len, self._state.translations, [stroke])
macro = _mapping_to_macro(mapping, stroke)
if macro is not None:
self.translate_macro(macro)
return
t = (
self._find_translation_helper(stroke) or
self._find_translation_helper(stroke, system.SUFFIX_KEYS) or
Translation([stroke], mapping)
# No prefix lookups (note we avoid looking up [stroke] again).
self._find_longest_match(2, max_len, stroke) or
# Return [stroke] result if mapped.
(mapping is not None and Translation([stroke], mapping)) or
# No direct match, try with suffixes.
self._find_longest_match(1, max_len, stroke, system.SUFFIX_KEYS) or
# Fallback to untranslate.
Translation([stroke], None)
)
self.translate_translation(t)

@@ -328,15 +334,33 @@ def _do(self, *translations):
self._state.translations.extend(translations)
self._to_do += len(translations)

def _find_translation_helper(self, stroke, suffixes=()):
def _find_longest_match(self, min_len, max_len, stroke, suffixes=()):
'''Find mapping with the longest series of strokes.
min_len -- Minimum number of strokes involved.
max_len -- Maximum number of strokes involved.
stroke -- The latest stroke.
suffixes -- List of suffix keys to try.
Return the corresponding translation, or None if no match is found.
Note: the code either looks for a direct match (empty suffix
list), or assumes the last stroke contains an implicit suffix
and looks for a corresponding match, but not both.
'''
if suffixes:
# Implicit suffix lookup, determine possible suffixes.
suffixes = self._lookup_involved_suffixes(stroke, suffixes)
if not suffixes:
# No suffix involved, abort.
return None
# Figure out how much of the translation buffer can be involved in this
# stroke and build the stroke list for translation.
num_strokes = 1
translation_count = 0
longest_key = self._dictionary.longest_key
for t in reversed(self._state.translations):
num_strokes += len(t)
if num_strokes > longest_key:
if num_strokes > max_len:
break
translation_count += 1
translation_index = len(self._state.translations) - translation_count
@@ -348,48 +372,88 @@ def _find_translation_helper(self, stroke, suffixes=()):
replaced = translations[i:]
strokes = [s for t in replaced for s in t.strokes]
strokes.append(stroke)
mapping = self._lookup_with_prefix(translations[:i], strokes, suffixes)
if len(strokes) < min_len:
continue
mapping = self._lookup_with_prefix(max_len, translations[:i], strokes, suffixes)
if mapping is not None:
t = Translation(strokes, mapping)
t.replaced = replaced
return t
return None

def _lookup_strokes(self, strokes):
'''Look for a matching translation.
strokes -- a list of Stroke instances.
Return the resulting mapping.
'''
return self._dictionary.lookup(tuple(s.rtfcre for s in strokes))

def _lookup_with_suffix(self, strokes, suffixes=()):
'''Look for a matching translation.
suffixes -- A list of (suffix stroke, suffix mapping) pairs to try.
If the suffix list is empty, look for a direct match.
Otherwise, assume the last stroke contains an implicit suffix,
and look for a corresponding match.
'''
if not suffixes:
# No suffix, do a regular lookup.
return self._lookup_strokes(strokes)
for suffix_stroke, suffix_mapping in suffixes:
assert suffix_stroke in strokes[-1]
main_mapping = self._lookup_strokes(strokes[:-1] + [strokes[-1] - suffix_stroke])
if main_mapping is not None:
return main_mapping + ' ' + suffix_mapping
return None

def _lookup_involved_suffixes(self, stroke, suffixes):
'''Find possible implicit suffixes for a stroke.
stroke -- The stroke to check for implicit suffixes.
suffixes -- List of supported suffix keys.
Return a list of (suffix_stroke, suffix_mapping) pairs.
'''
possible_suffixes = []
for suffix_stroke in map(Stroke, suffixes):
if suffix_stroke not in stroke:
continue
suffix_mapping = self._lookup_strokes((suffix_stroke,))
if suffix_mapping is None:
continue
possible_suffixes.append((suffix_stroke, suffix_mapping))
return possible_suffixes

def lookup(self, strokes, suffixes=()):
dict_key = tuple(s.rtfcre for s in strokes)
result = self._dictionary.lookup(dict_key)
result = self._lookup_strokes(strokes)
if result is not None:
return result
for key in suffixes:
if key in strokes[-1].steno_keys:
dict_key = (Stroke([key]).rtfcre,)
suffix_mapping = self._dictionary.lookup(dict_key)
if suffix_mapping is None:
continue
keys = strokes[-1].steno_keys[:]
keys.remove(key)
copy = strokes[:]
copy[-1] = Stroke(keys)
dict_key = tuple(s.rtfcre for s in copy)
main_mapping = self._dictionary.lookup(dict_key)
if main_mapping is None:
continue
return main_mapping + ' ' + suffix_mapping
return None
suffixes = self._lookup_involved_suffixes(strokes[-1], suffixes)
if not suffixes:
return None
return self._lookup_with_suffix(strokes, suffixes)

def _previous_word_is_finished(self, last_translations):
@staticmethod
def _previous_word_is_finished(last_translations):
if not last_translations:
return True
formatting = last_translations[-1].formatting
if not formatting:
return True
return formatting[-1].word_is_finished

def _lookup_with_prefix(self, last_translations, strokes, suffixes=()):
if self._previous_word_is_finished(last_translations):
mapping = self.lookup([Stroke.PREFIX_STROKE] + strokes, suffixes)
def _lookup_with_prefix(self, max_len, last_translations, strokes, suffixes=()):
if len(strokes) < max_len and self._previous_word_is_finished(last_translations):
mapping = self._lookup_with_suffix([Stroke.PREFIX_STROKE] + strokes, suffixes)
if mapping is not None:
return mapping
return self.lookup(strokes, suffixes)
if len(strokes) <= max_len:
return self._lookup_with_suffix(strokes, suffixes)
return None


class _State:
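
To make the suffix handling above concrete, here is a rough sketch of the same idea with plain-dict stand-ins (strokes modeled as frozensets of key names; these are not Plover's `StenoDictionaryCollection` or `Stroke` types): the possible suffixes of the latest stroke are determined once, and each outline tried afterwards only pays for the stripped-outline lookups.

# Illustrative sketch only: strokes are frozensets of key names and
# `definitions` is a plain dict keyed by tuples of such strokes.
SUFFIX_KEYS = ('-Z', '-D', '-S', '-G')

def involved_suffixes(definitions, stroke):
    """Suffix keys that are both present in `stroke` and defined on their own."""
    candidates = []
    for key in SUFFIX_KEYS:
        if key not in stroke:
            continue
        mapping = definitions.get((frozenset({key}),))
        if mapping is not None:
            candidates.append((key, mapping))
    return candidates

def lookup_with_suffix(definitions, outline, suffixes):
    """Try `outline` with each candidate suffix stripped from its last stroke."""
    for key, suffix_mapping in suffixes:
        stripped = outline[:-1] + (outline[-1] - {key},)
        main_mapping = definitions.get(stripped)
        if main_mapping is not None:
            return main_mapping + ' ' + suffix_mapping
    return None

Because the candidate list is computed once per new stroke, trying shorter and shorter outlines afterwards costs at most one lookup per surviving candidate, which is what the new test_lookup_suffixes_once below checks: -Z, -S and -G are each looked up a single time.
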
140 changes: 137 additions & 3 deletions test/test_translation.py
@@ -4,6 +4,7 @@
"""Unit tests for translation.py."""

from collections import namedtuple
import ast
import copy
import operator

@@ -416,15 +417,18 @@ def test_restrict_size_multiple_translations(self):

class TestTranslateStroke:

DICT_COLLECTION_CLASS = StenoDictionaryCollection

class CaptureOutput:
output = namedtuple('output', 'undo do prev')

Output = namedtuple('Output', 'undo do prev')

def __init__(self):
self.output = []

def __call__(self, undo, new, prev):
prev = list(prev) if prev else None
self.output = type(self).output(undo, new, prev)
self.output = self.Output(undo, new, prev)

def t(self, strokes):
"""A quick way to make a translation."""
@@ -470,7 +474,7 @@ def _check_output(self, undo, do, prev):

def setup_method(self):
self.d = StenoDictionary()
self.dc = StenoDictionaryCollection([self.d])
self.dc = self.DICT_COLLECTION_CLASS([self.d])
self.s = _State()
self.o = self.CaptureOutput()
self.tlor = Translator()
@@ -808,3 +812,133 @@ def test_untranslate_translation(self):
def test_escape_unescape_translation(raw, escaped):
assert unescape_translation(escaped) == raw
assert escape_translation(raw) == escaped


class TestNoUnnecessaryLookups(TestTranslateStroke):

# Custom dictionary collection class for tracking lookups.
class DictTracy(StenoDictionaryCollection):

def __init__(self, dicts):
super().__init__(dicts)
self.lookup_history = []

def lookup(self, key):
self.lookup_history.append(key)
return super().lookup(key)

DICT_COLLECTION_CLASS = DictTracy

def _prepare_state(self, definitions, translations):
if definitions:
for steno, english in ast.literal_eval('{' + definitions + '}').items():
self.define(steno, english)
translations = self.lt(translations)
for t in translations:
for s in t.strokes:
self.translate(s.rtfcre)
state = translations[len(translations)-self.dc.longest_key:]
self._check_translations(state)
self.dc.lookup_history.clear()

def _check_lookup_history(self, expected):
# Hide from traceback on assertions (reduce output size for failed tests).
__tracebackhide__ = operator.methodcaller('errisinstance', AssertionError)
result = ['/'.join(key) for key in self.dc.lookup_history]
expected = expected.split()
msg = '''
lookup history:
results: %s
expected: %s
''' % (result, expected)
assert result == expected, msg

def test_zero_lookups(self):
# No lookups at all if longest key is zero.
self.translate('TEFT')
self._check_lookup_history('')
self._check_translations(self.lt('TEFT'))

def test_no_prefix_lookup_over_the_longest_key_limit(self):
self._prepare_state(
'''
"HROPBG/EFT/KAOE": "longest key",
"HRETS": "let's",
"TKO": "do",
"SPH": "some",
"TEFT": "test",
"-G": "{^ing}",
''',
'HRETS TKO SPH TEFT')
self.translate('-G')
self._check_lookup_history(
# Macros.
'''
/-G
-G
'''
# Others.
'''
SPH/TEFT/-G
/TEFT/-G TEFT/-G
'''
)

def test_no_duplicate_lookups_for_longest_no_suffix_match(self):
self._prepare_state(
'''
"TEFT": "test",
"-G": "{^ing}",
''',
'TEFT')
self.translate('TEFGT')
self._check_lookup_history(
# Macros.
'''
TEFGT
'''
# No suffix.
'''
'''
# With suffix.
'''
-G TEFT
'''
)

def test_lookup_suffixes_once(self):
self._prepare_state(
'''
"HROPBG/EFT/KAOE": "longest key",
"HRETS": "let's",
"TEFT": "test",
"SPH": "some",
"SUFBGS": "suffix",
"-G": "{^ing}",
"-S": "{^s}",
"-D": "{^ed}",
"-Z": "{^s}",
''',
'HRETS TEFT SPH')
self.translate('SUFBGSZ')
self._check_lookup_history(
# Macros.
'''
/SUFBGSZ
SUFBGSZ
'''
# Without suffix.
'''
TEFT/SPH/SUFBGSZ
/SPH/SUFBGSZ
SPH/SUFBGSZ
'''
# Suffix lookups.
'''
-Z -S -G
TEFT/SPH/SUFBGS TEFT/SPH/SUFBGZ TEFT/SPH/SUFBSZ
/SPH/SUFBGS /SPH/SUFBGZ /SPH/SUFBSZ
SPH/SUFBGS SPH/SUFBGZ SPH/SUFBSZ
/SUFBGS /SUFBGZ /SUFBSZ
SUFBGS
''')
