From 491f681c717e51c62b44e3c650699a6ba516b574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20Ba=C3=B1ados=20Schwerter?= Date: Mon, 25 Nov 2024 21:19:19 +0000 Subject: [PATCH] search_run -> search_results --- src/morphodict/frontend/views.py | 14 +++--- src/morphodict/search/affix.py | 14 +++--- src/morphodict/search/core.py | 4 +- src/morphodict/search/cvd_search.py | 8 ++-- src/morphodict/search/espt.py | 26 +++++----- src/morphodict/search/glossary_count.py | 4 +- src/morphodict/search/lemma_freq.py | 4 +- src/morphodict/search/lookup.py | 34 ++++++------- src/morphodict/search/pos_matches.py | 12 ++--- src/morphodict/search/presentation.py | 11 +++-- src/morphodict/search/ranking_test.py | 4 +- src/morphodict/search/runner.py | 55 +++++++++++----------- src/morphodict/tests/espt/test_espt_crk.py | 22 ++++----- 13 files changed, 107 insertions(+), 105 deletions(-) diff --git a/src/morphodict/frontend/views.py b/src/morphodict/frontend/views.py index 11bcc259a..4bb233214 100644 --- a/src/morphodict/frontend/views.py +++ b/src/morphodict/frontend/views.py @@ -91,17 +91,17 @@ def index(request): # pragma: no cover user_query = request.GET.get("q", None) dict_source = get_dict_source(request) - search_run = None + search_results = None if user_query: include_auto_definitions = should_include_auto_definitions(request) inflect_english_phrases = should_inflect_phrases(request) - search_run = search_with_affixes( + search_results = search_with_affixes( user_query, include_auto_definitions=include_auto_definitions, inflect_english_phrases=inflect_english_phrases, ) - search_results = search_run.serialized_presentation_results( + search_results_presentation = search_results.serialized_presentation_results( display_mode=DisplayMode.current_value_from_request(request), animate_emoji=AnimateEmoji.current_value_from_request(request), show_emoji=ShowEmoji.current_value_from_request(request), @@ -110,7 +110,7 @@ def index(request): # pragma: no cover did_search = True else: - search_results = [] + search_results_presentation = [] did_search = False if did_search: @@ -123,15 +123,15 @@ def index(request): # pragma: no cover word_search_form=WordSearchForm(), # when we have initial query word to search and display query_string=user_query, - search_results=search_results, + search_results=search_results_presentation, did_search=did_search, ) context["show_dict_source_setting"] = settings.SHOW_DICT_SOURCE_SETTING context["show_morphemes"] = request.COOKIES.get("show_morphemes") context["show_ic"] = request.COOKIES.get("show_inflectional_category") - if search_run and search_run.verbose_messages and search_run.query.verbose: + if search_results and search_results.verbose_messages and search_results.query.verbose: context["verbose_messages"] = json.dumps( - search_run.verbose_messages, indent=2, ensure_ascii=False + search_results.verbose_messages, indent=2, ensure_ascii=False ) return render(request, "morphodict/index.html", context) diff --git a/src/morphodict/search/affix.py b/src/morphodict/search/affix.py index 7c71ec9f2..c64004498 100644 --- a/src/morphodict/search/affix.py +++ b/src/morphodict/search/affix.py @@ -97,27 +97,27 @@ def do_affix_search(query: InternalForm, affixes: AffixSearcher) -> Iterable[Wor return Wordform.objects.filter(id__in=matched_ids) -def do_target_language_affix_search(search_run: core.SearchResults): +def do_target_language_affix_search(search_results: core.SearchResults): matching_words = do_affix_search( - search_run.internal_query, + search_results.internal_query, cache.target_language_affix_searcher, ) for word in matching_words: - search_run.add_result(Result(word, target_language_affix_match=True)) + search_results.add_result(Result(word, target_language_affix_match=True)) -def do_source_language_affix_search(search_run: core.SearchResults): +def do_source_language_affix_search(search_results: core.SearchResults): matching_words = do_affix_search( - search_run.internal_query, + search_results.internal_query, cache.source_language_affix_searcher, ) for word in matching_words: - search_run.add_result( + search_results.add_result( Result( word, source_language_affix_match=True, query_wordform_edit_distance=get_modified_distance( - word.text, search_run.internal_query + word.text, search_results.internal_query ), ) ) diff --git a/src/morphodict/search/core.py b/src/morphodict/search/core.py index 24589e755..805facad9 100644 --- a/src/morphodict/search/core.py +++ b/src/morphodict/search/core.py @@ -85,7 +85,7 @@ def presentation_results( return [ presentation.PresentationResult( r, - search_run=self, + search_results=self, display_mode=display_mode, animate_emoji=animate_emoji, show_emoji=show_emoji, @@ -129,7 +129,7 @@ def add_verbose_message(self, message=None, **messages): Protip! Use keyword arguments as syntactic sugar for adding a dictionary, e.g., - search_run.add_verbose_message(foo="bar") + search_results.add_verbose_message(foo="bar") Will appear as: diff --git a/src/morphodict/search/cvd_search.py b/src/morphodict/search/cvd_search.py index b5781108e..dd5e214cd 100644 --- a/src/morphodict/search/cvd_search.py +++ b/src/morphodict/search/cvd_search.py @@ -19,17 +19,17 @@ logger = logging.getLogger(__name__) -def do_cvd_search(search_run: SearchResults): +def do_cvd_search(search_results: SearchResults): """Use cosine vector distance to add results to the search run. Keywords from the query string are turned into vectors from Google News, added together, and then compared against pre-computed definition vectors. """ - keys = extract_keyed_words(search_run.query.query_string, google_news_vectors()) + keys = extract_keyed_words(search_results.query.query_string, google_news_vectors()) if not keys: return - search_run.add_verbose_message(cvd_extracted_keys=keys) + search_results.add_verbose_message(cvd_extracted_keys=keys) query_vector = vector_for_keys(google_news_vectors(), keys) try: @@ -71,4 +71,4 @@ def do_cvd_search(search_run: SearchResults): else: for wf in wordforms_for_query: if wordform_query_matches(wordform_query, wf): - search_run.add_result(Result(wf, cosine_vector_distance=distance)) + search_results.add_result(Result(wf, cosine_vector_distance=distance)) diff --git a/src/morphodict/search/espt.py b/src/morphodict/search/espt.py index f06fbfe30..e66574625 100644 --- a/src/morphodict/search/espt.py +++ b/src/morphodict/search/espt.py @@ -39,12 +39,12 @@ class EsptSearch: other methods. """ - def __init__(self, search_run): - self.search_run = search_run + def __init__(self, search_results): + self.search_results = search_results self.query_analyzed_ok = False - def analyze_query(self): - """Analyze this search’s search_run query, possibly updating it. + def convert_search_query_to_espt(self): + """Analyze this search’s search_results query, possibly updating it. If the phrase-parsing FST returns an analysis, e.g., “ crawls +V+AI+Prt+3Pl” for “they crawled”, then the tags are saved for @@ -53,8 +53,8 @@ def analyze_query(self): """ self.new_tags = [] analyzed_query = PhraseAnalyzedQuery( - self.search_run.internal_query, - add_verbose_message=self.search_run.add_verbose_message, + self.search_results.internal_query, + add_verbose_message=self.search_results.add_verbose_message, ) if analyzed_query.has_tags: if "+N" in analyzed_query.tags: @@ -68,13 +68,13 @@ def analyze_query(self): self.new_tags = tag_map.map_tags(analyzed_query.tags) except UnknownTagError as e: logger.error(f"Unable to map tags for {analyzed_query}", exc_info=True) - self.search_run.add_verbose_message(espt_analysis_error=repr(e)) + self.search_results.add_verbose_message(espt_analysis_error=repr(e)) return - self.search_run.query.replace_query(analyzed_query.filtered_query) + self.search_results.query.replace_query(analyzed_query.filtered_query) self.query_analyzed_ok = True - self.search_run.add_verbose_message( + self.search_results.add_verbose_message( filtered_query=analyzed_query.filtered_query, tags=analyzed_query.tags, new_tags=self.new_tags, @@ -116,10 +116,10 @@ def inflect_search_results(self): # if there are multiple inflections for the same original result, we # may already have removed it - if self.search_run.has_result(result.original_result): - self.search_run.remove_result(result.original_result) + if self.search_results.has_result(result.original_result): + self.search_results.remove_result(result.original_result) - self.search_run.add_result( + self.search_results.add_result( result.original_result.create_related_result( wordform, is_espt_result=True, @@ -128,7 +128,7 @@ def inflect_search_results(self): def _collect_non_inflected_results(self) -> list[Result]: words = [] - for r in self.search_run.unsorted_results(): + for r in self.search_results.unsorted_results(): if not r.is_lemma: continue analysis = r.wordform.analysis diff --git a/src/morphodict/search/glossary_count.py b/src/morphodict/search/glossary_count.py index 00760a1a6..fb8fa210f 100644 --- a/src/morphodict/search/glossary_count.py +++ b/src/morphodict/search/glossary_count.py @@ -5,9 +5,9 @@ DOCUMENT_FREQUENCY = {} -def get_glossary_count(search_run): +def get_glossary_count(search_results): prep_freqs() - [find_glossary_count(result) for result in search_run.unsorted_results()] + [find_glossary_count(result) for result in search_results.unsorted_results()] def prep_freqs(): diff --git a/src/morphodict/search/lemma_freq.py b/src/morphodict/search/lemma_freq.py index 5dd74013a..048a63d3a 100644 --- a/src/morphodict/search/lemma_freq.py +++ b/src/morphodict/search/lemma_freq.py @@ -20,9 +20,9 @@ def load_lemma_data(): LEMMA_FREQUENCY[l] = int(l_freq) / max -def get_lemma_freq(search_run): +def get_lemma_freq(search_results): load_lemma_data() - [find_lemma_freq(result) for result in search_run.unsorted_results()] + [find_lemma_freq(result) for result in search_results.unsorted_results()] def find_lemma_freq(result): diff --git a/src/morphodict/search/lookup.py b/src/morphodict/search/lookup.py index 79450dd68..af552f822 100644 --- a/src/morphodict/search/lookup.py +++ b/src/morphodict/search/lookup.py @@ -20,14 +20,14 @@ logger = logging.getLogger(__name__) -def fetch_results(search_run: core.SearchResults): - fetch_results_from_target_language_keywords(search_run) - fetch_results_from_source_language_keywords(search_run) +def fetch_results(search_results: core.SearchResults): + fetch_results_from_target_language_keywords(search_results) + fetch_results_from_source_language_keywords(search_results) # Use the spelling relaxation to try to decipher the query # e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" -- # thus, we can match "acâhkos" in the dictionary! - fst_analyses = set(rich_analyze_relaxed(search_run.internal_query)) + fst_analyses = set(rich_analyze_relaxed(search_results.internal_query)) # print([a.tuple for a in fst_analyses]) db_matches = list( @@ -35,12 +35,12 @@ def fetch_results(search_run: core.SearchResults): ) for wf in db_matches: - search_run.add_result( + search_results.add_result( Result( wf, source_language_match=wf.text, query_wordform_edit_distance=get_modified_distance( - wf.text, search_run.internal_query + wf.text, search_results.internal_query ), ) ) @@ -61,7 +61,7 @@ def fetch_results(search_run: core.SearchResults): logger.error( "Cannot generate normative form for analysis: %s (query: %s)", analysis, - search_run.internal_query, + search_results.internal_query, ) continue @@ -69,7 +69,7 @@ def fetch_results(search_run: core.SearchResults): # closest to what the user typed. normatized_user_query = min( normatized_form_for_analysis, - key=lambda f: get_modified_distance(f, search_run.internal_query), + key=lambda f: get_modified_distance(f, search_results.internal_query), ) possible_lemma_wordforms = best_lemma_matches( @@ -82,12 +82,12 @@ def fetch_results(search_run: core.SearchResults): raw_analysis=analysis.tuple, lemma=lemma_wordform, ) - search_run.add_result( + search_results.add_result( Result( synthetic_wordform, analyzable_inflection_match=True, query_wordform_edit_distance=get_modified_distance( - search_run.internal_query, + search_results.internal_query, normatized_user_query, ), ) @@ -136,27 +136,27 @@ def best_lemma_matches(analysis, possible_lemmas) -> list[Wordform]: ] -def fetch_results_from_target_language_keywords(search_run): - for stemmed_keyword in stem_keywords(search_run.internal_query): +def fetch_results_from_target_language_keywords(search_results): + for stemmed_keyword in stem_keywords(search_results.internal_query): for wordform in Wordform.objects.filter( target_language_keyword__text__iexact=stemmed_keyword ): - search_run.add_result( + search_results.add_result( Result(wordform, target_language_keyword_match=[stemmed_keyword]) ) -def fetch_results_from_source_language_keywords(search_run): +def fetch_results_from_source_language_keywords(search_results): res = SourceLanguageKeyword.objects.filter( - Q(text=to_source_language_keyword(search_run.internal_query)) + Q(text=to_source_language_keyword(search_results.internal_query)) ) for kw in res: - search_run.add_result( + search_results.add_result( Result( kw.wordform, source_language_keyword_match=[kw.text], query_wordform_edit_distance=get_modified_distance( - search_run.internal_query, kw.wordform.text + search_results.internal_query, kw.wordform.text ), ) ) diff --git a/src/morphodict/search/pos_matches.py b/src/morphodict/search/pos_matches.py index 7c1e2fb53..5f5ab1ead 100644 --- a/src/morphodict/search/pos_matches.py +++ b/src/morphodict/search/pos_matches.py @@ -3,14 +3,14 @@ from morphodict.analysis import rich_analyze_relaxed -def find_pos_matches(search_run: SearchResults) -> None: - analyzed_query = AnalyzedQuery(search_run.internal_query) - # print(search_run.verbose_messages["new_tags"]) +def find_pos_matches(search_results: SearchResults) -> None: + analyzed_query = AnalyzedQuery(search_results.internal_query) + # print(search_results.verbose_messages["new_tags"]) - if len(search_run.verbose_messages) <= 1: + if len(search_results.verbose_messages) <= 1: return - tags = search_run.verbose_messages[1].get("tags") - [pos_match(result, tags) for result in search_run.unsorted_results()] + tags = search_results.verbose_messages[1].get("tags") + [pos_match(result, tags) for result in search_results.unsorted_results()] def pos_match(result, tags): diff --git a/src/morphodict/search/presentation.py b/src/morphodict/search/presentation.py index 2e0413e2d..7f1363278 100644 --- a/src/morphodict/search/presentation.py +++ b/src/morphodict/search/presentation.py @@ -112,14 +112,14 @@ def __init__( self, result: types.Result, *, - search_run: core.SearchResults, + search_results: core.SearchResults, display_mode="community", animate_emoji=AnimateEmoji.default, show_emoji=ShowEmoji.default, dict_source=None, ): self._result = result - self._search_run = search_run + self._search_results = search_results self._relabeller = { "english": read_labels().english, "linguistic": read_labels().linguistic_long, @@ -216,7 +216,7 @@ def serialize(self) -> SerializedPresentationResult: # This is the only place include_auto_definitions is used, # because we only auto-translate non-lemmas, and this is the # only place where a non-lemma search result appears. - include_auto_definitions=self._search_run.include_auto_definitions, + include_auto_definitions=self._search_results.include_auto_definitions, dict_source=self.dict_source, ), "lexical_info": self.lexical_info, @@ -229,13 +229,13 @@ def serialize(self) -> SerializedPresentationResult: self.is_lemma, self.lemma_wordform, self.dict_source, - self._search_run.include_auto_definitions, + self._search_results.include_auto_definitions, ), "relevant_tags": tuple(t.serialize() for t in self.relevant_tags), "morphemes": self.morphemes, "lemma_morphemes": self.lemma_morphemes, } - if self._search_run.query.verbose: + if self._search_results.query.verbose: cast(Any, ret)["verbose_info"] = self._result return ret @@ -319,6 +319,7 @@ def should_show_form_of( return True if is_lemma: return True + # TODO Refactor inner for-loop using instead a search via django .values for definition in lemma_wordform.definitions.all(): for source in definition.source_ids: if source in dict_source: diff --git a/src/morphodict/search/ranking_test.py b/src/morphodict/search/ranking_test.py index 1e22119b0..ab0e48d32 100644 --- a/src/morphodict/search/ranking_test.py +++ b/src/morphodict/search/ranking_test.py @@ -57,8 +57,8 @@ def test_model_evaluation(expected, kwargs): @pytest.mark.skip() def test_cvd_exclusive_only_uses_cvd_for_ranking(db): - search_run = search(query="dance cvd:2") - results = search_run.sorted_results() + search_results = search(query="dance cvd:2") + results = search_results.sorted_results() assert len(results) > 2 def is_sorted_by_cvd(results: list[Result]): diff --git a/src/morphodict/search/runner.py b/src/morphodict/search/runner.py index 001b489f8..1efdf0efd 100644 --- a/src/morphodict/search/runner.py +++ b/src/morphodict/search/runner.py @@ -22,32 +22,33 @@ def search( - *, query: str, include_affixes=True, include_auto_definitions=False, - inflect_english_phrases=False + inflect_english_phrases=False, ) -> SearchResults: """ Perform an actual search, using the provided options. This class encapsulates the logic of which search methods to try, and in - which order, to build up results in a SearchResults. + which order, to build up results in a SearchResults object. """ - search_run = SearchResults( + search_results = SearchResults( query=query, include_auto_definitions=include_auto_definitions ) - initial_query_terms = search_run.query.query_terms[:] - if (search_run.query.espt or inflect_english_phrases) and ( + initial_query_terms = search_results.query.query_terms[:] + + # If we need to do english simple phrase search + if (search_results.query.espt or inflect_english_phrases) and ( len(initial_query_terms) > 1 ): - espt_search = EsptSearch(search_run) - espt_search.analyze_query() + espt_search = EsptSearch(search_results) + espt_search.convert_search_query_to_espt() if settings.MORPHODICT_ENABLE_CVD: cvd_search_type = cast_away_optional( - first_non_none_value(search_run.query.cvd, default=CvdSearchType.DEFAULT) + first_non_none_value(search_results.query.cvd, default=CvdSearchType.DEFAULT) ) # For when you type 'cvd:exclusive' in a query to debug ONLY CVD results! @@ -56,54 +57,54 @@ def search( def sort_by_cvd(r: Result): return r.cosine_vector_distance - search_run.sort_function = sort_by_cvd - do_cvd_search(search_run) - return search_run + search_results.sort_function = sort_by_cvd + do_cvd_search(search_results) + return search_results - fetch_results(search_run) + fetch_results(search_results) if ( settings.MORPHODICT_ENABLE_AFFIX_SEARCH and include_affixes - and not query_would_return_too_many_results(search_run.internal_query) + and not query_would_return_too_many_results(search_results.internal_query) ): - do_source_language_affix_search(search_run) - do_target_language_affix_search(search_run) + do_source_language_affix_search(search_results) + do_target_language_affix_search(search_results) if settings.MORPHODICT_ENABLE_CVD: if cvd_search_type.should_do_search() and not is_almost_certainly_cree( - search_run + search_results ): - do_cvd_search(search_run) + do_cvd_search(search_results) - if (search_run.query.espt or inflect_english_phrases) and ( + if (search_results.query.espt or inflect_english_phrases) and ( len(initial_query_terms) > 1 ): espt_search.inflect_search_results() - find_pos_matches(search_run) - get_glossary_count(search_run) - get_lemma_freq(search_run) + find_pos_matches(search_results) + get_glossary_count(search_results) + get_lemma_freq(search_results) - return search_run + return search_results CREE_LONG_VOWEL = re.compile("[êîôâēīōā]") -def is_almost_certainly_cree(search_run: SearchResults) -> bool: +def is_almost_certainly_cree(search_results: SearchResults) -> bool: """ Heuristics intended to AVOID doing an English search. """ - query = search_run.query + query = search_results.query # If there is a word with two or more dashes in it, it's probably Cree: if any(term.count("-") >= 2 for term in query.query_terms): - search_run.add_verbose_message( + search_results.add_verbose_message( "Skipping CVD because query has too many hyphens" ) return True if CREE_LONG_VOWEL.search(query.query_string): - search_run.add_verbose_message("Skipping CVD because query has Cree diacritics") + search_results.add_verbose_message("Skipping CVD because query has Cree diacritics") return True return False diff --git a/src/morphodict/tests/espt/test_espt_crk.py b/src/morphodict/tests/espt/test_espt_crk.py index 587f46310..c26873112 100644 --- a/src/morphodict/tests/espt/test_espt_crk.py +++ b/src/morphodict/tests/espt/test_espt_crk.py @@ -92,16 +92,16 @@ def test_search_with_tags(query, has_tags, tags, filtered_query): ], ) def test_espt_search(db, search, params): - search_run = SearchResults(search) - espt_search = EsptSearch(search_run) - espt_search.analyze_query() - assert search_run.query.query_terms == params["expected_query_terms"] - assert search_run.query.query_string == " ".join(params["expected_query_terms"]) + search_results = SearchResults(search) + espt_search = EsptSearch(search_results) + espt_search.convert_search_query_to_espt() + assert search_results.query.query_terms == params["expected_query_terms"] + assert search_results.query.query_string == " ".join(params["expected_query_terms"]) assert espt_search.new_tags == params["expected_new_tags"] lemma1 = Wordform.objects.get(slug=params["slug"], is_lemma=True) - search_run.add_result( + search_results.add_result( Result( wordform=lemma1, target_language_keyword_match=params["expected_query_terms"], @@ -111,19 +111,19 @@ def test_espt_search(db, search, params): espt_search.inflect_search_results() assert params["expected_inflection"] in [ - entry.wordform.text for entry in list(search_run.unsorted_results()) + entry.wordform.text for entry in list(search_results.unsorted_results()) ] def test_espt_search_doesnt_crash_when_no_analysis(db): - search_run = SearchResults("my little bears") - espt_search = EsptSearch(search_run) - espt_search.analyze_query() + search_results = SearchResults("my little bears") + espt_search = EsptSearch(search_results) + espt_search.convert_search_query_to_espt() wordform = Wordform(text="pê-") wordform.lemma = wordform wordform.is_lemma = True - search_run.add_result( + search_results.add_result( Result(wordform=wordform, target_language_keyword_match=["bear"]) )