From 91c77443c92f87b068c04ea169d215aa4deab735 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felipe=20Ba=C3=B1ados=20Schwerter?=
Date: Mon, 25 Nov 2024 23:23:22 +0000
Subject: [PATCH] Better documented the search algorithm

---
 src/morphodict/search/espt.py        |  2 ++
 src/morphodict/search/lookup.py      | 10 ++++++++++
 src/morphodict/search/pos_matches.py |  7 ++++---
 src/morphodict/search/runner.py      | 19 ++++++++++++++++++-
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/morphodict/search/espt.py b/src/morphodict/search/espt.py
index 33e334a9a..d2c023850 100644
--- a/src/morphodict/search/espt.py
+++ b/src/morphodict/search/espt.py
@@ -43,6 +43,7 @@ def __init__(self, query, search_results):
         self.search_results = search_results
         self.query = query
         self.query_analyzed_ok = False
+        self.tags = None
 
     def convert_search_query_to_espt(self):
         """Analyze this search’s search_results query, possibly updating it.
@@ -80,6 +81,7 @@ def convert_search_query_to_espt(self):
             tags=analyzed_query.tags,
             new_tags=self.new_tags,
         )
+        self.tags = analyzed_query.tags
 
     def inflect_search_results(self):
         if not self.query_analyzed_ok:
diff --git a/src/morphodict/search/lookup.py b/src/morphodict/search/lookup.py
index 5a2be8b56..6c3361c87 100644
--- a/src/morphodict/search/lookup.py
+++ b/src/morphodict/search/lookup.py
@@ -21,9 +21,17 @@
 
 
 def fetch_results(query: core.Query, search_results: core.SearchResults):
+    # First, collect some candidate results via keywords.
+    # We split the query string into keywords and collect all the entries that
+    # match exactly as keywords in the database, both source- and target-language.
+
     fetch_results_from_target_language_keywords(query, search_results)
     fetch_results_from_source_language_keywords(query, search_results)
 
+    # Then we analyze the query with the FST; if the analysis succeeds, we look
+    # for dictionary entries that share an analysis with the FST result.
+    # This introduces source-language spelling relaxation if the FST supports it.
+
     # Use the spelling relaxation to try to decipher the query
     #   e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
     #         thus, we can match "acâhkos" in the dictionary!
@@ -51,6 +59,8 @@ def fetch_results(query: core.Query, search_results: core.SearchResults):
     # fst_analyses has now been thinned by calls to `fst_analyses.remove()`
     # above; remaining items are analyses which are not in the database,
     # although their lemmas should be.
+    #
+    # Therefore, we create the extra entries on the fly.
     for analysis in fst_analyses:
         # When the user query is outside of paradigm tables
         # e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik
diff --git a/src/morphodict/search/pos_matches.py b/src/morphodict/search/pos_matches.py
index 9a80fe7b3..6abb4ef1c 100644
--- a/src/morphodict/search/pos_matches.py
+++ b/src/morphodict/search/pos_matches.py
@@ -1,12 +1,13 @@
 from morphodict.search.core import SearchResults
 from morphodict.phrase_translate.fst import source_phrase_analyses
+from morphodict.search.espt import EsptSearch
 from morphodict.analysis import rich_analyze_relaxed
 
 
-def find_pos_matches(search_results: SearchResults) -> None:
-    if len(search_results.verbose_messages) <= 1:
+def find_pos_matches(tag_source: EsptSearch | None, search_results: SearchResults) -> None:
+    if not tag_source:
         return
-    tags = search_results.verbose_messages[1].get("tags")
+    tags = tag_source.tags
 
     [pos_match(result, tags) for result in search_results.unsorted_results()]
 
diff --git a/src/morphodict/search/runner.py b/src/morphodict/search/runner.py
index 0e2a1d184..14bb152d3 100644
--- a/src/morphodict/search/runner.py
+++ b/src/morphodict/search/runner.py
@@ -49,6 +49,8 @@ def search(
         espt_search = EsptSearch(search_query, search_results)
         espt_search.convert_search_query_to_espt()
 
+    # Now, check whether we were asked for only vector-distance results; if so,
+    # compute them and return them:
     if settings.MORPHODICT_ENABLE_CVD:
         cvd_search_type: CvdSearchType = first_non_none_value(
             search_query.cvd, default=CvdSearchType.DEFAULT
@@ -64,8 +66,13 @@ def sort_by_cvd(r: Result):
             do_cvd_search(search_query, search_results)
             return search_results
 
+    # We were NOT asked for vector-distance results only, so now we actually
+    # go and perform the search.
+
+    # First, fetch keyword-based and FST-based, orthography-relaxed results.
     fetch_results(search_query, search_results)
 
+    # If allowed, add affix-search candidates.
     if (
         settings.MORPHODICT_ENABLE_AFFIX_SEARCH
         and include_affixes
@@ -74,21 +81,31 @@ def sort_by_cvd(r: Result):
         do_source_language_affix_search(search_query, search_results)
         do_target_language_affix_search(search_query, search_results)
 
+    # Now, if we wanted vector-distance search (not exclusively), add those results.
     if settings.MORPHODICT_ENABLE_CVD:
         if cvd_search_type.should_do_search() and not is_almost_certainly_cree(
             search_query, search_results
         ):
             do_cvd_search(search_query, search_results)
 
+    # If we did an English phrase search, we have to inflect the results back!
     if (search_query.espt or inflect_english_phrases) and (
         len(initial_query_terms) > 1
     ):
         espt_search.inflect_search_results()
 
-    find_pos_matches(search_results)
+    # Annotate every entry in the search results with the POS match, when available.
+    if espt_search:
+        find_pos_matches(espt_search, search_results)
+
+    # Annotate every entry with a frequency count from the glossary.
     get_glossary_count(search_results)
+
+    # Annotate every entry with a lemma frequency from lemma_frequency.txt.
     get_lemma_freq(search_results)
 
+    # Return. NOTE THAT WE HAVE NOT SORTED THE RESULTS YET!
+    # Sorting happens later, when sorted_results is called.
     return search_results
 
 
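Reviewer note (not part of the patch): below is a minimal, self-contained sketch of the new find_pos_matches calling convention introduced above. The StubEsptSearch and StubSearchResults classes, and the toy annotation in place of pos_match(), are assumptions standing in for morphodict's real EsptSearch and SearchResults types, purely for illustration.

# Illustration only -- stub types stand in for the real morphodict classes.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class StubEsptSearch:
    # Mirrors the new EsptSearch.tags attribute added in espt.py
    # (None until convert_search_query_to_espt() succeeds).
    tags: Optional[list] = None


@dataclass
class StubSearchResults:
    results: list = field(default_factory=list)

    def unsorted_results(self):
        return self.results


def find_pos_matches(tag_source, search_results) -> None:
    # New contract: the caller hands over the EsptSearch (or None) instead of
    # the function digging tags out of search_results.verbose_messages.
    if not tag_source:
        return
    tags = tag_source.tags
    for result in search_results.unsorted_results():
        result["pos_tags"] = tags  # toy annotation in place of pos_match()


# Caller side, mirroring the runner.py change: only call when an ESPT search ran.
espt_search = StubEsptSearch(tags=["+V", "+AI"])
search_results = StubSearchResults(results=[{"lemma": "nôcihêw"}])
if espt_search:
    find_pos_matches(espt_search, search_results)
print(search_results.results[0]["pos_tags"])  # ['+V', '+AI']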