
Revisit annotation indexing, search query occ #331

Merged · 5 commits · Sep 13, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion Tekst-API/deployment/elasticsearch/Dockerfile
@@ -1,2 +1,2 @@
-FROM elasticsearch:8.13.4
+FROM elasticsearch:8.15.0
RUN elasticsearch-plugin install analysis-icu
15 changes: 10 additions & 5 deletions Tekst-API/openapi.json
@@ -8524,11 +8524,16 @@
},
"CommonResourceSearchQueryData": {
  "properties": {
-    "req": {
-      "type": "boolean",
-      "title": "Req",
-      "description": "Whether this query is required to match for the location to be considered a search hit",
-      "default": false
+    "occ": {
+      "type": "string",
+      "enum": [
+        "should",
+        "must",
+        "not"
+      ],
+      "title": "Occ",
+      "description": "The occurrence type of the search query",
+      "default": "should"
    },
    "res": {
      "type": "string",
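For API clients, the boolean `req` flag on a resource search query is replaced by a three-valued `occ` setting ("should", "must", "not") that defaults to "should". A rough sketch of the payload change, shown as Python dicts (the resource ID is illustrative, not taken from this schema):

# Old payload: "req" only distinguished optional from required queries.
query_before = {
    "req": True,
    "res": "66e3f1a2b3c4d5e6f7a8b9c0",  # illustrative ObjectId
}

# New payload: "occ" also covers negation ("not"), with "should" as the default.
query_after = {
    "occ": "must",
    "res": "66e3f1a2b3c4d5e6f7a8b9c0",  # illustrative ObjectId
}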
888 changes: 474 additions & 414 deletions Tekst-API/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Tekst-API/pyproject.toml
@@ -34,7 +34,7 @@ pydantic-extra-types = "^2.0.0"
setuptools = "^69.1.1"
bleach = "^6.1.0"
jsonref = "^1.1.0"
-elasticsearch = "^8.13.0"
+elasticsearch = "^8.15.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.1.1"
15 changes: 6 additions & 9 deletions Tekst-API/tekst/resources/__init__.py
@@ -9,7 +9,7 @@
from os.path import realpath
from pathlib import Path
from time import perf_counter
-from typing import Annotated, Any, Union
+from typing import Annotated, Any, Literal, Union

import jsonref

@@ -70,16 +70,13 @@ async def call_resource_maintenance_hooks(


class CommonResourceSearchQueryData(ModelBase):
-    required: Annotated[
-        bool,
+    occurrence: Annotated[
+        Literal["should", "must", "not"],
        Field(
-            alias="req",
-            description=(
-                "Whether this query is required to match for the "
-                "location to be considered a search hit"
-            ),
+            alias="occ",
+            description="The occurrence type of the search query",
        ),
-    ] = False
+    ] = "should"
    resource_id: Annotated[
        PydanticObjectId,
        Field(
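On the Pydantic side, the occurrence value is now constrained at the type level via `Literal` and exposed under the short alias `occ`. A minimal standalone sketch of that pattern (the model below is a stand-in for illustration, not the actual Tekst class):

from typing import Annotated, Literal

from pydantic import BaseModel, Field


class SearchQuerySketch(BaseModel):
    # Stand-in model; mirrors the Literal + alias pattern from the diff above.
    occurrence: Annotated[
        Literal["should", "must", "not"],
        Field(alias="occ", description="The occurrence type of the search query"),
    ] = "should"


print(SearchQuerySketch(occ="must").occurrence)  # must
print(SearchQuerySketch().occurrence)  # should (the default)
# SearchQuerySketch(occ="maybe") would raise a ValidationError.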
167 changes: 88 additions & 79 deletions Tekst-API/tekst/resources/text_annotation.py
@@ -42,13 +42,25 @@ def rtype_index_doc_props(cls) -> dict[str, Any]:
        return {
            "tokens": {
                "type": "nested",
-                "dynamic": True,
                "properties": {
                    "token": {
                        "type": "keyword",
                        "normalizer": "no_diacritics_normalizer",
                        "fields": {"strict": {"type": "keyword"}},
-                    }
+                    },
+                    "annotations": {
+                        "type": "nested",
+                        "properties": {
+                            "key": {
+                                "type": "keyword",
+                            },
+                            "value": {
+                                "type": "keyword",
+                                "normalizer": "no_diacritics_normalizer",
+                                "fields": {"strict": {"type": "keyword"}},
+                            },
+                        },
+                    },
                },
            },
        }
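With this mapping, annotations are no longer flattened into dynamically created per-key fields; each annotation becomes a nested object with fixed `key` and `value` fields, so a key and its value can be matched on the same annotation entry. A rough standalone sketch of creating an equivalent index with the official Python client (the index name and the `no_diacritics_normalizer` definition are assumptions for illustration; the real index settings are not part of this diff):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local dev cluster

es.indices.create(
    index="tekst-annotations-demo",  # illustrative index name
    settings={
        "analysis": {
            "normalizer": {
                # Assumed normalizer definition (the analysis-icu plugin is
                # installed in the Dockerfile above); not taken from this PR.
                "no_diacritics_normalizer": {
                    "type": "custom",
                    "filter": ["icu_folding"],
                }
            }
        }
    },
    mappings={
        "properties": {
            "tokens": {
                "type": "nested",
                "properties": {
                    "token": {
                        "type": "keyword",
                        "normalizer": "no_diacritics_normalizer",
                        "fields": {"strict": {"type": "keyword"}},
                    },
                    "annotations": {
                        "type": "nested",
                        "properties": {
                            "key": {"type": "keyword"},
                            "value": {
                                "type": "keyword",
                                "normalizer": "no_diacritics_normalizer",
                                "fields": {"strict": {"type": "keyword"}},
                            },
                        },
                    },
                },
            },
        }
    },
)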
@@ -62,10 +74,15 @@ def rtype_index_doc_data(
"tokens": [
{
"token": token.token or "",
"annotations": {
anno.key: anno.value[0] if len(anno.value) == 1 else anno.value
"annotations": [
{
"key": anno.key,
"value": anno.value[0]
if len(anno.value) == 1
else anno.value,
}
for anno in token.annotations
}
]
if token.annotations
else None,
}
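The indexed document shape changes accordingly: a token's annotations go from one dynamic object keyed by annotation key to a list of key/value entries (single values stay scalar, multiple values stay a list). An illustrative token (the token string and keys are made up):

# Before: annotations as one dynamically mapped object per token
token_doc_before = {
    "token": "mahārāja",
    "annotations": {"pos": "NOUN", "case": ["nominative", "vocative"]},
}

# After: annotations as a nested list of key/value objects
token_doc_after = {
    "token": "mahārāja",
    "annotations": [
        {"key": "pos", "value": "NOUN"},
        {"key": "case", "value": ["nominative", "vocative"]},
    ],
}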
@@ -85,101 +102,93 @@ def rtype_es_queries(
        res_id = str(query.common.resource_id)
        q_id = str(uuid4())

-        if (
-            query.resource_type_specific.token.strip(" ") == "*"
-            and not query.resource_type_specific.annotations
-        ):
+        token_usr_q = (query.resource_type_specific.token or "").strip(" ") or None
+        token_es_q = []
+        annos_usr_q = query.resource_type_specific.annotations or []
+        annos_es_q = []
+
+        # process token query
+        if (token_usr_q == "*" or not token_usr_q) and not annos_usr_q:
            # handle empty/match-all query (query for existing target resource field)
-            es_queries.append(
+            token_es_q.append(
                {
-                    "nested": {
-                        "path": f"resources.{res_id}.tokens",
-                        "query": {
-                            "exists": {
-                                "field": f"resources.{res_id}.tokens.token",
-                            }
-                        },
-                        "inner_hits": {"name": q_id},
-                    }
+                    "exists": {
+                        "field": f"resources.{res_id}.tokens.token",
+                    }
                }
            )
-        elif (
-            query.resource_type_specific.token
-            or query.resource_type_specific.annotations
-        ):
-            # construct token query
-            token_query = (
+        elif token_usr_q:
+            # handle actual token query with content
+            token_es_q.append(
                {
                    "simple_query_string": {
                        "fields": [f"resources.{res_id}.tokens.token{strict_suffix}"],
-                        "query": query.resource_type_specific.token,
+                        "query": token_usr_q,
                        "analyze_wildcard": True,
                    }
                }
-                if query.resource_type_specific.token
-                else None
            )
-            # construct annotation queries
-            anno_queries = []
-            for anno in query.resource_type_specific.annotations:
-                if not anno.value:
-                    # if only key is set (and no value),
-                    # query for the existence of the key
-                    anno_queries.append(
-                        {
-                            "exists": {
-                                "field": (
-                                    f"resources.{res_id}.tokens.annotations.{anno.key}"
-                                ),
-                            }
-                        }
-                    )
-                elif anno.value == "__missing__":
-                    # if value is set to "__missing__", we're looking for tokens
-                    # that specifically DON'T have an annotation with the given key
-                    anno_queries.append(
-                        {
-                            "bool": {
-                                "must_not": {
-                                    "exists": {
-                                        "field": (
-                                            f"resources.{res_id}"
-                                            f".tokens.annotations.{anno.key}"
-                                        ),
-                                    }
-                                }
-                            }
-                        }
-                    )
-                else:
-                    # if both key and value are set,
-                    # query for the specific key/value combination
-                    anno_queries.append(
-                        {
-                            "simple_query_string": {
-                                "fields": [
-                                    f"resources.{res_id}.tokens.annotations.{anno.key}"
-                                ],
-                                "query": anno.value,
-                                "analyze_wildcard": True,
-                            }
-                        }
-                    )
+
+        # process annotation queries
+        for anno_q in annos_usr_q:
+            if anno_q.key and not anno_q.value:
+                # only key is set (and no value): query for existence of key
+                anno_k_q = {
+                    "term": {f"resources.{res_id}.tokens.annotations.key": anno_q.key}
+                }
+                annos_es_q.append(
+                    {
+                        "nested": {
+                            "path": f"resources.{res_id}.tokens.annotations",
+                            "query": anno_k_q,
+                        }
+                    }
+                )
+            elif anno_q.key and anno_q.value:
+                # both key and value are set: query for specific key/value combination
+                anno_k_q = {
+                    "term": {f"resources.{res_id}.tokens.annotations.key": anno_q.key}
+                }
+                anno_v_q = {
+                    "simple_query_string": {
+                        "fields": [
+                            (
+                                f"resources.{res_id}.tokens.annotations"
+                                f".value{strict_suffix}"
+                            ),
+                        ],
+                        "query": anno_q.value,
+                        "analyze_wildcard": True,
+                    }
+                }
+                annos_es_q.append(
+                    {
+                        "nested": {
+                            "path": f"resources.{res_id}.tokens.annotations",
+                            "query": {
+                                "bool": {
+                                    "must": [anno_k_q, anno_v_q],
+                                },
+                            },
+                        }
+                    }
+                )

-            # add token and annotation queries to the ES query
-            es_queries.append(
-                {
-                    "nested": {
-                        "path": f"resources.{res_id}.tokens",
-                        "query": {
-                            "bool": {
-                                "must": [
-                                    *([token_query] if token_query else []),
-                                    *anno_queries,
-                                ],
-                            },
-                        },
-                        "inner_hits": {"name": q_id},
-                    }
-                }
-            )
+        # add token and annotation queries to the ES queries
+        if token_es_q or annos_es_q:
+            es_sub_queries = [*token_es_q, *annos_es_q]
+            es_queries.append(
+                {
+                    "nested": {
+                        "path": f"resources.{res_id}.tokens",
+                        "inner_hits": {"name": q_id},
+                        "query": {
+                            "bool": {
+                                "must": es_sub_queries,
+                            },
+                        }
+                        if len(es_sub_queries) > 1
+                        else es_sub_queries[0],
+                    }
+                }
+            )
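Because annotations are now nested objects, each annotation criterion is issued as its own nested sub-query in which the key term and the value query must match the same annotation entry. The shape of one such sub-query as built above, for an illustrative resource ID "abc123" and a made-up key/value pair "pos"/"NOUN":

anno_sub_query = {
    "nested": {
        "path": "resources.abc123.tokens.annotations",
        "query": {
            "bool": {
                "must": [
                    {"term": {"resources.abc123.tokens.annotations.key": "pos"}},
                    {
                        "simple_query_string": {
                            # ".strict" is appended to the field for
                            # diacritics-sensitive searches (strict_suffix)
                            "fields": ["resources.abc123.tokens.annotations.value"],
                            "query": "NOUN",
                            "analyze_wildcard": True,
                        }
                    },
                ],
            },
        },
    }
}
# This ends up in the "must" list of the outer nested query on
# "resources.abc123.tokens" (or is used directly if it is the only sub-query).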
@@ -198,7 +207,7 @@ def _highlights_generator(hit: dict[str, Any]) -> list[str]:
token = ih_hit["_source"]["token"]
annos = ih_hit["_source"]["annotations"]
annos = (
f" ({'; '.join([a for a in annos.values()])})" if annos else ""
f" ({'; '.join([a['value'] for a in annos])})" if annos else ""
)
hl_strings.append(f"{token} {annos}")
return hl_strings
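The highlight strings follow the new annotation shape: values are read from each key/value entry instead of calling .values() on a dict. With made-up inner-hit data:

ih_source = {
    "token": "mahārāja",
    "annotations": [
        {"key": "pos", "value": "NOUN"},
        {"key": "case", "value": "nominative"},
    ],
}

values = "; ".join([a["value"] for a in ih_source["annotations"]])
print(f"{ih_source['token']} ({values})")  # mahārāja (NOUN; nominative)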