
Revisit annotation indexing, search query occ #331

Merged · 5 commits · Sep 13, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion Tekst-API/deployment/elasticsearch/Dockerfile
@@ -1,2 +1,2 @@
-FROM elasticsearch:8.13.4
+FROM elasticsearch:8.15.0
RUN elasticsearch-plugin install analysis-icu
15 changes: 10 additions & 5 deletions Tekst-API/openapi.json
@@ -8524,11 +8524,16 @@
},
"CommonResourceSearchQueryData": {
  "properties": {
-    "req": {
-      "type": "boolean",
-      "title": "Req",
-      "description": "Whether this query is required to match for the location to be considered a search hit",
-      "default": false
+    "occ": {
+      "type": "string",
+      "enum": [
+        "should",
+        "must",
+        "not"
+      ],
+      "title": "Occ",
+      "description": "The occurrence type of the search query",
+      "default": "should"
    },
    "res": {
      "type": "string",
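For API clients, the boolean `req` flag on a resource search query is replaced by a three-valued `occ` setting ("should", "must", "not") that defaults to "should". A rough sketch of the payload change, shown as Python dicts (the resource ID is illustrative, not taken from this schema):

# Old payload: "req" only distinguished optional from required queries.
query_before = {
    "req": True,
    "res": "66e3f1a2b3c4d5e6f7a8b9c0",  # illustrative ObjectId
}

# New payload: "occ" also covers negation ("not"), with "should" as the default.
query_after = {
    "occ": "must",
    "res": "66e3f1a2b3c4d5e6f7a8b9c0",  # illustrative ObjectId
}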
888 changes: 474 additions & 414 deletions Tekst-API/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Tekst-API/pyproject.toml
@@ -34,7 +34,7 @@ pydantic-extra-types = "^2.0.0"
setuptools = "^69.1.1"
bleach = "^6.1.0"
jsonref = "^1.1.0"
-elasticsearch = "^8.13.0"
+elasticsearch = "^8.15.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.1.1"
15 changes: 6 additions & 9 deletions Tekst-API/tekst/resources/__init__.py
@@ -9,7 +9,7 @@
from os.path import realpath
from pathlib import Path
from time import perf_counter
-from typing import Annotated, Any, Union
+from typing import Annotated, Any, Literal, Union

import jsonref

@@ -70,16 +70,13 @@ async def call_resource_maintenance_hooks(


class CommonResourceSearchQueryData(ModelBase):
-    required: Annotated[
-        bool,
+    occurrence: Annotated[
+        Literal["should", "must", "not"],
        Field(
-            alias="req",
-            description=(
-                "Whether this query is required to match for the "
-                "location to be considered a search hit"
-            ),
+            alias="occ",
+            description="The occurrence type of the search query",
        ),
-    ] = False
+    ] = "should"
    resource_id: Annotated[
        PydanticObjectId,
        Field(
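On the Pydantic side, the occurrence value is now constrained at the type level via `Literal` and exposed under the short alias `occ`. A minimal standalone sketch of that pattern (the model below is a stand-in for illustration, not the actual Tekst class):

from typing import Annotated, Literal

from pydantic import BaseModel, Field


class SearchQuerySketch(BaseModel):
    # Stand-in model; mirrors the Literal + alias pattern from the diff above.
    occurrence: Annotated[
        Literal["should", "must", "not"],
        Field(alias="occ", description="The occurrence type of the search query"),
    ] = "should"


print(SearchQuerySketch(occ="must").occurrence)  # must
print(SearchQuerySketch().occurrence)  # should (the default)
# SearchQuerySketch(occ="maybe") would raise a ValidationError.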
167 changes: 88 additions & 79 deletions Tekst-API/tekst/resources/text_annotation.py
@@ -42,13 +42,25 @@ def rtype_index_doc_props(cls) -> dict[str, Any]:
        return {
            "tokens": {
                "type": "nested",
-                "dynamic": True,
                "properties": {
                    "token": {
                        "type": "keyword",
                        "normalizer": "no_diacritics_normalizer",
                        "fields": {"strict": {"type": "keyword"}},
-                    }
+                    },
+                    "annotations": {
+                        "type": "nested",
+                        "properties": {
+                            "key": {
+                                "type": "keyword",
+                            },
+                            "value": {
+                                "type": "keyword",
+                                "normalizer": "no_diacritics_normalizer",
+                                "fields": {"strict": {"type": "keyword"}},
+                            },
+                        },
+                    },
                },
            },
        }
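With this mapping, annotations are no longer flattened into dynamically created per-key fields; each annotation becomes a nested object with fixed `key` and `value` fields, so a key and its value can be matched on the same annotation entry. A rough standalone sketch of creating an equivalent index with the official Python client (the index name and the `no_diacritics_normalizer` definition are assumptions for illustration; the real index settings are not part of this diff):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local dev cluster

es.indices.create(
    index="tekst-annotations-demo",  # illustrative index name
    settings={
        "analysis": {
            "normalizer": {
                # Assumed normalizer definition (the analysis-icu plugin is
                # installed in the Dockerfile above); not taken from this PR.
                "no_diacritics_normalizer": {
                    "type": "custom",
                    "filter": ["icu_folding"],
                }
            }
        }
    },
    mappings={
        "properties": {
            "tokens": {
                "type": "nested",
                "properties": {
                    "token": {
                        "type": "keyword",
                        "normalizer": "no_diacritics_normalizer",
                        "fields": {"strict": {"type": "keyword"}},
                    },
                    "annotations": {
                        "type": "nested",
                        "properties": {
                            "key": {"type": "keyword"},
                            "value": {
                                "type": "keyword",
                                "normalizer": "no_diacritics_normalizer",
                                "fields": {"strict": {"type": "keyword"}},
                            },
                        },
                    },
                },
            },
        }
    },
)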
@@ -62,10 +74,15 @@ def rtype_index_doc_data(
"tokens": [
{
"token": token.token or "",
"annotations": {
anno.key: anno.value[0] if len(anno.value) == 1 else anno.value
"annotations": [
{
"key": anno.key,
"value": anno.value[0]
if len(anno.value) == 1
else anno.value,
}
for anno in token.annotations
}
]
if token.annotations
else None,
}
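The indexed document shape changes accordingly: a token's annotations go from one dynamic object keyed by annotation key to a list of key/value entries (single values stay scalar, multiple values stay a list). An illustrative token (the token string and keys are made up):

# Before: annotations as one dynamically mapped object per token
token_doc_before = {
    "token": "mahārāja",
    "annotations": {"pos": "NOUN", "case": ["nominative", "vocative"]},
}

# After: annotations as a nested list of key/value objects
token_doc_after = {
    "token": "mahārāja",
    "annotations": [
        {"key": "pos", "value": "NOUN"},
        {"key": "case", "value": ["nominative", "vocative"]},
    ],
}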
@@ -85,101 +102,93 @@ def rtype_es_queries(
        res_id = str(query.common.resource_id)
        q_id = str(uuid4())

-        if (
-            query.resource_type_specific.token.strip(" ") == "*"
-            and not query.resource_type_specific.annotations
-        ):
+        token_usr_q = (query.resource_type_specific.token or "").strip(" ") or None
+        token_es_q = []
+        annos_usr_q = query.resource_type_specific.annotations or []
+        annos_es_q = []
+
+        # process token query
+        if (token_usr_q == "*" or not token_usr_q) and not annos_usr_q:
            # handle empty/match-all query (query for existing target resource field)
-            es_queries.append(
+            token_es_q.append(
                {
-                    "nested": {
-                        "path": f"resources.{res_id}.tokens",
-                        "query": {
-                            "exists": {
-                                "field": f"resources.{res_id}.tokens.token",
-                            }
-                        },
-                        "inner_hits": {"name": q_id},
-                    }
+                    "exists": {
+                        "field": f"resources.{res_id}.tokens.token",
+                    }
                }
            )
-        elif (
-            query.resource_type_specific.token
-            or query.resource_type_specific.annotations
-        ):
-            # construct token query
-            token_query = (
+        elif token_usr_q:
+            # handle actual token query with content
+            token_es_q.append(
                {
                    "simple_query_string": {
                        "fields": [f"resources.{res_id}.tokens.token{strict_suffix}"],
-                        "query": query.resource_type_specific.token,
+                        "query": token_usr_q,
                        "analyze_wildcard": True,
                    }
                }
-                if query.resource_type_specific.token
-                else None
            )
-            # construct annotation queries
-            anno_queries = []
-            for anno in query.resource_type_specific.annotations:
-                if not anno.value:
-                    # if only key is set (and no value),
-                    # query for the existence of the key
-                    anno_queries.append(
-                        {
-                            "exists": {
-                                "field": (
-                                    f"resources.{res_id}.tokens.annotations.{anno.key}"
-                                ),
-                            }
-                        }
-                    )
-                elif anno.value == "__missing__":
-                    # if value is set to "__missing__", we're looking for tokens
-                    # that specifically DON'T have an annotation with the given key
-                    anno_queries.append(
-                        {
-                            "bool": {
-                                "must_not": {
-                                    "exists": {
-                                        "field": (
-                                            f"resources.{res_id}"
-                                            f".tokens.annotations.{anno.key}"
-                                        ),
-                                    }
-                                }
-                            }
-                        }
-                    )
-                else:
-                    # if both key and value are set,
-                    # query for the specific key/value combination
-                    anno_queries.append(
-                        {
-                            "simple_query_string": {
-                                "fields": [
-                                    f"resources.{res_id}.tokens.annotations.{anno.key}"
-                                ],
-                                "query": anno.value,
-                                "analyze_wildcard": True,
-                            }
-                        }
-                    )
+
+        # process annotation queries
+        for anno_q in annos_usr_q:
+            if anno_q.key and not anno_q.value:
+                # only key is set (and no value): query for existence of key
+                anno_k_q = {
+                    "term": {f"resources.{res_id}.tokens.annotations.key": anno_q.key}
+                }
+                annos_es_q.append(
+                    {
+                        "nested": {
+                            "path": f"resources.{res_id}.tokens.annotations",
+                            "query": anno_k_q,
+                        }
+                    }
+                )
+            elif anno_q.key and anno_q.value:
+                # both key and value are set: query for specific key/value combination
+                anno_k_q = {
+                    "term": {f"resources.{res_id}.tokens.annotations.key": anno_q.key}
+                }
+                anno_v_q = {
+                    "simple_query_string": {
+                        "fields": [
+                            (
+                                f"resources.{res_id}.tokens.annotations"
+                                f".value{strict_suffix}"
+                            ),
+                        ],
+                        "query": anno_q.value,
+                        "analyze_wildcard": True,
+                    }
+                }
+                annos_es_q.append(
+                    {
+                        "nested": {
+                            "path": f"resources.{res_id}.tokens.annotations",
+                            "query": {
+                                "bool": {
+                                    "must": [anno_k_q, anno_v_q],
+                                },
+                            },
+                        }
+                    }
+                )

-            # add token and annotation queries to the ES query
-            es_queries.append(
-                {
-                    "nested": {
-                        "path": f"resources.{res_id}.tokens",
-                        "query": {
-                            "bool": {
-                                "must": [
-                                    *([token_query] if token_query else []),
-                                    *anno_queries,
-                                ],
-                            },
-                        },
-                        "inner_hits": {"name": q_id},
-                    }
-                }
-            )
+        # add token and annotation queries to the ES queries
+        if token_es_q or annos_es_q:
+            es_sub_queries = [*token_es_q, *annos_es_q]
+            es_queries.append(
+                {
+                    "nested": {
+                        "path": f"resources.{res_id}.tokens",
+                        "inner_hits": {"name": q_id},
+                        "query": {
+                            "bool": {
+                                "must": es_sub_queries,
+                            },
+                        }
+                        if len(es_sub_queries) > 1
+                        else es_sub_queries[0],
+                    }
+                }
+            )
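Because annotations are now nested objects, each annotation criterion is issued as its own nested sub-query in which the key term and the value query must match the same annotation entry. The shape of one such sub-query as built above, for an illustrative resource ID "abc123" and a made-up key/value pair "pos"/"NOUN":

anno_sub_query = {
    "nested": {
        "path": "resources.abc123.tokens.annotations",
        "query": {
            "bool": {
                "must": [
                    {"term": {"resources.abc123.tokens.annotations.key": "pos"}},
                    {
                        "simple_query_string": {
                            # ".strict" is appended to the field for
                            # diacritics-sensitive searches (strict_suffix)
                            "fields": ["resources.abc123.tokens.annotations.value"],
                            "query": "NOUN",
                            "analyze_wildcard": True,
                        }
                    },
                ],
            },
        },
    }
}
# This ends up in the "must" list of the outer nested query on
# "resources.abc123.tokens" (or is used directly if it is the only sub-query).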
@@ -198,7 +207,7 @@ def _highlights_generator(hit: dict[str, Any]) -> list[str]:
token = ih_hit["_source"]["token"]
annos = ih_hit["_source"]["annotations"]
annos = (
f" ({'; '.join([a for a in annos.values()])})" if annos else ""
f" ({'; '.join([a['value'] for a in annos])})" if annos else ""
)
hl_strings.append(f"{token} {annos}")
return hl_strings
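The highlight strings follow the new annotation shape: values are read from each key/value entry instead of calling .values() on a dict. With made-up inner-hit data:

ih_source = {
    "token": "mahārāja",
    "annotations": [
        {"key": "pos", "value": "NOUN"},
        {"key": "case", "value": "nominative"},
    ],
}

values = "; ".join([a["value"] for a in ih_source["annotations"]])
print(f"{ih_source['token']} ({values})")  # mahārāja (NOUN; nominative)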