Skip to content

Commit

Permalink
Use libzim on Linux and macOS
Browse files Browse the repository at this point in the history
See #9
  • Loading branch information
abdnh committed Oct 14, 2022
1 parent 2d3ca7b commit dc8740e
Show file tree
Hide file tree
Showing 13 changed files with 235 additions and 63 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ python_versions=(
./scripts/vendor_no_platform.sh
for python_version in ${python_versions[@]}; do
./scripts/vendor_pyzstd.sh $python_version
./scripts/vendor_libzim.sh $python_version
# FIXME: min_point_version in manifest.json should depend on the Python version
make zip EXTRA_ARGS="--out build/zim_reader-py$python_version.ankiaddon"
make ankiweb EXTRA_ARGS="--out build/zim_reader-py$python_version-ankiweb.ankiaddon"
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ PY_VER := 39
vendor:
./scripts/vendor_no_platform.sh $(SPACY_FLAG)
./scripts/vendor_pyzstd.sh $(PY_VER)
./scripts/vendor_libzim.sh $(PY_VER)

clean:
rm -rf build/
3 changes: 3 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ ignore_errors = True
[mypy-PyQt6.*]
ignore_errors = True
ignore_missing_imports = True

[mypy-libzim.*]
ignore_missing_imports = True
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pylint==2.15.2
isort==5.10.1
ankibuild
git+https://github.com/abdnh/ankibuild@5e346f5ddab5c783dfc35a16a1e8a5fdc7c7bed4#egg=ankibuild[qt5,qt6]
libzim==2.0.0; sys.platform != "win32"
zimply-core
git+https://github.com/abdnh/zimply-core@09c6f0f004591e0642590210248e87ff72bb6e21
spacy==3.4.1
40 changes: 40 additions & 0 deletions scripts/vendor_libzim.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Vendor the libzim Python bindings.
#
# Downloads the platform-specific libzim wheels for the given CPython version
# into build/, unpacks them, merges the top-level `libzim.*` library files of
# every platform into one shared tree, and copies that tree to src/vendor.
#
# Usage: vendor_libzim.sh <python_version>    (e.g. 39)

# Stop at the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this, a failed `cd`, download or
# unzip would still end with partial or stale files copied into src/vendor.
set -euo pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
    echo "Usage: $0 <python_version>" >&2
    exit 1
fi

mkdir -p build
cd build

python_version=$1
libzim_version=2.0.0

platforms=(
    # TODO: bundle for Windows too once available
    # win_amd64
    manylinux1_x86_64
    macosx_10_9_x86_64
)

# Common file name prefix of every downloaded wheel and its unpack directory
wheel_prefix="libzim-$libzim_version-cp$python_version"

# Download wheels
for platform in "${platforms[@]}"; do
    pip download "libzim==$libzim_version" --only-binary=:all: \
        --python-version "$python_version" --implementation cp \
        --platform "$platform"
done

# Create a shared wheel from an arbitrary platform-specific wheel
cp "$wheel_prefix-cp$python_version-${platforms[0]}.whl" libzim.whl

# Unzip wheels; each one is unpacked into a directory named after the wheel
# file without its extension (so libzim.whl ends up in ./libzim)
wheels=("$wheel_prefix"-*.whl libzim.whl)
for wheel in "${wheels[@]}"; do
    mkdir -p "${wheel%.*}"
    pushd "${wheel%.*}"
    unzip -o "../$wheel"
    popd
done

# Copy platform-specific library files to the shared wheel.
# `find -exec` is used instead of `cp $(find ...)` so that file names are
# never word-split and an empty match does not call cp without operands.
for dir in "$wheel_prefix"-*/; do
    find "$dir" -maxdepth 1 -name 'libzim.*' -type f -exec cp {} libzim/ \;
done

# Copy to vendor dir
cp -r ./libzim/* ../src/vendor
6 changes: 3 additions & 3 deletions scripts/vendor_pyzstd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ pyzstd_version=0.15.3

platforms=(
win_amd64
manylinux2014_x86_64
manylinux2014_aarch64
macosx_10_9_x86_64
# manylinux2014_x86_64
# manylinux2014_aarch64
# macosx_10_9_x86_64
# FIXME: the arm64 shared library has the same name as the x86_64 one (_zstd.cpython-39-darwin.so)
# How to handle such a situation?
# macosx_11_0_arm64
Expand Down
142 changes: 142 additions & 0 deletions src/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""
An abstraction layer over libzim and ZIMply
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Type

from .errors import ZIMClientLibNotAvailable


@dataclass()
class ZIMItem:
    """Backend-independent record for one item read from a ZIM file."""

    # Location of the item as reported by the backend
    # (ZIMply's `article.url` or libzim's `entry.path`)
    path: str
    # Title of the item as reported by the backend
    title: str
    # Raw payload of the item
    content: bytes
    # MIME type of the payload as reported by the backend
    mimetype: str


class ZIMClient(ABC):
    """Abstract interface every ZIM backend wrapper has to implement.

    Concrete subclasses wrap one reader library and expose its results as
    ``ZIMItem`` objects.
    """

    def __init__(self, file_path: str):
        """Remember the path of the ZIM file this client reads from."""
        self.file_path = file_path

    @abstractmethod
    def main_page(self) -> ZIMItem | None:
        """Return the home page of the ZIM file."""
        message = "Implement this to get the home page of the ZIM file"
        raise NotImplementedError(message)

    @abstractmethod
    def get_item_by_path(self, path: str) -> ZIMItem | None:
        """Return the item stored under ``path``."""
        message = "Implement this to get an item given its path"
        raise NotImplementedError(message)

    @abstractmethod
    def get_item_by_title(self, title: str) -> ZIMItem | None:
        """Return the article whose title is ``title``."""
        message = "Implement this to get an article given its title"
        raise NotImplementedError(message)

    @abstractmethod
    def first_result(self, query: str) -> ZIMItem | None:
        """Return the first search result for ``query``."""
        message = "Implement this to return the first search result given a query"
        raise NotImplementedError(message)


class ZIMplyClient(ZIMClient):
    """``ZIMClient`` implementation backed by ZIMply (``zimply_core``)."""

    def __init__(self, file_path: str):
        """Open ``file_path`` with ZIMply.

        Raises:
            ZIMClientLibNotAvailable: if an ImportError occurs while importing
                or constructing the ZIMply client.
        """
        super().__init__(file_path)
        try:
            # Imported lazily (and under an alias, to keep it apart from this
            # module's own ZIMClient) so a missing library is reported as
            # ZIMClientLibNotAvailable instead of breaking module import.
            from zimply_core.zim_core import ZIMClient as _ZIMplyBackend

            self._zimply_client = _ZIMplyBackend(
                file_path, encoding="utf-8", auto_delete=True, enable_search=True
            )
        except ImportError as exc:
            raise ZIMClientLibNotAvailable() from exc

    def _item_from_zimply_article(self, article: Any | None) -> ZIMItem | None:
        """Convert a ZIMply article to a ``ZIMItem``; falsy input gives ``None``."""
        if article:
            return ZIMItem(
                path=article.url,
                title=article.title,
                content=article.data,
                mimetype=article.mimetype,
            )
        return None

    def main_page(self) -> ZIMItem | None:
        """Return the ZIMply client's ``main_page`` as a ``ZIMItem``."""
        article = self._zimply_client.main_page
        return self._item_from_zimply_article(article)

    def get_item_by_path(self, path: str) -> ZIMItem | None:
        """Return the article ZIMply's ``get_article`` yields for ``path``."""
        article = self._zimply_client.get_article(path)
        return self._item_from_zimply_article(article)

    def get_item_by_title(self, title: str) -> ZIMItem | None:
        """Return the article ZIMply's ``get_article_by_title`` yields for ``title``."""
        article = self._zimply_client.get_article_by_title(title)
        return self._item_from_zimply_article(article)

    def first_result(self, query: str) -> ZIMItem | None:
        """Return the item behind the first search hit for ``query``, if any."""
        hits = self._zimply_client.search(query, 0, 1)
        return self.get_item_by_path(hits[0].url) if hits else None


class LibZIMClient(ZIMClient):
    """``ZIMClient`` implementation backed by the libzim Python bindings."""

    def __init__(self, file_path: str):
        """Open ``file_path`` as a libzim ``Archive``.

        Raises:
            ZIMClientLibNotAvailable: if an ImportError occurs while importing
                libzim or constructing the archive.
        """
        super().__init__(file_path)
        try:
            # Imported lazily so a missing libzim is reported as
            # ZIMClientLibNotAvailable instead of breaking module import.
            from libzim.reader import Archive

            self._archive = Archive(file_path)
        except ImportError as exc:
            raise ZIMClientLibNotAvailable() from exc

    def _item_from_libzim_entry(self, entry: Any | None) -> ZIMItem | None:
        """Convert a libzim entry to a ``ZIMItem``; falsy input gives ``None``."""
        if not entry:
            return None
        # Resolve the entry's item once and reuse it for both content and
        # MIME type instead of asking libzim for it twice.
        item = entry.get_item()
        return ZIMItem(
            entry.path,
            entry.title,
            bytes(item.content),
            item.mimetype,
        )

    def main_page(self) -> ZIMItem | None:
        """Return the archive's ``main_entry`` as a ``ZIMItem``."""
        # NOTE(review): behaviour of `main_entry` for an archive without a
        # main entry is not visible from here - confirm against libzim.
        return self._item_from_libzim_entry(self._archive.main_entry)

    def get_item_by_path(self, path: str) -> ZIMItem | None:
        """Return the entry libzim's ``get_entry_by_path`` yields for ``path``."""
        return self._item_from_libzim_entry(self._archive.get_entry_by_path(path))

    def get_item_by_title(self, title: str) -> ZIMItem | None:
        """Return the entry libzim's ``get_entry_by_title`` yields for ``title``."""
        return self._item_from_libzim_entry(self._archive.get_entry_by_title(title))

    def first_result(self, query: str) -> ZIMItem | None:
        """Return the item behind the first search hit for ``query``, if any."""
        from libzim.search import Query, Searcher

        # Kept in its own name: `query` stays the caller's string rather than
        # being rebound to an object of a different type.
        zim_query = Query().set_query(query)
        search = Searcher(self._archive).search(zim_query)
        results = list(search.getResults(0, 1))
        if not results:
            return None
        # Each result is handed to get_item_by_path, i.e. it is used as a path.
        return self.get_item_by_path(results[0])


def _get_available_client_class() -> Type[ZIMClient] | None:
    """Return the first client class whose backing library can be imported.

    Candidates are tried in order of preference (libzim first, then ZIMply).
    Each one is probed by constructing it with an empty path: a
    ``ZIMClientLibNotAvailable`` means the library is missing, while any other
    outcome means the library itself was importable.

    Returns:
        The first usable client class, or ``None`` if no library is available.
    """
    client_classes: list[Type[ZIMClient]] = [LibZIMClient, ZIMplyClient]
    for klass in client_classes:
        try:
            klass("")
        except ZIMClientLibNotAvailable:
            continue
        except Exception:  # pylint: disable=broad-except
            # The library was imported; the failure is only about the bogus
            # empty path. `Exception` (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            return klass
        else:
            # Construction succeeded outright, so the library is available too.
            return klass
    return None


def init_client(zim_path: str | Path) -> ZIMClient:
    """Create a client for ``zim_path`` using the backend chosen at import time.

    Args:
        zim_path: Location of the ZIM file; converted to ``str`` before being
            passed to the client class.

    Returns:
        A new instance of the selected ``ZIMClient`` subclass.
    """
    return _client_class(str(zim_path))


# Pick the backend once, when the module is imported.
_client_class = _get_available_client_class()
if _client_class is None:
    # Raised explicitly instead of using `assert`: assertions are stripped
    # under `python -O`, which would defer the failure to the first
    # init_client() call as an opaque "'NoneType' object is not callable".
    raise ZIMClientLibNotAvailable("No ZIM client library (libzim or ZIMply) is available")
24 changes: 9 additions & 15 deletions src/dictionaries/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from zimply_core.zim_core import Article, ZIMClient

from ..client import ZIMItem, init_client
from ..consts import USER_FILES
from ..errors import ZIMReaderException
from .parser import DefaultParser
Expand All @@ -37,12 +37,7 @@ def __init__(self, name: str, parser: Parser = DefaultParser()):
zim_path = next(folder_path.glob("*.zim"), None)
if not zim_path:
raise ZIMReaderException(f"No zim file was found in {str(name)}")
self.zim_client = ZIMClient(
zim_path,
encoding="utf-8",
auto_delete=True,
enable_search=True,
)
self.client = init_client(zim_path)
self.parser = parser

@classmethod
Expand All @@ -55,18 +50,17 @@ def build_dict(
output_folder = USER_FILES / name
output_folder.mkdir(exist_ok=True)
shutil.copy(filename, output_folder)
# Build search index
ZIMDict(name)

@staticmethod
@functools.lru_cache
def _get_soup(
title: str, dictionary: ZIMDict, parser: Parser
) -> BeautifulSoup | None:
article = parser.get_article(title, dictionary, is_title=True)
item = parser.get_item(title, dictionary, is_title=True)
soup = None
if article:
soup = BeautifulSoup(article.data.decode(), "html.parser")
if item:
soup = BeautifulSoup(item.content.decode(), "html.parser")
return soup

def get_soup(self, title: str) -> BeautifulSoup | None:
Expand All @@ -77,20 +71,20 @@ def lookup(self, title: str) -> DictEntry | None:
return None
return self.parser.lookup(title, self)

def get_article(self, path: str) -> Article | None:
return self.parser.get_article(path, self)
def get_item(self, path: str) -> ZIMItem | None:
return self.parser.get_item(path, self)

def save_resource(self, path: str) -> str | None:
# Strip out '../'
path = path.split("/", maxsplit=1)[-1]
path = urllib.parse.unquote(path)
try:
article = self.zim_client.get_article(path)
item = self.client.get_item_by_path(path)
except KeyError:
return None
filename = path.split("/")[-1]
assert self.parser.col
return self.parser.col.media.write_data(filename, article.data)
return self.parser.col.media.write_data(filename, item.content)


def get_next_sibling_element(element: Tag) -> Tag | None:
Expand Down
17 changes: 8 additions & 9 deletions src/dictionaries/greek.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import re
from typing import TYPE_CHECKING

from zimply_core.zim_core import Article

from ..client import ZIMItem
from .dictionary import DictEntry, ZIMDict, save_images, strip_images
from .parser import Parser

Expand Down Expand Up @@ -87,14 +86,14 @@ def _stem(self, word: str) -> str:
)
)

def get_article(
self, query: str, dictionary: ZIMDict, is_title: bool = False
) -> Article | None:
article = super().get_article(query, dictionary, is_title)
if article:
return article
def get_item(
self, path: str, dictionary: ZIMDict, is_title: bool = False
) -> ZIMItem | None:
item = super().get_item(path, dictionary, is_title)
if item:
return item
if self.nlp:
return super().get_article(self._stem(query), dictionary, is_title)
return super().get_item(self._stem(path), dictionary, is_title)
return None

def follow_redirects(self, query: str, dictionary: ZIMDict) -> str:
Expand Down
31 changes: 12 additions & 19 deletions src/dictionaries/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from zimply_core.zim_core import Article
from ..client import ZIMItem

if TYPE_CHECKING:
from anki.collection import Collection
Expand All @@ -23,15 +23,15 @@ def __init__(self, col: Collection | None = None):

@staticmethod
@functools.lru_cache
def _get_article(
def _get_item(
path: str,
dictionary: ZIMDict,
is_title: bool,
) -> Article | None:
get_article = (
dictionary.zim_client.get_article_by_title
) -> ZIMItem | None:
get_item = (
dictionary.client.get_item_by_title
if is_title
else dictionary.zim_client.get_article
else dictionary.client.get_item_by_path
)
nopunct = path.strip(string.punctuation).strip()
if is_title:
Expand All @@ -40,23 +40,16 @@ def _get_article(
forms = [path, path, path.lower(), path.title(), path.upper()]
for form in forms:
try:
article = get_article(form)
return article
item = get_item(form)
return item
except KeyError:
pass
# Return first search result, if any
results = dictionary.zim_client.search(path, 0, 1)
if results:
try:
return get_article(results[0].url)
except KeyError:
pass
return None
return dictionary.client.first_result(path)

def get_article(
def get_item(
self, path: str, dictionary: ZIMDict, is_title: bool = False
) -> Article | None:
return self._get_article(path, dictionary, is_title)
) -> ZIMItem | None:
return self._get_item(path, dictionary, is_title)

@abstractmethod
def lookup(self, query: str, dictionary: ZIMDict) -> DictEntry | None:
Expand Down
4 changes: 4 additions & 0 deletions src/errors.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
class ZIMReaderException(Exception):
    """Base class for the exceptions defined in this module."""


class ZIMClientLibNotAvailable(ZIMReaderException):
    """Signals that the library backing a ZIM client is not available."""
Loading

0 comments on commit dc8740e

Please sign in to comment.