diff --git a/.flake8 b/.flake8 index d0da5961..804c52e0 100644 --- a/.flake8 +++ b/.flake8 @@ -7,6 +7,19 @@ inline-quotes = " application-import-names = dedoc, tests, scripts, train_dataset import-order-style = pycharm +extend-immutable-calls = File, Depends + +banned-modules = + dedoc = Use full path + dedoc.data_structures = Use full path + dedoc.attachments_extractors = Use full path + dedoc.attachments_handler = Use full path + dedoc.converters = Use full path + dedoc.metadata_extractors = Use full path + dedoc.readers = Use full path + dedoc.structure_constructors = Use full path + dedoc.structure_extractors = Use full path + exclude = .git, __pycache__, @@ -28,9 +41,11 @@ exclude = # ANN202 - Missing return type annotation for protected function # ANN204 - Missing return type annotation for special method # N802 - function name should be lowercase +# I251 - Banned import (Use full path) ignore = ANN101 per-file-ignores = scripts/*:T201 scripts/benchmark_pdf_performance*:JS101 tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802 + docs/source/_static/code_examples/*:I251 diff --git a/.github/check_version.py b/.github/check_version.py index 9a107efa..06120405 100644 --- a/.github/check_version.py +++ b/.github/check_version.py @@ -23,7 +23,7 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern args = parser.parse_args() print(f"Old version: {args.old_version}, new version: {args.new_version}, " - f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}") # noqa + f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}") master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$") develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$") @@ -43,4 +43,4 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern) assert args.pre_release != "true", "Pre-releases are not allowed on master" - print("Version is correct") # noqa + print("Version is correct") diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f439368..09231202 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,7 @@ repos: flake8-import-order==0.18.2, flake8-multiline-containers==0.0.19, flake8-print==5.0.0, + flake8-tidy-imports==4.10.0, flake8-quotes==3.3.2, flake8-use-fstring==1.4, pycodestyle==2.9.0, diff --git a/Dockerfile b/Dockerfile index 779508df..3d00dea6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,7 @@ ARG REPOSITORY="docker.io" FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 +ARG LANGUAGES="" +RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root" ENV RESOURCES_PATH "/dedoc_root/resources" diff --git a/README.md b/README.md index 5293342e..ab022482 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,18 @@ # Dedoc +[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) [![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc) +[![PyPI downloads](https://pepy.tech/badge/dedoc)](https://pepy.tech/project/dedoc) +[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls") [![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) -[![Documentation 
Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest) -[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) [![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space) -[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls") +[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest) [![CI tests](https://github.com/ispras/dedoc/workflows/CI/badge.svg)](https://github.com/ispras/dedoc/actions) ![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png) Dedoc is an open universal system for converting documents to a unified output format. -It extracts a document’s logical structure and content, its tables, text formatting and metadata. +It extracts a document’s logical structure and content: tables, text formatting and metadata. The document’s content is represented as a tree storing headings and lists of any level. Dedoc can be integrated in a document contents and structure analysis system as a separate module. @@ -22,14 +23,14 @@ Dedoc can be integrated in a document contents and structure analysis system as Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow) ## Features and advantages -Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and none-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. +Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and unstructured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. Document structure extraction is fully automatic regardless of input data type. Metadata and text formatting are also extracted automatically. In 2022, the system won a grant to support the development of promising AI projects from the [Innovation Assistance Foundation (Фонд содействия инновациям)](https://fasie.ru/). ## Dedoc provides: -* Extensibility due to a flexible addition of new document formats and to an easy change of an output data format. +* Extensibility due to flexible addition of new document formats and easy change of an output data format. * Support for extracting document structure out of nested documents having different formats. * Extracting various text formatting features (indentation, font type, size, style etc.). * Working with documents of various origin (statements of work, legal documents, technical reports, scientific papers) allowing flexible tuning for new domains. @@ -68,7 +69,7 @@ The system processes different document formats. The main formats are listed bel ## Impact -This project may be useful as a first step of automatic document analysis pipeline (e.g. before the NLP part). +This project may be useful as a first step of an automatic document analysis pipeline (e.g. before the NLP part). Dedoc is in demand for information analytic systems, information leak monitoring systems, as well as for natural language processing systems. 
The library is intended for application use by developers of systems for automatic analysis and structuring of electronic documents, including for further search in electronic documents. @@ -92,7 +93,7 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io # Installation instructions -This project has REST Api and you can run it in Docker container. +This project has a REST api and you can run it in Docker container. Also, dedoc can be installed as a library via `pip`. There are two ways to install and run dedoc as a web application or a library that are described below. @@ -149,7 +150,7 @@ If you need to change some application settings, you may update `config.py` acco If you don't want to use docker for running the application, it's possible to run dedoc locally. However, it isn't suitable for any operating system (`Ubuntu 20+` is recommended) and -there may be not enough machine's resources for its work. +there may be not enough machine resources for its work. You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed. Installation instructions via pip are available [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-dedoc-using-pip). diff --git a/VERSION b/VERSION index 6b4d1577..04761555 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.3 \ No newline at end of file +2.2.4 \ No newline at end of file diff --git a/dedoc/__init__.py b/dedoc/__init__.py index 82dbebad..e9841fc0 100644 --- a/dedoc/__init__.py +++ b/dedoc/__init__.py @@ -1,2 +1,2 @@ -from .dedoc_manager import DedocManager # noqa -from .version import __version__ # noqa +from .dedoc_manager import DedocManager +from .version import __version__ diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 20d01db1..c77ec19a 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -28,7 +28,7 @@ class QueryParameters: # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], description="Extract text from a text layer of PDF or using OCR methods for image-like documents") - language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language") + language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')") pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') is_one_column_document: str = Form("auto", enum=["auto", "true", "false"], description='One or multiple column document, "auto" - predict number of page columns automatically') diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index c942fefa..ad91f2d8 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -120,12 +120,20 @@ def json2html(text: str, attachments: Optional[List[ParsedDocument]], tabs: int = 0, table2id: Dict[str, int] = None, - attach2id: Dict[str, int] = None) -> str: + attach2id: Dict[str, int] = None, + prev_page_id: Optional[List[int]] = None) -> str: + if prev_page_id is None: + prev_page_id = [0] + tables = [] if tables is None else tables attachments = [] if attachments is None else attachments table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)} if table2id is None else table2id attach2id = {attachment.metadata.uid: attachment_id for attachment_id, attachment in enumerate(attachments)} if attach2id is None else attach2id + if paragraph.metadata.page_id != 
prev_page_id[0]: + text += f"
Page {prev_page_id[0] + 1}
" + prev_page_id[0] = paragraph.metadata.page_id + ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs) if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]: @@ -141,7 +149,8 @@ def json2html(text: str, text += ptext for subparagraph in paragraph.subparagraphs: - text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id) + text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id, + prev_page_id=prev_page_id) if tables is not None and len(tables) > 0: text += "

Tables:
" diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 9e7063f7..1458ffd3 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -7,13 +7,12 @@ import traceback from typing import Optional -import uvicorn from fastapi import Depends, FastAPI, File, Request, Response, UploadFile from fastapi.responses import ORJSONResponse, UJSONResponse from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse -import dedoc +import dedoc.version from dedoc.api.api_args import QueryParameters from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt from dedoc.api.schema.parsed_document import ParsedDocument @@ -53,7 +52,7 @@ def get_static_file(request: Request) -> Response: @app.get("/version") def get_version() -> Response: - return PlainTextResponse(dedoc.__version__) + return PlainTextResponse(dedoc.version.__version__) def _get_static_file_path(request: Request) -> str: @@ -70,10 +69,10 @@ def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_ @app.post("/upload", response_model=ParsedDocument) -async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa +async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: parameters = dataclasses.asdict(query_params) if not file or file.filename == "": - raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__) + raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__) return_format = str(parameters.get("return_format", "json")).lower() @@ -152,4 +151,5 @@ def get_api() -> FastAPI: def run_api(app: FastAPI) -> None: + import uvicorn uvicorn.run(app=app, host="0.0.0.0", port=int(PORT)) diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index e045d483..d98a9161 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -129,14 +129,15 @@

PDF handling
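The `json2html` change above threads a one-element list (`prev_page_id`) through the recursive calls: whenever the page id of the node being rendered differs from the value stored in `prev_page_id[0]`, a page marker is emitted and the stored value is updated. Because the list is mutable and shared, every recursion level sees the same counter. Below is a minimal, self-contained sketch of that pattern — the `Node` dataclass and the plain-text page marker are simplifications for illustration, not dedoc's actual data structures or HTML output.

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Node:
    # Simplified stand-in for dedoc's document tree node: text, page id and children.
    text: str
    page_id: int
    subparagraphs: List["Node"] = field(default_factory=list)


def render(text: str, node: Node, prev_page_id: Optional[List[int]] = None) -> str:
    # The single-element list is shared by all recursive calls,
    # so updating prev_page_id[0] is visible at every level of the tree.
    if prev_page_id is None:
        prev_page_id = [0]

    if node.page_id != prev_page_id[0]:
        # Mark the end of the previous page (page ids are zero-based, hence +1).
        text += f"--- Page {prev_page_id[0] + 1} ---\n"
        prev_page_id[0] = node.page_id

    text += node.text + "\n"
    for child in node.subparagraphs:
        text = render(text, child, prev_page_id=prev_page_id)
    return text


if __name__ == "__main__":
    tree = Node("root", 0, [Node("line on page 1", 0), Node("line on page 2", 1)])
    print(render("", tree))
```

Accepting the counter as an optional argument that defaults to `None` and is replaced with a fresh `[0]` on the outermost call keeps the public signature unchanged for existing callers, which is how the new `prev_page_id` parameter of `json2html` is wired in.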
diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py index 21a4497b..d55859bf 100644 --- a/dedoc/attachments_extractors/abstract_attachment_extractor.py +++ b/dedoc/attachments_extractors/abstract_attachment_extractor.py @@ -1,12 +1,7 @@ -import logging -import os -import uuid from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple from dedoc.data_structures.attached_file import AttachedFile -from dedoc.utils.parameter_utils import get_param_attachments_dir -from dedoc.utils.utils import get_mime_extension, save_data_to_unique_file class AbstractAttachmentsExtractor(ABC): @@ -19,6 +14,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param recognized_mimes: set of supported MIME types of files """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions @@ -39,6 +36,7 @@ def can_extract(self, :param parameters: any additional parameters for the given document :return: the indicator of possibility to get attachments of this file """ + from dedoc.utils.utils import get_mime_extension mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes @@ -66,7 +64,13 @@ def with_attachments(parameters: dict) -> bool: return str(parameters.get("with_attachments", "false")).lower() == "true" def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool, parameters: dict) -> List[AttachedFile]: + import os + import uuid + from dedoc.utils.parameter_utils import get_param_attachments_dir + from dedoc.utils.utils import save_data_to_unique_file + attachments = [] + attachments_dir = get_param_attachments_dir(parameters, tmpdir) for original_name, contents in content: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py index eaac7cf5..d267f4ff 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py @@ -1,14 +1,8 @@ -import os -import zipfile from abc import ABC from typing import List, Optional, Set, Tuple -import olefile -from charset_normalizer import from_bytes - from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.utils.parameter_utils import get_param_need_content_analysis class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC): @@ -25,6 +19,8 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: :param stream: binary content of olefile :return: tuple of (name of original file and binary file content) """ + from charset_normalizer import from_bytes + # original filename in ANSI starts at byte 7 and is null terminated stream = stream[6:] @@ -65,6 +61,11 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: return filename, contents def 
_get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachments_dir: str) -> List[AttachedFile]: + import olefile + import os + import zipfile + from dedoc.utils.parameter_utils import get_param_need_content_analysis + result = [] with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index ea5da542..8c054d0d 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -1,17 +1,9 @@ -import hashlib -import os -import re -import tempfile -import zipfile from typing import List, Optional - -from bs4 import BeautifulSoup, Tag +from zipfile import BadZipFile, ZipFile from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.parameter_utils import get_param_need_content_analysis class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -19,6 +11,7 @@ class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): Extract attachments from docx files. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -28,11 +21,14 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + import os + from dedoc.utils.parameter_utils import get_param_need_content_analysis + parameters = {} if parameters is None else parameters tmpdir, filename = os.path.split(file_path) result = [] try: - with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: + with ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diagram_attachments = self.__extract_diagrams(zfile) need_content_analysis = get_param_need_content_analysis(parameters) result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis, @@ -40,17 +36,23 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word") - except zipfile.BadZipFile: + except BadZipFile: raise BadFileFormatError(f"Bad docx file:\n file_name = {filename}. Seems docx is broken") return result - def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: + def __extract_diagrams(self, document: ZipFile) -> List[tuple]: """ Creates files for diagram: separate file for each paragraph with diagram. 
:param document: archive with docx document :returns: list of files with diagrams """ + import hashlib + import os + import re + import tempfile + from bs4 import BeautifulSoup, Tag + result = [] try: content = document.read("word/document.xml") @@ -85,7 +87,7 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: with open(f"{tmpdir}/word/document.xml", "w") as f: f.write(doc_text) diagram_name = f"{uid}.docx" - with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d: + with ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d: for filename in namelist: new_d.write(os.path.join(tmpdir, filename), arcname=filename) with open(os.path.join(tmpdir, diagram_name), "rb") as f: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py index 7097e5d3..70d64ce8 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py @@ -1,9 +1,7 @@ -import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -11,6 +9,7 @@ class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): Extracts attachments from xlsx files. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.excel_like_format, recognized_mimes=recognized_mimes.excel_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -20,6 +19,8 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + import os + parameters = {} if parameters is None else parameters tmpdir, filename = os.path.split(file_path) return self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="xl") diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py index a68c5848..a1f47bad 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py @@ -1,11 +1,7 @@ -import json -import os from typing import List, Optional from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.parameter_utils import get_param_need_content_analysis class JsonAttachmentsExtractor(AbstractAttachmentsExtractor): @@ -13,6 +9,7 @@ class JsonAttachmentsExtractor(AbstractAttachmentsExtractor): Extract attachments from json files. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -32,6 +29,10 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + import json + import os + from dedoc.utils.parameter_utils import get_param_need_content_analysis + parameters = {} if parameters is None else parameters tmpdir, filename = os.path.split(file_path) attachments = [] diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index b152ca63..fc58af36 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -1,17 +1,9 @@ -import json -import os -import uuid from typing import List, Optional, Tuple -import PyPDF2 -from PyPDF2.pdf import PageObject -from PyPDF2.utils import PdfReadError +from PyPDF2.pdf import PageObject, PdfFileReader from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis -from dedoc.utils.utils import convert_datetime, get_unique_name class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): @@ -19,6 +11,7 @@ class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): Extract attachments from pdf files. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -28,12 +21,16 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + import os + from PyPDF2.utils import PdfReadError + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + parameters = {} if parameters is None else parameters filename = os.path.basename(file_path) with open(file_path, "rb") as handler: try: - reader = PyPDF2.PdfFileReader(handler) + reader = PdfFileReader(handler) except Exception as e: self.logger.warning(f"can't handle {filename}, get {e}") return [] @@ -52,6 +49,8 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att return self._content2attach_file(content=attachments, tmpdir=attachments_dir, need_content_analysis=need_content_analysis, parameters=parameters) def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: + from dedoc.utils.utils import convert_datetime + attachments = [] if "/Annots" in page.keys(): for annot in page["/Annots"]: @@ -72,7 +71,7 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: attachments.append((name, bytes(content))) return attachments - def __get_page_level_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str, bytes]]: + def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]: cnt_page = reader.getNumPages() attachments = [] for i in range(cnt_page): @@ -82,12 +81,14 @@ def __get_page_level_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tup return attachments - def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str, bytes]]: + def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]: """ Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring. :return: dictionary of filenames and bytestrings """ + import uuid + attachments = [] catalog = reader.trailer["/Root"] if "/Names" in catalog.keys() and "/EmbeddedFiles" in catalog["/Names"].keys() and "/Names" in catalog["/Names"]["/EmbeddedFiles"].keys(): @@ -104,6 +105,9 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str return attachments def __create_note(self, content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: + import json + from dedoc.utils.utils import get_unique_name + filename = get_unique_name("note.json") note_dict = { "content": content, diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py index f0fd8c9f..bd26456f 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py @@ -3,7 +3,6 @@ from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -11,6 +10,7 @@ class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): Extract attachments from pptx files. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index b657dd88..8d88f9e6 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -1,14 +1,11 @@ -import copy -import logging -import os -import time from typing import List, Optional from dedoc.common.exceptions.dedoc_error import DedocError -from dedoc.data_structures import AttachedFile, DocumentMetadata, ParsedDocument +from dedoc.data_structures.attached_file import AttachedFile +from dedoc.data_structures.document_metadata import DocumentMetadata +from dedoc.data_structures.parsed_document import ParsedDocument from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.utils.parameter_utils import get_param_with_attachments -from dedoc.utils.utils import get_empty_content +from dedoc.dedoc_manager import DedocManager class AttachmentsHandler: @@ -26,10 +23,12 @@ def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: configuration of the handler, e.g. logger for logging """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) - def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa + def handle_attachments(self, document_parser: DedocManager, document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: """ Handle attachments of the document in the intermediate representation. @@ -39,6 +38,11 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct are important, look to the API parameters documentation for more details). 
:return: list of parsed document attachments """ + import copy + import os + import time + from dedoc.utils.parameter_utils import get_param_with_attachments + attachments = [] recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1 @@ -76,7 +80,8 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct attachments.append(parsed_file) return attachments - def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa + def __get_empty_document(self, document_parser: DedocManager, attachment: AttachedFile, parameters: dict) -> ParsedDocument: + from dedoc.utils.utils import get_empty_content metadata = document_parser.document_metadata_extractor.extract( file_path=attachment.get_filename_in_path(), original_filename=attachment.get_original_filename(), diff --git a/dedoc/common/exceptions/dedoc_error.py b/dedoc/common/exceptions/dedoc_error.py index 78426e39..f91c8bd0 100644 --- a/dedoc/common/exceptions/dedoc_error.py +++ b/dedoc/common/exceptions/dedoc_error.py @@ -1,6 +1,6 @@ from typing import Optional -import dedoc +import dedoc.version class DedocError(Exception): @@ -14,7 +14,7 @@ def __init__(self, self.msg = msg self.msg_api = msg if msg_api is None else msg_api self.filename = filename - self.version = version if version is not None else dedoc.__version__ + self.version = version if version is not None else dedoc.version.__version__ self.metadata = metadata def __str__(self) -> str: diff --git a/dedoc/config.py b/dedoc/config.py index 06d98894..58ebc88a 100644 --- a/dedoc/config.py +++ b/dedoc/config.py @@ -1,55 +1,3 @@ -import logging -import os -import sys - -logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") - -DEBUG_MODE = False -RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) - -_config = dict( - # -----------------------------------------RESOURCES PATH SETTINGS---------------------------------------------------- - resources_path=RESOURCES_PATH, - intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"), - table_path="/tmp/tables", - - # -----------------------------------------COMMON DEBUG SETTINGS---------------------------------------------------- - debug_mode=DEBUG_MODE, - path_debug=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc"), - - # --------------------------------------------JOBLIB SETTINGS------------------------------------------------------- - # number of parallel jobs in some tasks as OCR - n_jobs=1, - - # --------------------------------------------GPU SETTINGS---------------------------------------------------------- - # set gpu in XGBoost and torch models - on_gpu=False, - - # ---------------------------------------------API SETTINGS--------------------------------------------------------- - # max file size in bytes - max_content_length=512 * 1024 * 1024, - # application port - api_port=int(os.environ.get("DOCREADER_PORT", "1231")), - static_files_dirs={}, - # log settings - logger=logging.getLogger(), - import_path_init_api_args="dedoc.api.api_args", - - # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS------------------------------------------- - # path to save debug images for tables recognizer - path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"), - - # 
-------------------------------------------RECOGNIZE SETTINGS----------------------------------------------------- - # TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value) - ocr_conf_threshold=40.0, - # max depth of document structure tree - recursion_deep_subparagraphs=30, - - # -------------------------------------------EXTERNAL SERVICES SETTINGS--------------------------------------------- - grobid_max_connection_attempts=3 -) - - class Configuration(object): """ Pattern Singleton for configuration service @@ -70,7 +18,54 @@ def get_instance(cls: "Configuration") -> "Configuration": def get_config(self) -> dict: if self.__config is None: - self.__config = _config + import logging + import os + import sys + + logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") + + resources_path = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) + self.__config = dict( + # -----------------------------------------RESOURCES PATH SETTINGS---------------------------------------------------- + resources_path=resources_path, + intermediate_data_path=os.path.join(resources_path, "datasets"), + table_path="/tmp/tables", + + # -----------------------------------------COMMON DEBUG SETTINGS---------------------------------------------------- + debug_mode=False, + path_debug=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc"), + + # --------------------------------------------JOBLIB SETTINGS------------------------------------------------------- + # number of parallel jobs in some tasks as OCR + n_jobs=1, + + # --------------------------------------------GPU SETTINGS---------------------------------------------------------- + # set gpu in XGBoost and torch models + on_gpu=False, + + # ---------------------------------------------API SETTINGS--------------------------------------------------------- + # max file size in bytes + max_content_length=512 * 1024 * 1024, + # application port + api_port=int(os.environ.get("DOCREADER_PORT", "1231")), + static_files_dirs={}, + # log settings + logger=logging.getLogger(), + import_path_init_api_args="dedoc.api.api_args", + + # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS------------------------------------------- + # path to save debug images for tables recognizer + path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"), + + # -------------------------------------------RECOGNIZE SETTINGS----------------------------------------------------- + # TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value) + ocr_conf_threshold=40.0, + # max depth of document structure tree + recursion_deep_subparagraphs=30, + + # -------------------------------------------EXTERNAL SERVICES SETTINGS--------------------------------------------- + grobid_max_connection_attempts=3 + ) return self.__config diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py index d4385fe4..0e9f5310 100644 --- a/dedoc/converters/concrete_converters/abstract_converter.py +++ b/dedoc/converters/concrete_converters/abstract_converter.py @@ -1,11 +1,7 @@ -import logging -import os -import subprocess from abc import ABC, abstractmethod from typing import List, Optional, Set from dedoc.common.exceptions.conversion_error import ConversionError -from 
dedoc.utils.utils import get_mime_extension class AbstractConverter(ABC): @@ -18,6 +14,8 @@ def __init__(self, *, config: Optional[dict] = None, converted_extensions: Optio :param converted_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param converted_mimes: set of supported MIME types of files """ + import logging + self.timeout = 60 self.period_checking = 0.05 self.config = {} if config is None else config @@ -40,6 +38,8 @@ def can_convert(self, :param parameters: any additional parameters for the given document :return: the indicator of possibility to convert this file """ + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in self._converted_extensions or mime in self._converted_mimes @@ -58,6 +58,9 @@ def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: pass def _run_subprocess(self, command: List[str], filename: str, expected_path: str) -> None: + import os + import subprocess + try: conversion_results = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=self.timeout) error_message = conversion_results.stderr.decode().strip() diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py index ba7741cf..bfb00cb4 100644 --- a/dedoc/converters/concrete_converters/binary_converter.py +++ b/dedoc/converters/concrete_converters/binary_converter.py @@ -1,9 +1,6 @@ from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.converters.concrete_converters.png_converter import PNGConverter -from dedoc.utils import supported_image_types -from dedoc.utils.utils import get_mime_extension class BinaryConverter(AbstractConverter): @@ -12,6 +9,7 @@ class BinaryConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.converters.concrete_converters.png_converter import PNGConverter super().__init__(config=config) self.png_converter = PNGConverter(config=self.config) @@ -23,6 +21,9 @@ def can_convert(self, """ Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`. """ + from dedoc.utils import supported_image_types + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return mime == "application/octet-stream" and extension in supported_image_types diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py index 539a5d14..3422e400 100644 --- a/dedoc/converters/concrete_converters/docx_converter.py +++ b/dedoc/converters/concrete_converters/docx_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class DocxConverter(AbstractConverter): @@ -12,12 +9,16 @@ class DocxConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.docx_like_format, converted_mimes=converted_mimes.docx_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the docx-like documents into files with .docx extension using the soffice application. """ + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", file_dir, file_path] diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py index 2d3d2b59..351e5312 100644 --- a/dedoc/converters/concrete_converters/excel_converter.py +++ b/dedoc/converters/concrete_converters/excel_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class ExcelConverter(AbstractConverter): @@ -12,12 +9,16 @@ class ExcelConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.excel_like_format, converted_mimes=converted_mimes.excel_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the xlsx-like documents into files with .xlsx extension using the soffice application. """ + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", file_dir, file_path] diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index 44b9fba1..306890ed 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class PDFConverter(AbstractConverter): @@ -12,12 +9,16 @@ class PDFConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.pdf_like_format, converted_mimes=converted_mimes.pdf_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pdf-like documents into files with .pdf extension using the ddjvu application. 
""" + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pdf") diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py index fc04f876..fe616299 100644 --- a/dedoc/converters/concrete_converters/png_converter.py +++ b/dedoc/converters/concrete_converters/png_converter.py @@ -1,13 +1,7 @@ -import os from typing import Optional -import cv2 -from PIL import Image, UnidentifiedImageError - from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class PNGConverter(AbstractConverter): @@ -16,12 +10,18 @@ class PNGConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.image_like_format, converted_mimes=converted_mimes.image_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the image-like documents into files with .png extension. """ + import os + import cv2 + from PIL import Image, UnidentifiedImageError + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, extension = splitext_(file_name) converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.png") diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py index 3eef1f61..c6eef47d 100644 --- a/dedoc/converters/concrete_converters/pptx_converter.py +++ b/dedoc/converters/concrete_converters/pptx_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class PptxConverter(AbstractConverter): @@ -12,12 +9,16 @@ class PptxConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.pptx_like_format, converted_mimes=converted_mimes.pptx_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pptx-like documents into files with .pptx extension using the soffice application. 
""" + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", file_dir, file_path] diff --git a/dedoc/converters/concrete_converters/txt_converter.py b/dedoc/converters/concrete_converters/txt_converter.py index f0b71147..c8cfaf6c 100644 --- a/dedoc/converters/concrete_converters/txt_converter.py +++ b/dedoc/converters/concrete_converters/txt_converter.py @@ -1,10 +1,6 @@ -import os -import shutil from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class TxtConverter(AbstractConverter): @@ -13,12 +9,17 @@ class TxtConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.txt_like_format, converted_mimes=converted_mimes.txt_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the txt-like documents into files with .txt extension by renaming it. """ + import os + import shutil + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.txt") diff --git a/dedoc/converters/converter_composition.py b/dedoc/converters/converter_composition.py index d6d24f91..d9f7533b 100644 --- a/dedoc/converters/converter_composition.py +++ b/dedoc/converters/converter_composition.py @@ -1,9 +1,6 @@ -import os -from stat import S_IREAD, S_IRGRP, S_IROTH from typing import List, Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.utils.utils import get_mime_extension class ConverterComposition: @@ -30,6 +27,10 @@ def convert(self, file_path: str, parameters: Optional[dict] = None, extension: :param mime: MIME type of file :return: path of converted file if conversion was executed else path of the original file """ + import os + from stat import S_IREAD, S_IRGRP, S_IROTH + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, extension=extension, mime=mime) converted_file_path = file_path diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index f544e9e7..c2ff7cac 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -1,7 +1,5 @@ from typing import List -import numpy as np - from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_with_meta import LineWithMeta @@ -40,9 +38,11 @@ def get_annotations(self) -> List[Annotation]: return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations @staticmethod - def create_from_cell(cell: "Cell") -> "CellWithMeta": # noqa + def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta": return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible) def to_api_schema(self) -> ApiCellWithMeta: + import numpy as np + lines = 
[line.to_api_schema() for line in self.lines] return ApiCellWithMeta(lines=lines, colspan=int(np.int8(self.colspan)), rowspan=int(np.int8(self.rowspan)), invisible=self.invisible) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index bd453d24..303c9eb0 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -1,4 +1,3 @@ -import json from typing import Tuple from dedocutils.data_structures import BBox @@ -20,6 +19,8 @@ def __init__(self, start: int, end: int, value: BBox, page_width: int, page_heig :param page_width: width of original image with this bbox :param page_height: height of original image with this bbox """ + import json + if not isinstance(value, BBox): raise ValueError("the value of bounding box annotation should be instance of BBox") @@ -30,6 +31,8 @@ def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]: """ Returns: BBox object, page_width, page_height """ + import json + bbox_dict = json.loads(value) bbox = BBox(x_top_left=int(bbox_dict["x_top_left"] * bbox_dict["page_width"]), y_top_left=int(bbox_dict["y_top_left"] * bbox_dict["page_height"]), diff --git a/dedoc/data_structures/concrete_annotations/color_annotation.py b/dedoc/data_structures/concrete_annotations/color_annotation.py index 4b6983d6..8ddd2479 100644 --- a/dedoc/data_structures/concrete_annotations/color_annotation.py +++ b/dedoc/data_structures/concrete_annotations/color_annotation.py @@ -1,6 +1,3 @@ -import json -from collections import OrderedDict - from dedoc.data_structures.annotation import Annotation @@ -18,6 +15,9 @@ def __init__(self, start: int, end: int, red: float, green: float, blue: float) :param green: mean value of the green color component in the pixels that are not white in the given bounding box :param blue: mean value of the blue color component in the pixels that are not white in the given bounding box """ + import json + from collections import OrderedDict + assert red >= 0 assert green >= 0 assert blue >= 0 diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index e93b2c16..ec51d143 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,5 +1,4 @@ -import uuid -from typing import Any, Dict, Union +from typing import Dict, Union from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -30,6 +29,8 @@ def __init__(self, :param access_time: time of the last access to the file in unixtime :param file_type: mime type of the file """ + import uuid + self.file_name = file_name self.temporary_file_name = temporary_file_name self.size = size @@ -41,7 +42,7 @@ def __init__(self, self.add_attribute(key, value) self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid - def add_attribute(self, key: str, value: Any) -> None: # noqa + def add_attribute(self, key: str, value: Union[str, int, float]) -> None: setattr(self, key, value) def to_api_schema(self) -> ApiDocumentMetadata: diff --git a/dedoc/data_structures/hierarchy_level.py b/dedoc/data_structures/hierarchy_level.py index 9df3304c..ab2ea053 100644 --- a/dedoc/data_structures/hierarchy_level.py +++ b/dedoc/data_structures/hierarchy_level.py @@ -1,8 +1,6 @@ from functools import total_ordering from typing import Optional -import numpy as np - @total_ordering 
class HierarchyLevel: @@ -89,6 +87,8 @@ def __str__(self) -> str: return f"HierarchyLevel(level_1={self.level_1}, level_2={self.level_2}, can_be_multiline={self.can_be_multiline}, line_type={self.line_type})" def __to_number(self, x: Optional[int]) -> int: + import numpy as np + return np.inf if x is None else x def is_raw_text(self) -> bool: diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 798a1712..a16a7dd2 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -1,13 +1,9 @@ -import re -from copy import deepcopy from typing import List, Optional, Sized, Union -from uuid import uuid1 from dedoc.api.schema.line_with_meta import LineWithMeta as ApiLineWithMeta from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.serializable import Serializable -from dedoc.utils.annotation_merger import AnnotationMerger class LineWithMeta(Sized, Serializable): @@ -24,6 +20,7 @@ def __init__(self, line: str, metadata: Optional[LineMetadata] = None, annotatio :param annotations: metadata that refers to some part of the text, for example, font size, font type, etc. :param uid: unique identifier of the line """ + from uuid import uuid1 self._line = line self._metadata = LineMetadata(page_id=0, line_id=None) if metadata is None else metadata @@ -43,6 +40,8 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta": :param delimiter: delimiter to insert between lines :return: merged line """ + from copy import deepcopy + if len(lines) == 0: return LineWithMeta("") @@ -65,6 +64,8 @@ def split(self, sep: str) -> List["LineWithMeta"]: :param sep: separator for splitting :return: list of split lines """ + import re + if not sep: raise ValueError("empty separator") borders = set() @@ -140,6 +141,8 @@ def __repr__(self) -> str: f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})") def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta": + from dedoc.utils.annotation_merger import AnnotationMerger + assert isinstance(other, (LineWithMeta, str)) if len(other) == 0: return self diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 9b671e04..862b87d0 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -1,6 +1,5 @@ from typing import List, Optional -import dedoc from dedoc.api.schema.parsed_document import ParsedDocument as ApiParsedDocument from dedoc.data_structures.document_content import DocumentContent from dedoc.data_structures.document_metadata import DocumentMetadata @@ -36,7 +35,9 @@ def set_metadata(self, metadata: DocumentMetadata) -> None: self.metadata = metadata def to_api_schema(self) -> ApiParsedDocument: + import dedoc.version + content = self.content.to_api_schema() metadata = self.metadata.to_api_schema() attachments = [attachment.to_api_schema() for attachment in self.attachments] if self.attachments is not None else [] - return ApiParsedDocument(content=content, metadata=metadata, version=dedoc.__version__, warnings=self.warnings, attachments=attachments) + return ApiParsedDocument(content=content, metadata=metadata, version=dedoc.version.__version__, warnings=self.warnings, attachments=attachments) diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py 
index fc934d9a..e85c747e 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -1,4 +1,3 @@ -import uuid from typing import Optional from dedoc.api.schema.table_metadata import TableMetadata as ApiTableMetadata @@ -16,6 +15,8 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_an :param rotated_angle: value of the rotation angle by which the table was rotated during recognition :param title: table's title """ + import uuid + self.page_id = page_id self.uid = str(uuid.uuid4()) if not uid else uid self.rotated_angle = rotated_angle diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 9d8ba676..6cde3554 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -6,7 +6,6 @@ from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.serializable import Serializable -from dedoc.utils.annotation_merger import AnnotationMerger class TreeNode(Serializable): @@ -52,20 +51,18 @@ def create(lines: List[LineWithMeta] = None) -> "TreeNode": """ page_id = 0 if len(lines) == 0 else min((line.metadata.page_id for line in lines)) line_id = 0 if len(lines) == 0 else min((line.metadata.line_id for line in lines)) + metadata = LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root()) texts = (line.line for line in lines) annotations = [] text_length = 0 for line in lines: annotations.extend(TreeNode.__shift_annotations(line=line, text_length=text_length)) + TreeNode.__add_additional_page_id(start=text_length, metadata=metadata, other_line=line) + text_length += len(line.line) text = "".join(texts) - return TreeNode("0", - text, - annotations=annotations, - metadata=LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root()), - subparagraphs=[], - parent=None) + return TreeNode("0", text, annotations=annotations, metadata=metadata, subparagraphs=[], parent=None) def add_child(self, line: LineWithMeta) -> "TreeNode": """ @@ -94,6 +91,7 @@ def add_text(self, line: LineWithMeta) -> None: text_length = len(self.text) new_annotations = self.__shift_annotations(line, text_length) + self.__add_additional_page_id(start=len(self.text), metadata=self.metadata, other_line=line) self.text += line.line self.annotations.extend(new_annotations) @@ -115,6 +113,8 @@ def get_root(self) -> "TreeNode": return node def merge_annotations(self) -> None: + from dedoc.utils.annotation_merger import AnnotationMerger + root = self.get_root() stack = [root] merger = AnnotationMerger() @@ -123,3 +123,30 @@ def merge_annotations(self) -> None: node.annotations = merger.merge_annotations(node.annotations, node.text) for sub_node in node.subparagraphs: stack.append(sub_node) + + @staticmethod + def __add_additional_page_id(start: int, metadata: LineMetadata, other_line: LineWithMeta) -> None: + """ + Adds additional page_id metadata for multi-page nodes. 
+ + If the node is located on several pages, its metadata will contain an "additional_page_ids" attribute with a list of dicts: + { + start: start index of the text on the next page, + end: end index (not included), + page_id: page id where this textual part (node_text[start:end]) is located + } + """ + if metadata.page_id == other_line.metadata.page_id: + return + + if hasattr(metadata, "additional_page_ids"): + last_page_id = metadata.additional_page_ids[-1]["page_id"] + if last_page_id == other_line.metadata.page_id: + metadata.additional_page_ids[-1]["end"] = start + len(other_line.line) + return + + additional_page_id = {"start": start, "end": start + len(other_line.line), "page_id": other_line.metadata.page_id} + if hasattr(metadata, "additional_page_ids"): + metadata.additional_page_ids.append(additional_page_id) + else: + metadata.additional_page_ids = [additional_page_id] diff --git a/dedoc/data_structures/unstructured_document.py b/dedoc/data_structures/unstructured_document.py index 29e82917..94197e2e 100644 --- a/dedoc/data_structures/unstructured_document.py +++ b/dedoc/data_structures/unstructured_document.py @@ -28,3 +28,6 @@ def __init__(self, self.attachments = attachments self.warnings = warnings if warnings else [] self.metadata = metadata if metadata is not None else {} + + def get_text(self) -> str: + return LineWithMeta.join(self.lines).line diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index 668bc07f..62b7302e 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -1,20 +1,11 @@ -import logging -import os.path -import shutil -import tempfile from typing import Dict, Optional, Tuple from dedoc.api.api_args import QueryParameters from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.common.exceptions.dedoc_error import DedocError -from dedoc.config import get_config -from dedoc.data_structures import ParsedDocument, UnstructuredDocument -from dedoc.extensions import mime2extension -from dedoc.manager_config import get_manager_config -from dedoc.metadata_extractors import BaseMetadataExtractor -from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta -from dedoc.utils.utils import get_file_mime_by_content, get_mime_extension, get_unique_name +from dedoc.data_structures.parsed_document import ParsedDocument +from dedoc.data_structures.unstructured_document import UnstructuredDocument class DedocManager: @@ -42,6 +33,11 @@ def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] - document_metadata_extractor (:class:`~dedoc.metadata_extractors.MetadataExtractorComposition`) - attachments_handler (:class:`~dedoc.attachments_handler.AttachmentsHandler`) """ + import logging + + from dedoc.config import get_config + from dedoc.manager_config import get_manager_config + self.config = get_config() if config is None else config self.logger = self.config.get("logger", logging.getLogger()) manager_config = get_manager_config(self.config) if manager_config is None else manager_config @@ -69,12 +65,16 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> :param parameters: any parameters, specify how to parse file, see :ref:`parameters_description` for more details :return: parsed document """ + import os.path + parameters = self.__init_parameters(file_path, parameters) self.logger.info(f"Get file {os.path.basename(file_path)} with parameters {parameters}") try: return
self.__parse_no_error_handling(file_path=file_path, parameters=parameters) except DedocError as e: + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + file_dir, file_name = os.path.split(file_path) e.filename = file_name e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name) @@ -88,6 +88,11 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) :param parameters: any parameters, specify how to parse file :return: parsed document """ + import os.path + import shutil + import tempfile + from dedoc.utils.utils import get_unique_name + if not os.path.isfile(path=file_path): raise FileNotFoundError(file_path) self.logger.info(f"Start handle {file_path}") @@ -124,6 +129,8 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) return parsed_document def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict: + import os.path + parameters = {} if parameters is None else parameters result_parameters = {} @@ -136,6 +143,10 @@ def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict: return result_parameters def __read_with_mime_auto_detection(self, file_path: str, file_name: str, parameters: Optional[dict]) -> Tuple[str, UnstructuredDocument]: + import os.path + from dedoc.extensions import mime2extension + from dedoc.utils.utils import get_file_mime_by_content, get_mime_extension + # firstly, try to read file using its original extension mime, extension = get_mime_extension(file_path=file_path) try: @@ -156,6 +167,9 @@ def __read_with_mime_auto_detection(self, file_path: str, file_name: str, parame return converted_file_path, document def __parse_file(self, file_path: str, file_name: str, parameters: Optional[dict], extension: str, mime: str) -> Tuple[str, UnstructuredDocument]: + import os.path + from dedoc.utils.utils import get_mime_extension + converted_file_path = self.converter.convert(file_path, parameters=parameters, mime=mime, extension=extension) if converted_file_path != file_path: mime, extension = get_mime_extension(file_path=converted_file_path) @@ -168,6 +182,10 @@ def __parse_file(self, file_path: str, file_name: str, parameters: Optional[dict return converted_file_path, unstructured_document def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None: + import os.path + import shutil + from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta + self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}') save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path)) shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path))) diff --git a/dedoc/download_models.py b/dedoc/download_models.py index b520a7df..7fa611bd 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -1,10 +1,4 @@ """Downloading models in advance inside the docker container.""" -import os -import shutil - -from huggingface_hub import hf_hub_download - -from dedoc.config import get_config """ These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc. 
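With DedocManager's imports deferred into __init__ and parse as above, the caller-facing behaviour is unchanged; a minimal usage sketch (the file name and the parameter value are illustrative assumptions, not part of the diff):

from dedoc.dedoc_manager import DedocManager

manager = DedocManager()  # config and manager_config are resolved lazily inside __init__
parsed = manager.parse("example.docx", parameters={"with_attachments": "true"})
print(parsed.warnings)  # parsed is a ParsedDocument; parsed.to_api_schema() gives the API representation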
@@ -21,12 +15,18 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str) -> None: + import os + import shutil + from huggingface_hub import hf_hub_download + os.makedirs(out_dir, exist_ok=True) path = os.path.realpath(hf_hub_download(repo_id=f"dedoc/{repo_name}", filename=hub_name, revision=model_hash_dict[repo_name])) shutil.move(path, os.path.join(out_dir, out_name)) def download(resources_path: str) -> None: + import os + download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz") download_from_hub(out_dir=resources_path, @@ -53,5 +53,7 @@ def download(resources_path: str) -> None: if __name__ == "__main__": + from dedoc.config import get_config + resources_path = get_config()["resources_path"] download(resources_path) diff --git a/dedoc/extensions.py b/dedoc/extensions.py index 7037eedb..069642e0 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -1,7 +1,6 @@ from collections import namedtuple from typing import List -from dedoc.utils.utils import get_extensions_by_mimes Extensions = namedtuple("Parts", [ "excel_like_format", @@ -125,6 +124,8 @@ def get_image_extensions() -> List[str]: + from dedoc.utils.utils import get_extensions_by_mimes + image_extensions = get_extensions_by_mimes(converted_mimes.image_like_format) image_extensions.extend(get_extensions_by_mimes(recognized_mimes.image_like_format)) image_extensions.extend(converted_extensions.image_like_format) diff --git a/dedoc/main.py b/dedoc/main.py index 9c97125c..1d650a1a 100644 --- a/dedoc/main.py +++ b/dedoc/main.py @@ -1,7 +1,7 @@ -from dedoc.api.dedoc_api import get_api, run_api # noqa from dedoc.config import Configuration if __name__ == "__main__": + from dedoc.api.dedoc_api import get_api, run_api Configuration.get_instance().get_config() run_api(get_api()) diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 5fca1063..dfcd451d 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -1,10 +1,6 @@ -import logging -import os from abc import ABC, abstractmethod from typing import Optional, Set, Tuple -from dedoc.utils.utils import get_mime_extension - class AbstractMetadataExtractor(ABC): """ @@ -16,6 +12,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param recognized_mimes: set of supported MIME types of files """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions @@ -41,6 +39,9 @@ def can_extract(self, :param extension: file extension, for example .doc or .pdf :return: True if the extractor can handle the given file and False otherwise """ + import os + from dedoc.utils.utils import get_mime_extension + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) converted_file_path = os.path.join(file_dir, converted_filename) mime, extension = get_mime_extension(file_path=converted_file_path, mime=mime, extension=extension) @@ -66,6 +67,8 @@ def extract(self, pass def _get_names(self, file_path: str, converted_filename: Optional[str], original_filename: Optional[str]) -> 
Tuple[str, str, str, str]: + import os + file_dir, file_name = os.path.split(file_path) converted_filename = file_name if converted_filename is None else converted_filename original_filename = file_name if original_filename is None else original_filename diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index 7986aaf2..c75681df 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -1,9 +1,6 @@ -import os -from base64 import b64encode from typing import Optional from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor -from dedoc.utils.utils import get_file_mime_type class BaseMetadataExtractor(AbstractMetadataExtractor): @@ -42,6 +39,9 @@ def extract(self, Gets the basic meta-information about the file. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + from base64 import b64encode + import os + parameters = {} if parameters is None else parameters file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) meta_info = self._get_base_meta_information(file_dir, file_name, original_filename) @@ -54,6 +54,9 @@ def extract(self, @staticmethod def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict: + import os + from dedoc.utils.utils import get_file_mime_type + (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) = os.stat(os.path.join(directory, filename)) meta = { "file_name": name_actual, diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 6234cd67..dc0bd949 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -1,11 +1,6 @@ -import os from datetime import datetime from typing import Optional -import docx -from docx.opc.exceptions import PackageNotFoundError - -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -26,6 +21,8 @@ class DocxMetadataExtractor(AbstractMetadataExtractor): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) self.base_extractor = BaseMetadataExtractor(config=config) @@ -38,6 +35,8 @@ def extract(self, Add the predefined list of metadata for the docx documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" + import os + parameters = {} if parameters is None else parameters file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) @@ -53,6 +52,10 @@ def __convert_date(self, date: Optional[datetime]) -> Optional[int]: return None if date is None else int(date.timestamp()) def _get_docx_fields(self, file_path: str) -> dict: + import docx + from docx.opc.exceptions import PackageNotFoundError + import os + assert os.path.isfile(file_path) try: doc = docx.Document(file_path) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index bbf2e3a1..9ec09bb1 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -1,12 +1,5 @@ -import math -import os from typing import Optional, Union -import piexif -from PIL import ExifTags, Image -from dateutil import parser - -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -31,6 +24,7 @@ class ImageMetadataExtractor(AbstractMetadataExtractor): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.image_like_format, recognized_mimes=recognized_mimes.image_like_format) self.keys = { "DateTime": ("date_time", self.__parse_date), @@ -60,6 +54,8 @@ def extract(self, Add the predefined list of metadata for images. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" + import os + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) base_fields = self.base_extractor.extract( file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters @@ -85,6 +81,8 @@ def __parse_int(self, exif: Union[str, bytes]) -> Optional[int]: return None def __parse_date(self, date_str: Union[str, bytes]) -> Optional[int]: + from dateutil import parser + try: date_str = self.__encode_exif(date_str) date = parser.parse(date_str.replace(": ", ":")) @@ -93,6 +91,8 @@ def __parse_date(self, date_str: Union[str, bytes]) -> Optional[int]: return None def __parse_float(self, exif: Union[str, bytes]) -> Optional[float]: + import math + try: exif = self.__encode_exif(exif) result = float(exif) @@ -101,6 +101,9 @@ def __parse_float(self, exif: Union[str, bytes]) -> Optional[float]: return None def _get_exif(self, path: str) -> dict: + from PIL import ExifTags, Image + import piexif + try: image = Image.open(path) exif_dict = piexif.load(image.info["exif"]).get("Exif", {}) if "exif" in image.info else {} @@ -109,6 +112,6 @@ def _get_exif(self, path: str) -> dict: encoded_dict = {k: v for k, v in encoded_dict.items() if k is not None if v is not None} image.close() return encoded_dict - except Exception as e: # noqa + except Exception as e: self.logger.debug(e) return {"broken_image": True} diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index f52608c1..cd1eec39 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -1,5 +1,3 @@ -import os -import pickle from typing import Optional from dedoc.common.exceptions.bad_file_error import BadFileFormatError @@ -40,6 +38,9 @@ def extract(self, Add the predefined list of metadata for the .note.pickle documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" + import os + import pickle + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) try: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index d3217a7b..f8b8e65a 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -1,13 +1,7 @@ -import os from typing import Optional -from PyPDF2 import PdfFileReader -from PyPDF2.utils import PdfReadError - -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor -from dedoc.utils.utils import convert_datetime class PdfMetadataExtractor(AbstractMetadataExtractor): @@ -27,6 +21,7 @@ class PdfMetadataExtractor(AbstractMetadataExtractor): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) self.base_extractor = BaseMetadataExtractor(config=config) self.keys = { @@ -52,6 +47,8 @@ def extract(self, Add the predefined list of metadata for the pdf documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + import os + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) base_fields = self.base_extractor.extract( file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters @@ -61,6 +58,9 @@ def extract(self, return result def _get_pdf_info(self, path: str) -> dict: + from PyPDF2 import PdfFileReader + from PyPDF2.utils import PdfReadError + try: with open(path, "rb") as file: document = PdfFileReader(file) @@ -76,6 +76,8 @@ def _get_pdf_info(self, path: str) -> dict: return {"broken_pdf": True} def __prettify_metadata(self, document_info: dict) -> dict: + from dedoc.utils.utils import convert_datetime + result = {} for key, value in document_info.items(): if isinstance(value, str) and len(value) > 0: @@ -84,7 +86,7 @@ def __prettify_metadata(self, document_info: dict) -> dict: elif key in self.keys_date: try: date = convert_datetime(value) - except: # noqa + except Exception: date = None if date is not None: result[self.keys_date[key]] = date diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index f76ea6a3..165cd538 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -1,4 +1,3 @@ -import os.path from typing import List, Optional from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor @@ -37,6 +36,8 @@ def extract(self, :param mime: MIME type of file :return: dict with metadata information about the document """ + import os.path + for extractor in self.extractors: if extractor.can_extract( file_path=file_path, diff --git a/dedoc/readers/__init__.py 
b/dedoc/readers/__init__.py index 2d96fdae..357ab534 100644 --- a/dedoc/readers/__init__.py +++ b/dedoc/readers/__init__.py @@ -19,5 +19,5 @@ from .txt_reader.raw_text_reader import RawTextReader __all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader', - 'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader', - 'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', 'PdfAutoReader'] + 'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader', 'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', + 'PdfAutoReader'] diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index 1598c403..af9554ec 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -1,20 +1,9 @@ -import os -import tarfile -import uuid -import zipfile -import zlib from typing import IO, Iterator, List, Optional -import py7zlib -import rarfile - from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments -from dedoc.utils.utils import get_file_mime_type, save_data_to_unique_file class ArchiveReader(BaseReader): @@ -23,6 +12,7 @@ class ArchiveReader(BaseReader): Documents with the following extensions can be parsed: .zip, .tar, .tar.gz, .rar, .7z. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.archive_like_format, recognized_mimes=recognized_mimes.archive_like_format) def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -30,6 +20,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The method return empty content of archive, all content will be placed inside attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments + parameters = {} if parameters is None else parameters with_attachments = get_param_with_attachments(parameters) @@ -42,6 +34,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=[], tables=[], attachments=attachments) def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool) -> List[AttachedFile]: + import rarfile + import tarfile + import zipfile + from dedoc.utils.utils import get_file_mime_type + mime = get_file_mime_type(path) if zipfile.is_zipfile(path) and mime == "application/zip": return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis)) @@ -55,6 +52,9 @@ def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool raise BadFileFormatError(f"bad archive {path}") def __read_zip_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import zipfile + import zlib + try: with zipfile.ZipFile(path, "r") as arch_file: names = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -66,6 +66,8 @@ def __read_zip_archive(self, path: str, tmp_dir: str, need_content_analysis: boo raise BadFileFormatError(f"Can't read file {path} ({e})") def __read_tar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import tarfile + with tarfile.open(path, "r") as arch_file: names = [member.name for member in arch_file.getmembers() if member.isfile()] for name in names: @@ -74,6 +76,8 @@ def __read_tar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo file.close() def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import rarfile + with rarfile.RarFile(path, "r") as arch_file: names = [item.filename for item in arch_file.infolist() if item.compress_size > 0] for name in names: @@ -81,6 +85,8 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis) def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import py7zlib + with open(path, "rb") as content: arch_file = py7zlib.Archive7z(content) names = arch_file.getnames() @@ -89,6 +95,10 @@ def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis) def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile: + import os + import uuid + from dedoc.utils.utils import save_data_to_unique_file + file_name = os.path.basename(file_name) binary_data = file.read() if isinstance(binary_data, str): diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py index c85a72d1..e1065fb8 100644 --- a/dedoc/readers/article_reader/article_reader.py +++ b/dedoc/readers/article_reader/article_reader.py @@ -1,23 +1,14 @@ -import math -import os -import time -import uuid from typing import Dict, List, Optional, Tuple -import cv2 -import numpy as np -import requests -from bs4 import BeautifulSoup, Tag -from pdf2image import convert_from_path +from bs4 import 
Tag +from numpy import ndarray -from dedoc.data_structures import Annotation, AttachAnnotation, AttachedFile, CellWithMeta, HierarchyLevel, LineMetadata, Table, TableAnnotation, TableMetadata -from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation +from dedoc.data_structures.annotation import Annotation +from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_document_type, get_param_need_content_analysis, get_param_with_attachments class ArticleReader(BaseReader): @@ -26,7 +17,11 @@ class ArticleReader(BaseReader): """ def __init__(self, config: Optional[dict] = None) -> None: + import os + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) + grobid_url = os.environ.get("GROBID_URL", "") if grobid_url: self.grobid_url = grobid_url @@ -47,6 +42,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + import requests + from bs4 import BeautifulSoup + with open(file_path, "rb") as file: files = {"input": file} try: @@ -87,6 +85,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + from dedoc.utils.parameter_utils import get_param_document_type + if get_param_document_type(parameters) != "article": return False @@ -97,6 +97,9 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, return super().can_read(file_path=file_path, mime=mime, extension=extension) def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None: + import time + import requests + if self.grobid_is_alive: return @@ -127,6 +130,9 @@ def __get_tag_by_hierarchy_path(self, source: Tag, hierarchy_path: List[str]) -> def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, paragraph_type: Optional[str] = None, annotations: Optional[List[Annotation]] = None, other_fields: Optional[Dict] = None) -> LineWithMeta: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + # TODO check on improve if other_fields is None: other_fields = {} @@ -214,6 +220,10 @@ def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]: return lines def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict, attachment2uid: dict) -> LineWithMeta: + from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation + from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + text = "" start = 0 annotations = [] @@ -258,6 +268,8 @@ def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict, attachment2uid return lines def __parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict, attachment2uid: dict) -> List[LineWithMeta]: + from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth + lines = [] number = section_tag.head.get("n") if section_tag.head else "" number = number + " " if number else "" @@ -299,6 +311,9 @@ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]: """ + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table_metadata import TableMetadata + tables = [] table2uid = {} @@ -349,6 +364,11 @@ def __parse_images(self, soup: Tag, file_path: str, parameters: Optional[dict]) Documentation: https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/ """ + import os + import uuid + import cv2 + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments + if not get_param_with_attachments(parameters): return [], {} @@ -379,12 +399,16 @@ def __parse_images(self, soup: Tag, file_path: str, parameters: Optional[dict]) return attachments, attachment2uid - def __get_image(self, figure_tag: Tag, file_path: str, page_sizes: List[Tuple[float, float]]) -> Optional[np.ndarray]: + def __get_image(self, figure_tag: Tag, file_path: str, page_sizes: List[Tuple[float, float]]) -> Optional[ndarray]: """ Crop the PDF page according to the figure's coordinates. Figure can consist of multiple sub-figures: we crop the union of all sub-figures. 
Example of the figure's coordinates: coords="3,151.56,211.52,312.23,7.89;3,136.68,115.84,338.92,75.24" """ + import math + import numpy as np + from pdf2image import convert_from_path + if figure_tag.graphic is None: return None diff --git a/dedoc/readers/base_reader.py b/dedoc/readers/base_reader.py index d4adff80..b351ae43 100644 --- a/dedoc/readers/base_reader.py +++ b/dedoc/readers/base_reader.py @@ -1,9 +1,7 @@ -import logging from abc import ABC, abstractmethod from typing import Optional, Set from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.utils.utils import get_mime_extension class BaseReader(ABC): @@ -22,6 +20,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param recognized_mimes: set of supported MIME types of files """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions @@ -39,6 +39,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, :return: True if this reader can handle the file, False otherwise """ + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py index 73c2d1d2..4048bd7b 100644 --- a/dedoc/readers/csv_reader/csv_reader.py +++ b/dedoc/readers/csv_reader/csv_reader.py @@ -1,15 +1,7 @@ from typing import List, Optional, Tuple -import pandas as pd - -from dedoc.data_structures import LineMetadata, LineWithMeta -from dedoc.data_structures.cell_with_meta import CellWithMeta -from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_encoding class CSVReader(BaseReader): @@ -18,6 +10,7 @@ class CSVReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.csv_like_format, recognized_mimes=recognized_mimes.csv_like_format) self.default_separator = "," @@ -27,19 +20,26 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The lines and attachments remain empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + import pandas as pd + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.data_structures.line_with_meta import LineWithMeta + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table import Table + from dedoc.data_structures.table_metadata import TableMetadata + parameters = {} if parameters is None else parameters delimiter = parameters.get("delimiter") if delimiter is None: delimiter = "\t" if file_path.endswith(".tsv") else self.default_separator encoding, encoding_warning = self.__get_encoding(file_path, parameters) - df = pd.read_csv(file_path, sep=delimiter, header=None, encoding=encoding) + df = pd.read_csv(file_path, sep=delimiter, header=None, encoding=encoding, dtype="string", keep_default_na=False) table_metadata = TableMetadata(page_id=0) cells_with_meta = [] line_id = 0 for ind in df.index: row_lines = [] for cell in df.loc[ind]: - row_lines.append(CellWithMeta(lines=[LineWithMeta(line=str(cell), metadata=LineMetadata(page_id=0, line_id=line_id))])) + row_lines.append(CellWithMeta(lines=[LineWithMeta(line=cell, metadata=LineMetadata(page_id=0, line_id=line_id))])) line_id += 1 cells_with_meta.append(row_lines) @@ -49,6 +49,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=[], tables=tables, attachments=[], warnings=warnings) def __get_encoding(self, path: str, parameters: dict) -> Tuple[str, List[str]]: + from dedoc.utils.utils import get_encoding + if parameters.get("encoding"): return parameters["encoding"], [] else: diff --git a/dedoc/readers/docx_reader/data_structures/base_props.py b/dedoc/readers/docx_reader/data_structures/base_props.py index c439c3d0..f6154d1c 100644 --- a/dedoc/readers/docx_reader/data_structures/base_props.py +++ b/dedoc/readers/docx_reader/data_structures/base_props.py @@ -3,7 +3,7 @@ class BaseProperties: - def __init__(self, properties: Optional["BaseProperties"] = None) -> None: # noqa + def __init__(self, properties: Optional["BaseProperties"] = None) -> None: """ Contains style properties for paragraphs and runs. 
:param properties: Paragraph or Run for copying its properties diff --git a/dedoc/readers/docx_reader/data_structures/table.py b/dedoc/readers/docx_reader/data_structures/table.py index b86855cb..1f11fdac 100644 --- a/dedoc/readers/docx_reader/data_structures/table.py +++ b/dedoc/readers/docx_reader/data_structures/table.py @@ -3,8 +3,8 @@ from bs4 import Tag -from dedoc.data_structures import LineWithMeta from dedoc.data_structures.cell_with_meta import CellWithMeta +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index 76f6c8e5..fc136a14 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -1,13 +1,8 @@ from typing import List, Optional -from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor -from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument -from dedoc.utils.parameter_utils import get_param_with_attachments class DocxReader(BaseReader): @@ -17,6 +12,9 @@ class DocxReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) self.attachment_extractor = DocxAttachmentsExtractor(config=self.config) @@ -26,6 +24,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument + from dedoc.utils.parameter_utils import get_param_with_attachments + with_attachments = get_param_with_attachments(parameters) attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] @@ -34,6 +35,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[]) def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + for i, line in enumerate(lines[1:]): if lines[i].metadata.tag_hierarchy_level != line.metadata.tag_hierarchy_level \ or lines[i].metadata.tag_hierarchy_level.line_type != HierarchyLevel.unknown \ diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 97c8d259..8d3ec876 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -1,24 +1,10 @@ -import email -import json -import mimetypes -import os -import re -import uuid -from email.header import decode_header from email.message import Message -from tempfile import NamedTemporaryFile from typing import List, Optional -from dedoc.data_structures.attached_file import AttachedFile -from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis -from dedoc.utils.utils import get_mime_extension, get_unique_name, save_data_to_unique_file class EmailReader(BaseReader): @@ -27,6 +13,8 @@ class EmailReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.html_reader.html_reader import HtmlReader super().__init__(config=config, recognized_extensions=recognized_extensions.eml_like_format, recognized_mimes=recognized_mimes.eml_like_format) self.html_reader = HtmlReader(config=self.config) @@ -35,6 +23,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Check if the document extension or mime is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.utils import get_mime_extension mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) # this code differs from BaseReader because .eml and .mhtml files have the same mime type if extension: @@ -50,6 +39,14 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + import email + import json + import os + import uuid + from dedoc.data_structures.attached_file import AttachedFile + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + from dedoc.utils.utils import get_unique_name + parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) @@ -108,6 +105,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments) def __add_attachment(self, message: Message, attachments_dir: str, attachments: list, need_content_analysis: bool) -> None: + import mimetypes + import os + import uuid + from dedoc.data_structures.attached_file import AttachedFile + from dedoc.utils.utils import save_data_to_unique_file + content_type = message.get_content_type() payload = message.get_payload(decode=True) @@ -133,6 +136,8 @@ def __add_attachment(self, message: Message, attachments_dir: str, attachments: need_content_analysis=need_content_analysis)) def __add_content_from_html(self, message: Message, lines: list, tables: list, parameters: dict) -> None: + from tempfile import NamedTemporaryFile + payload = message.get_payload(decode=True) if payload is None: return @@ -153,6 +158,8 @@ def __add_content_from_html(self, message: Message, lines: list, tables: list, p file.close() def __add_text_content(self, message: Message, lines: list) -> None: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + payload = message.get_payload(decode=True) if payload is None: return @@ -168,11 +175,15 @@ def __add_text_content(self, message: Message, lines: list) -> None: annotations=[])) def __fix_filename(self, filename: str) -> str: + import re + filename = re.sub(r"[<>:\"/\\|?*]", "_", filename) filename = re.sub(r"\s+", " ", filename) return filename def __get_decoded(self, text: str) -> str: + from email.header import decode_header + part = [] for letter, encode in decode_header(text): if isinstance(letter, bytes): @@ -188,6 +199,8 @@ def __get_field(self, message: Message, key: str, line_metadata: LineMetadata) - return LineWithMeta(line=text, metadata=line_metadata) def __get_main_fields(self, message: Message) -> List[LineWithMeta]: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + lines = list() line_metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel(0, 0, False, "root"), page_id=0, line_id=0) lines.append(self.__get_field(message, "subject", line_metadata)) diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py index db3a104e..a68a92e5 100644 --- a/dedoc/readers/excel_reader/excel_reader.py +++ b/dedoc/readers/excel_reader/excel_reader.py @@ -1,20 +1,10 @@ from typing import Optional -import xlrd from xlrd.sheet import Sheet -from dedoc.attachments_extractors.concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor -from dedoc.data_structures import LineMetadata, LineWithMeta -from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.parameter_utils import get_param_with_attachments - 
-xlrd.xlsx.ensure_elementtree_imported(False, None) -xlrd.xlsx.Element_has_iter = True class ExcelReader(BaseReader): @@ -22,8 +12,13 @@ class ExcelReader(BaseReader): This class is used for parsing documents with .xlsx extension. Please use :class:`~dedoc.converters.ExcelConverter` for getting xlsx file from similar formats. """ + import xlrd + xlrd.xlsx.ensure_elementtree_imported(False, None) + xlrd.xlsx.Element_has_iter = True def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.excel_like_format, recognized_mimes=recognized_mimes.excel_like_format) self.attachment_extractor = ExcelAttachmentsExtractor(config=self.config) @@ -32,6 +27,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This method extracts tables and attachments from the document, `lines` attribute remains empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + import xlrd + from dedoc.utils.parameter_utils import get_param_with_attachments + with xlrd.open_workbook(file_path) as book: sheets_num = book.nsheets tables = [] @@ -45,6 +43,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=[], tables=tables, attachments=attachments, warnings=[]) def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table: + from dedoc.data_structures.line_with_meta import LineWithMeta + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table_metadata import TableMetadata + n_rows = sheet.nrows n_cols = sheet.ncols res = [] diff --git a/dedoc/readers/html2pdf_reader/html2pdf_reader.py b/dedoc/readers/html2pdf_reader/html2pdf_reader.py index f18cbf16..b8f83ed1 100644 --- a/dedoc/readers/html2pdf_reader/html2pdf_reader.py +++ b/dedoc/readers/html2pdf_reader/html2pdf_reader.py @@ -1,28 +1,25 @@ -import os -import re -from copy import deepcopy -from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple -from uuid import uuid1 from bs4 import BeautifulSoup -from weasyprint import HTML -from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from dedoc.utils.utils import calculate_file_hash class Html2PdfReader(HtmlReader): def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader self.pdf_reader = PdfTxtlayerReader(config=self.config) def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + import os + from copy import deepcopy + from tempfile import TemporaryDirectory + from weasyprint import HTML + parameters = {} if parameters is None else parameters with TemporaryDirectory() as tmp_dir: modified_path, tables = self._modify_html(file_path, tmp_dir) @@ -36,6 +33,8 @@ def read(self, 
file_path: str, parameters: Optional[dict] = None) -> Unstructure return self._add_tables(document=unstructured_document, tables=tables) def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table]) -> UnstructuredDocument: + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + lines = [] tables_result = [] previous_line = None @@ -54,6 +53,8 @@ def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table]) return UnstructuredDocument(lines=lines, tables=tables_result, attachments=document.attachments) def _handle_tables(self, soup: BeautifulSoup, path_hash: str) -> dict: + from uuid import uuid1 + tables = {} for table_tag in soup.find_all("table"): table_uid = f"table_{uuid1()}" @@ -75,6 +76,8 @@ def _handle_super_elements(self, soup: BeautifulSoup) -> None: html-code: 1.1) lalala view: "1.1) lalala" """ + import re + supers = soup.find_all(["span", "p"], {"style": re.compile("vertical-align:super")}) for super_element in supers: @@ -86,6 +89,9 @@ def _handle_super_elements(self, soup: BeautifulSoup) -> None: super_element.decompose() def _modify_html(self, path: str, tmp_dir: str) -> Tuple[str, dict]: + import os + from dedoc.utils.utils import calculate_file_hash + with open(path, encoding="utf-8") as f: soup = BeautifulSoup(f.read(), "html.parser") diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index 5cd8b76a..11d815ba 100644 --- a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -1,22 +1,11 @@ -import hashlib -import string from typing import List, Optional, Tuple, Union from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag -from dedoc.data_structures.cell_with_meta import CellWithMeta -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing -from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser -from dedoc.readers.html_reader.html_tags import HtmlTags -from dedoc.utils.utils import calculate_file_hash class HtmlReader(BaseReader): @@ -25,6 +14,10 @@ class HtmlReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing + from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser + super().__init__(config=config, recognized_extensions=recognized_extensions.html_like_format, recognized_mimes=recognized_mimes.html_like_format) self.postprocessor = HtmlLinePostprocessing() self.tag_annotation_parser = HtmlTagAnnotationParser() @@ -35,6 +28,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. 
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from dedoc.utils.utils import calculate_file_hash + parameters = {} if parameters is None else parameters with open(file_path, "rb") as f: soup = BeautifulSoup(f.read(), "html.parser") @@ -52,6 +47,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure def __handle_block(self, tag: Union[Tag], filepath_hash: str, handle_invisible_table: bool, table: Optional[bool] = False, uid: Optional[str] = "") -> List[LineWithMeta]: + import hashlib + from dedoc.readers.html_reader.html_tags import HtmlTags + tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest() assert isinstance(tag, (Tag, str)) if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table): @@ -80,6 +78,10 @@ def __handle_block(self, tag: Union[Tag], filepath_hash: str, handle_invisible_t return block_lines def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]: + import hashlib + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.readers.html_reader.html_tags import HtmlTags + text = self.__get_text(tag, table) if not text or text.isspace(): @@ -95,6 +97,8 @@ def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Opt def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False, uid: Optional[str] = "") -> List[LineWithMeta]: + import hashlib + tag_uid = hashlib.md5((filepath_hash + uid + str(block.name)).encode()).hexdigest() if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table): return [] @@ -108,6 +112,9 @@ def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_ta return lines def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_space: bool = True) -> List[LineWithMeta]: + import hashlib + from dedoc.data_structures.hierarchy_level import HierarchyLevel + if not block.strip() and ignore_space: return [] tag_uid = hashlib.md5((uid + block).encode()).hexdigest() @@ -116,6 +123,9 @@ def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_spa def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, filepath_hash: str = None, annotations: List = None) -> LineWithMeta: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + if annotations is None: annotations = [] @@ -126,6 +136,10 @@ def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str return LineWithMeta(line=line, metadata=metadata, annotations=annotations, uid=uid) def __get_li_header(self, list_type: str, index: int) -> LineWithMeta: + import string + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + end = ") " if list_type in ["a", "A"] else ". 
" if list_type == "": header = "" @@ -146,6 +160,9 @@ def __get_li_header(self, list_type: str, index: int) -> LineWithMeta: return header_line def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]: + import hashlib + from dedoc.readers.html_reader.html_tags import HtmlTags + tag_uid = hashlib.md5((uid + str(lst.name)).encode()).hexdigest() lines = [] list_type = lst.get("type", "1" if lst.name in HtmlTags.ordered_list else "") @@ -164,6 +181,8 @@ def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_t return lines def __handle_list_item(self, item: Tag, item_index: int, list_type: str, filepath_hash: str, uid: str, handle_invisible_table: bool) -> List[LineWithMeta]: + import hashlib + tag_uid = hashlib.md5((uid + str(item.name)).encode()).hexdigest() lines = [] header_line = self.__get_li_header(list_type=list_type, index=item_index) @@ -195,6 +214,8 @@ def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bo @param handle_invisible_table: is invisibly table should be handled as table @return: True if tag is a content tag False otherwise. """ + from dedoc.readers.html_reader.html_tags import HtmlTags + if tag.name in HtmlTags.service_tags: return False if tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table): @@ -202,6 +223,9 @@ def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bo return not isinstance(tag, Doctype) and not isinstance(tag, Comment) def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> List[LineWithMeta]: + import hashlib + from dedoc.data_structures.hierarchy_level import HierarchyLevel + result = [] rows = self._read_table(block, filepath_hash).cells for row in rows: @@ -213,6 +237,8 @@ def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> return result def __clone_cell(self, el: Tuple[Tag, NavigableString]) -> Tuple[Tag, NavigableString]: + from dedoc.readers.html_reader.html_tags import HtmlTags + if isinstance(el, NavigableString): return type(el)(el) @@ -228,6 +254,8 @@ def __clone_cell(self, el: Tuple[Tag, NavigableString]) -> Tuple[Tag, NavigableS return copy def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None: + from dedoc.readers.html_reader.html_tags import HtmlTags + for row_index, row in enumerate(table.find_all(HtmlTags.table_rows)): for cell_index, cell in enumerate(row.find_all(HtmlTags.table_cells)): cell_rowspan = int(cell.attrs.get("rowspan", 1)) @@ -239,6 +267,8 @@ def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None: table_list[index][cell_index:cell_index] = [cell_copy] * cell_colspan def __fix_table(self, table: Tag) -> List[List[Tag]]: + from dedoc.readers.html_reader.html_tags import HtmlTags + table_list = [] # create table list @@ -253,6 +283,9 @@ def __fix_table(self, table: Tag) -> List[List[Tag]]: return table_list def _read_table(self, table: Tag, filepath_hash: str) -> Table: + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table_metadata import TableMetadata + cells_with_meta = [] fixed_table = self.__fix_table(table) diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py index b83bb7cb..4afb768d 100644 --- a/dedoc/readers/json_reader/json_reader.py +++ b/dedoc/readers/json_reader/json_reader.py @@ -1,16 +1,9 @@ -from json import JSONDecodeError from typing import Any, 
List, Optional -import ujson as json - -from dedoc.attachments_extractors.concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.common.exceptions.bad_parameters_error import BadParametersError -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader @@ -20,6 +13,9 @@ class JsonReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format) self.attachment_extractor = JsonAttachmentsExtractor(config=self.config) @@ -31,6 +27,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The dictionaries are processed by creating key line with type `key` and value line as a child. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from json import JSONDecodeError + import ujson as json + from dedoc.data_structures.hierarchy_level import HierarchyLevel + parameters = {} if parameters is None else parameters with open(file_path) as file: try: @@ -84,6 +84,8 @@ def __exclude_key(self, json_data: dict, keys: List[str]) -> None: del data[key] def __handle_list(self, depth: int, element: list, result: list, stack: list) -> None: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + for _ in range(len(element)): sub_element = element.pop(0) line = self.__handle_one_element(depth=depth, value=sub_element, line_type=HierarchyLevel.list_item, line_type_meta=HierarchyLevel.list_item) @@ -106,6 +108,9 @@ def __handle_dict(self, depth: int, element: dict, result: list, stack: list) -> break def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type_meta: str) -> LineWithMeta: # noqa + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + if depth == 1 and line_type == "title": level1, level2 = 0, 0 else: diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index d96ed0ec..7073ee54 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -1,20 +1,8 @@ -import email -import gzip -import os -import uuid from typing import List, Optional, Tuple -from urllib.parse import urlparse - -from bs4 import BeautifulSoup from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.utils import supported_image_types -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis -from dedoc.utils.utils import 
check_filename_length, get_encoding, get_mime_extension, save_data_to_unique_file class MhtmlReader(BaseReader): @@ -23,6 +11,9 @@ class MhtmlReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.html_reader.html_reader import HtmlReader + super().__init__(config=config, recognized_extensions=recognized_extensions.mhtml_like_format, recognized_mimes=recognized_mimes.mhtml_like_format) self.html_reader = HtmlReader(config=self.config) @@ -31,6 +22,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) # this code differs from BaseReader because .eml and .mhtml files have the same mime type if extension: @@ -43,6 +36,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) @@ -70,6 +65,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments) def __extract_files(self, path: str, save_dir: str) -> Tuple[List[str], List[str]]: + import email + import gzip + import os + from urllib.parse import urlparse + from dedoc.utils.utils import check_filename_length, save_data_to_unique_file + names_list = [] original_names_list = [] if path.endswith(".gz"): @@ -97,6 +98,10 @@ def __extract_files(self, path: str, save_dir: str) -> Tuple[List[str], List[str return names_list, original_names_list def __find_html(self, names_list: List[str]) -> List[str]: + from bs4 import BeautifulSoup + from dedoc.utils import supported_image_types + from dedoc.utils.utils import get_encoding + html_list = [] for file_name in names_list: extension = file_name.split(".")[-1] @@ -114,6 +119,10 @@ def __find_html(self, names_list: List[str]) -> List[str]: return html_list def __get_attachments(self, save_dir: str, tmp_names_list: List[str], original_names_list: List[str], need_content_analysis: bool) -> List[AttachedFile]: + import os + import uuid + from dedoc.utils import supported_image_types + attachments = [] for tmp_file_name, original_file_name in zip(tmp_names_list, original_names_list): *_, extension = tmp_file_name.rsplit(".", maxsplit=1) diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py index 902386d1..2f6f4617 100644 --- a/dedoc/readers/note_reader/note_reader.py +++ b/dedoc/readers/note_reader/note_reader.py @@ -1,9 +1,5 @@ -import os -import pickle from typing import Optional -from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument 
from dedoc.readers.base_reader import BaseReader
@@ -21,6 +17,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
        The method returns the document content with all the document's lines.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
        """
+        import os
+        import pickle
+        from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+        from dedoc.data_structures.line_with_meta import LineWithMeta

        try:
            with open(file_path, "rb") as infile:
diff --git a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py
index fd6c178b..1880701d 100644
--- a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py
+++ b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py
@@ -1,6 +1,6 @@
from typing import List, Optional

-import numpy as np
+from numpy import ndarray

from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
@@ -8,7 +8,7 @@ class PageWithBBox:
-    def __init__(self, image: np.ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
+    def __init__(self, image: ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
                 pdf_page_width: Optional[int] = None, pdf_page_height: Optional[int] = None) -> None:
        self.image = image
        self.bboxes = bboxes
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
index 5df58258..9308fded 100644
--- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py
+++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -1,9 +1,7 @@
-import uuid
from typing import List, Optional

from dedocutils.data_structures import BBox

-from dedoc.data_structures import BBoxAnnotation
from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -44,6 +42,7 @@ def __init__(self,
                 rotated_angle: int = 0,
                 uid: str = None,
                 contour_coord: Optional[BBox] = None) -> None:
+        import uuid

        assert x_top_left <= x_bottom_right
        assert y_top_left <= y_bottom_right
@@ -72,6 +71,8 @@ def get_annotations(self) -> List[Annotation]:
        return LineWithMeta.join(self.lines, delimiter="\n").annotations

    def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
+        from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
+
        for i_line, _ in enumerate(self.lines):
            for i_ann, annotation in enumerate(self.lines[i_line].annotations):
                if annotation.name != "bounding box":
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
index bc6492d4..27d053a5 100644
--- a/dedoc/readers/pdf_reader/data_classes/tables/location.py
+++ b/dedoc/readers/pdf_reader/data_classes/tables/location.py
@@ -1,4 +1,3 @@
-from collections import OrderedDict
from functools import total_ordering
from typing import Any, Dict
@@ -14,6 +13,8 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
        self.rotated_angle = rotated_angle

    def to_dict(self) -> Dict[str, Any]:
+        from collections import OrderedDict
+
        res = OrderedDict()
        res["page_number"] = self.page_number
        res["bbox"] = self.bbox.to_dict()  # [x_begin, y_begin, width, height]
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index 4bc057df..ee8316d0 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -1,11 +1,10 @@ -import copy -from collections import OrderedDict from typing import Any, List -import numpy as np from dedocutils.data_structures import BBox -from dedoc.data_structures import CellWithMeta, Table, TableMetadata +from dedoc.data_structures.cell_with_meta import CellWithMeta +from dedoc.data_structures.table import Table +from dedoc.data_structures.table_metadata import TableMetadata from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -63,6 +62,9 @@ def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int: @staticmethod def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int): + import copy + import numpy as np + required_columns = [] for j in range(0, len(matrix_cells[0])): if matrix_cells[0][j].is_attribute_required: @@ -94,6 +96,8 @@ def uid(self) -> str: return self.name def to_dict(self) -> dict: + from collections import OrderedDict + data_text = ScanTable.get_cells_text(self.matrix_cells) res = OrderedDict() diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py index a212bc80..9cf865ce 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py @@ -1,14 +1,10 @@ -import logging from collections import namedtuple from typing import List, Optional -import cv2 -import numpy as np from dedocutils.data_structures import BBox +from numpy import ndarray -from dedoc.data_structures import LineWithMeta -from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor -from dedoc.utils.image_utils import crop_image_text +from dedoc.data_structures.line_with_meta import LineWithMeta """-------------------------------Таблица в виде дерева, полученная от OpenCV----------------------------------------""" ContourCell = namedtuple("ContourCell", ["id_con", "image"]) @@ -25,6 +21,8 @@ class TableTree(object): minimal_cell_avg_length_line = 10 def __init__(self, *, config: dict) -> None: + import logging + self.left = None self.right = None self.cell_box: Optional[BBox] = None # [x_begin, y_begin, width, height] @@ -36,7 +34,10 @@ def __init__(self, *, config: dict) -> None: self.config = config self.logger = config.get("logger", logging.getLogger()) - def set_text_into_tree(self, tree: "TableTree", src_image: np.ndarray, language: str = "rus", *, config: dict) -> None: + def set_text_into_tree(self, tree: "TableTree", src_image: ndarray, language: str = "rus", *, config: dict) -> None: + import logging + from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor + # get List of TableTree cur_depth = 0 begin_depth = 2 @@ -60,7 +61,9 @@ def set_text_into_tree(self, tree: "TableTree", src_image: np.ndarray, language: for lines, tree in zip(lines_with_meta, trees): tree.lines = lines - def set_crop_text_box(self, page_image: np.ndarray) -> None: + def set_crop_text_box(self, page_image: ndarray) -> None: + from dedoc.utils.image_utils import crop_image_text + cell_image = BBox.crop_image_by_box(page_image, self.cell_box) self.crop_text_box = crop_image_text(cell_image) # make crop_text_box'coordinates relative 
page_image @@ -69,6 +72,8 @@ def set_crop_text_box(self, page_image: np.ndarray) -> None: @staticmethod def parse_contours_to_tree(contours: List, hierarchy: List, *, config: dict) -> "TableTree": + import cv2 + table_tree = TableTree(config=config) table_tree.id_contours = 0 if len(contours) == 0: @@ -91,6 +96,8 @@ def print_tree(self, depth: int) -> None: ch.print_tree(depth + 1) def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "TableTree": + import cv2 + list_childs = [] for i, h in enumerate(hierarchy[0]): if h[3] == cur.id_contours: diff --git a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py index 8ac0d201..e7537a07 100644 --- a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py @@ -1,6 +1,4 @@ -from collections import OrderedDict from typing import List, Optional -from uuid import uuid1 from dedocutils.data_structures import BBox @@ -18,6 +16,8 @@ def __init__(self, uid: Optional[str] = None, label: Optional[str] = None, annotations: List[Annotation] = None) -> None: + from uuid import uuid1 + self.bbox = bbox self.page_num = page_num self.line_num = line_num @@ -37,6 +37,8 @@ def __repr__(self) -> str: return self.__str__() def to_dict(self) -> dict: + from collections import OrderedDict + res = OrderedDict() res["uid"] = self.uid res["_uid"] = self.uid diff --git a/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py index cb9dc6c7..88bb6460 100644 --- a/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - from dedocutils.data_structures import BBox @@ -16,6 +14,8 @@ def __repr__(self) -> str: return self.__str__() def to_dict(self) -> dict: + from collections import OrderedDict + res = OrderedDict() res["bbox"] = self.bbox.to_dict() res["text"] = self.text diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index 9cfe9dd8..523c96a1 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -1,18 +1,7 @@ -import copy -import os -from itertools import chain from typing import Optional -from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_detector import TxtLayerDetector -from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer class PdfAutoReader(BaseReader): @@ -31,7 +20,14 @@ class PdfAutoReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_detector import TxtLayerDetector + 
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) + self.pdf_txtlayer_reader = PdfTxtlayerReader(config=self.config) self.pdf_tabby_reader = PdfTabbyReader(config=self.config) self.pdf_image_reader = PdfImageReader(config=self.config) @@ -46,6 +42,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, It is recommended to use `pdf_with_text_layer=auto_tabby` because it's faster and allows to get better results. You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. """ + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) in ("auto", "auto_tabby") def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -70,12 +67,17 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return result def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument: + import os + self.logger.info(f"Assume document {os.path.basename(path)} has incorrect textual layer") warnings.append("Assume document has incorrect textual layer") result = self.pdf_image_reader.read(file_path=path, parameters=parameters_copy) return result def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: dict, path: str, warnings: list) -> UnstructuredDocument: + import os + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer + self.logger.info(f"Assume document {os.path.basename(path)} has a correct textual layer") warnings.append("Assume document has a correct textual layer") recognized_first_page = None @@ -99,6 +101,9 @@ def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: d return result def __preparing_first_page_parameters(self, parameters: dict) -> dict: + import copy + from dedoc.utils.parameter_utils import get_param_page_slice + first_page, last_page = get_param_page_slice(parameters) # calculate indexes for the first page parsing first_page_index = 0 if first_page is None else first_page @@ -111,6 +116,8 @@ def __preparing_first_page_parameters(self, parameters: dict) -> dict: return scan_parameters def __preparing_other_pages_parameters(self, parameters: dict) -> dict: + from dedoc.utils.parameter_utils import get_param_page_slice + first_page, last_page = get_param_page_slice(parameters) # parameters for reading pages from the second page first_page_index = 1 if first_page is None else first_page @@ -120,6 +127,10 @@ def __preparing_other_pages_parameters(self, parameters: dict) -> dict: return parameters def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocument) -> UnstructuredDocument: + from itertools import chain + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + from dedoc.data_structures.line_with_meta import LineWithMeta + tables = first.tables dropped_tables = set() for table in second.tables: diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py 
b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py index 766ac250..21716598 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py @@ -7,7 +7,7 @@ from xgboost import XGBClassifier from dedoc.config import get_config -from dedoc.data_structures import LineWithMeta +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.download_models import download_from_hub from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_feature_extractor import TxtlayerFeatureExtractor from dedoc.utils.parameter_utils import get_param_gpu_available diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index cfff918d..0500698f 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -3,7 +3,7 @@ from copy import deepcopy from typing import List -from dedoc.data_structures import LineWithMeta +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 9d46175a..839b5006 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -1,34 +1,16 @@ -import math -import os from abc import abstractmethod from collections import namedtuple from typing import Iterator, List, Optional, Set, Tuple -import cv2 -import numpy as np -from joblib import Parallel, delayed -from pdf2image import convert_from_path -from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError +from numpy import ndarray -import dedoc.utils.parameter_utils as param_utils -from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions as extensions, recognized_mimes as mimes from dedoc.readers.base_reader import BaseReader from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer -from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis -from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import flatten, get_file_mime_by_content -from dedoc.utils.utils import get_file_mime_type, splitext_ 
ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ "orient_analysis_cells", @@ -55,6 +37,13 @@ class PdfBaseReader(BaseReader): def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Optional[Set[str]] = None, recognized_mimes: Optional[Set[str]] = None) -> None: super().__init__(config=config, recognized_extensions=recognized_extensions, recognized_mimes=recognized_mimes) + + from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer + from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker + from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor + self.config["n_jobs"] = self.config.get("n_jobs", 1) self.table_recognizer = TableRecognizer(config=self.config) self.metadata_extractor = LineMetadataExtractor(config=self.config) @@ -70,6 +59,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. """ + import dedoc.utils.parameter_utils as param_utils + parameters = {} if parameters is None else parameters first_page, last_page = param_utils.get_param_page_slice(parameters) @@ -101,6 +92,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]: + import math + from joblib import Parallel, delayed + from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis + from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.utils.pdf_utils import get_pdf_page_count + from dedoc.utils.utils import flatten + first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page last_page = math.inf if parameters.last_page is None else parameters.last_page images = self._get_images(path, first_page, last_page) @@ -142,7 +140,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata @abstractmethod - def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \ + def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \ -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: """ function parses image and returns: @@ -153,7 +151,13 @@ def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc """ pass - def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]: + def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[ndarray]: + import os + import cv2 + from dedoc.extensions import recognized_extensions as extensions, recognized_mimes as mimes + from dedoc.utils.utils import get_file_mime_by_content + from dedoc.utils.utils import get_file_mime_type, splitext_ + mime = 
get_file_mime_type(path) mime = get_file_mime_by_content(path) if mime not in self._recognized_mimes else mime if mime in mimes.pdf_like_format: @@ -166,10 +170,17 @@ def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.nd else: raise BadFileFormatError(f"Unsupported input format: {splitext_(path)[1]}") - def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]: + def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ndarray]: if page_from >= page_to: return + import math + import os + import numpy as np + from pdf2image import convert_from_path + from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError + from dedoc.utils.pdf_utils import get_pdf_page_count + try: page_count = get_pdf_page_count(path) page_count = math.inf if page_count is None else page_count @@ -179,7 +190,7 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ while (images is None or len(images) > 0) and left <= min(page_to, page_count): right = left + step # for convert_from_path function first_page should start from 1, last_page is included to the result - images = convert_from_path(path, first_page=left, last_page=right) # noqa + images = convert_from_path(path, first_page=left, last_page=right) # in logging we include both ends of the pages interval, numeration starts with 1 self.logger.info(f"Get page from {left} to {min(right, page_count)} of {page_count} file {os.path.basename(path)}") for image in images: @@ -190,24 +201,30 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ except (PDFPageCountError, PDFSyntaxError) as error: raise BadFileFormatError(f"Bad pdf file:\n file_name = {os.path.basename(path)} \n exception = {error.args}") - def _convert_to_gray(self, image: np.ndarray) -> np.ndarray: + def _convert_to_gray(self, image: ndarray) -> ndarray: + import cv2 + import numpy as np + gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY) gray_image = self._binarization(gray_image) return gray_image - def _binarization(self, gray_image: np.ndarray) -> np.ndarray: + def _binarization(self, gray_image: ndarray) -> ndarray: + import numpy as np + if gray_image.mean() < 220: # filter black and white image binary_mask = gray_image >= np.quantile(gray_image, 0.05) gray_image[binary_mask] = 255 return gray_image def eval_tables_by_batch(self, - batch: Iterator[np.ndarray], + batch: Iterator[ndarray], page_number_begin: int, language: str, orient_analysis_cells: bool = False, orient_cell_angle: int = 270, - table_type: str = "") -> Tuple[List[np.ndarray], List[ScanTable]]: + table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]: + from joblib import Parallel, delayed result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)( image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch)) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py index b3b0790b..245234c7 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py @@ -59,5 +59,9 @@ def __init__(self) -> None: def load_dataset(self, csv_path: str, image_path: str, batch_size: int = 
4) -> DataLoader: trainset = DatasetImageOrient(csv_file=csv_path, root_dir=image_path, transform=self.transform) trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2) + self.amount = len(trainset) return trainloader + + def __len__(self) -> int: + return self.amount diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index c9ef35a8..f381fc6b 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -6,7 +6,10 @@ import numpy as np from dedocutils.data_structures import BBox -from dedoc.data_structures import BBoxAnnotation, ConfidenceAnnotation, LineMetadata, LineWithMeta +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation +from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells @@ -62,7 +65,8 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], return self.__create_lines_with_meta(tree_nodes, originalbox_to_fastocrbox, page_image) - def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa + def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") \ + -> Tuple[OcrPage, List[BBox]]: # noqa concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes) if self.config.get("debug_mode", False): debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches") @@ -120,7 +124,8 @@ def __nodes2batch(self, tree_nodes: List["TableTree"]) -> Iterator[List["TableTr if len(batch) > 0: yield batch - def __create_lines_with_meta(self, tree_nodes: List["TableTree"], original_box_to_fast_ocr_box: dict, original_image: np.ndarray) -> List[List[LineWithMeta]]: # noqa + def __create_lines_with_meta(self, tree_nodes: List["TableTree"], original_box_to_fast_ocr_box: dict, original_image: np.ndarray) \ + -> List[List[LineWithMeta]]: # noqa nodes_lines = [] for node in tree_nodes: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index d685b8e2..0a9a54a3 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -1,21 +1,11 @@ -import os -from datetime import datetime from typing import List, Optional, Tuple -import cv2 -import numpy as np -from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector +from numpy import ndarray -from dedoc.config import get_config -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from 
dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier -from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor -from dedoc.utils import supported_image_types -from dedoc.utils.parameter_utils import get_path_param class PdfImageReader(PdfBaseReader): @@ -41,6 +31,13 @@ class PdfImageReader(PdfBaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector + from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier + from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor + from dedoc.config import get_config + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.utils import supported_image_types + supported_image_extensions = {ext for ext in supported_image_types if ext.startswith(".")} super().__init__( config=config, @@ -54,10 +51,15 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.ocr = OCRLineExtractor(config=self.config) def _process_one_page(self, - image: np.ndarray, + image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: + import os + from datetime import datetime + import cv2 + from dedoc.utils.parameter_utils import get_path_param + # --- Step 1: correct orientation and detect column count --- rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters) if self.config.get("debug_mode", False): @@ -89,7 +91,7 @@ def _process_one_page(self, lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page) return lines, tables, page.attachments, [angle] - def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, float]: + def _detect_column_count_and_orientation(self, image: ndarray, parameters: ParametersForParseDoc) -> Tuple[ndarray, bool, float]: """ Function : - detects the number of page columns @@ -97,6 +99,11 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa - rotates the page on detected angle Return: rotated_image and indicator if the page is one-column """ + import os + from datetime import datetime + import cv2 + from dedoc.utils.parameter_utils import get_path_param + columns, angle = None, None if parameters.is_one_column_document is None or parameters.document_orientation is None: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index 80055a9b..0b14f034 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -2,9 +2,10 @@ from typing import List import numpy as np -from dedocutils.data_structures import BBox +from dedocutils.data_structures.bbox import BBox -from dedoc.data_structures import ConfidenceAnnotation, LineWithMeta +from 
dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 56c7d2ae..4eaed54c 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -1,43 +1,18 @@ -import json -import math -import os -import shutil -import subprocess -import uuid from typing import List, Optional, Tuple -import numpy as np from dedocutils.data_structures import BBox +from numpy import ndarray from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError -from dedoc.data_structures.cell_with_meta import CellWithMeta -from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation -from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation -from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation -from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation -from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation -from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation -from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation -from dedoc.data_structures.concrete_annotations.style_annotation import StyleAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment -from dedoc.readers.pdf_reader.data_classes.tables.location import Location from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_page_slice, get_param_pdf_with_txt_layer, \ - get_param_with_attachments -from dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import calculate_file_hash, get_unique_name class PdfTabbyReader(PdfBaseReader): @@ -51,6 +26,9 @@ class PdfTabbyReader(PdfBaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + import os + from dedoc.extensions import recognized_extensions, recognized_mimes + 
super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) self.tabby_java_version = "2.0.0" self.jar_name = "ispras_tbl_extr.jar" @@ -67,6 +45,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == "tabby" def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -77,6 +56,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. """ + from dedoc.utils.parameter_utils import get_param_with_attachments parameters = {} if parameters is None else parameters warnings = [] lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings) @@ -93,6 +73,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure def __extract(self, path: str, parameters: dict, warnings: list)\ -> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]: + import math + from dedoc.utils.pdf_utils import get_pdf_page_count + from dedoc.utils.utils import calculate_file_hash + from dedoc.utils.parameter_utils import get_param_page_slice, get_param_with_attachments + all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], [] with_attachments = get_param_with_attachments(parameters) document_metadata = None @@ -137,6 +122,12 @@ def __extract(self, path: str, parameters: dict, warnings: list)\ return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: + import uuid + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.data_structures.table_metadata import TableMetadata + tables = [] tables_on_image = [] page_number = page["number"] @@ -178,6 +169,13 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: return tables, tables_on_image def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List[PdfImageAttachment]: + import os + import shutil + import uuid + from dedoc.readers.pdf_reader.data_classes.tables.location import Location + from dedoc.utils.utils import get_unique_name + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + attachments_dir = get_param_attachments_dir(parameters, path) need_content_analysis = get_param_need_content_analysis(parameters) @@ -204,6 +202,17 @@ def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List return image_attachment_list def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]: + from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation + from 
dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
+        from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
+        from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation
+        from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
+        from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
+        from dedoc.data_structures.concrete_annotations.style_annotation import StyleAnnotation
+        from dedoc.data_structures.line_metadata import LineMetadata
+        from dedoc.readers.pdf_reader.data_classes.tables.location import Location
+
         lines = []
         page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
         prev_line = None
@@ -260,6 +269,9 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
         return lines

     def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel:
+        from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+        from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
+
         if line_type == HierarchyLevel.header:
             header_level = get_dotted_item_depth(line.line)
             header_level = header_level if header_level != -1 else 1
@@ -271,9 +283,12 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_
         return HierarchyLevel(None, None, True, line_type)

     def __jar_path(self) -> str:
+        import os
         return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"])

     def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes:
+        import subprocess
+
         args = ["java"] + ["-jar", self.__jar_path(), "-i", path]
         if start_page is not None and end_page is not None:
             args += ["-sp", str(start_page), "-ep", str(end_page)]
@@ -288,13 +303,15 @@ def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = Non
             raise TabbyPdfError(e.stderr.decode(encoding))

     def __process_pdf(self, path: str, start_page: int = None, end_page: int = None) -> dict:
+        import json
+
         output = self.__run(path=path, start_page=start_page, end_page=end_page)
         response = output.decode("UTF-8")
         document = json.loads(response) if response else {}
         return document

     def _process_one_page(self,
-                          image: np.ndarray,
+                          image: ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
                           path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index abf9c38d..d7bb2b6a 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -1,15 +1,12 @@
 from typing import List, Optional, Tuple

-import numpy as np
 from dedocutils.data_structures import BBox
+from numpy import ndarray

-from dedoc.extensions import recognized_extensions, recognized_mimes
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
-from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
-from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer


 class PdfTxtlayerReader(PdfBaseReader):
@@ -21,7 +18,11 @@ class PdfTxtlayerReader(PdfBaseReader):
     """

     def __init__(self, *, config: Optional[dict] = None) -> None:
+        from dedoc.extensions import recognized_extensions, recognized_mimes
+
         super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format)
+
+        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
         self.extractor_layer = PdfminerExtractor(config=self.config)

     def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
@@ -33,10 +34,11 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
         """
+        from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
         return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == "true"

     def _process_one_page(self,
-                          image: np.ndarray,
+                          image: ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
                           path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
index 8d17f4d8..1d030885 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -146,7 +146,7 @@ def __extract_image(self,

     @staticmethod
     def __get_image(path: str, page_num: int) -> np.ndarray:
-        image_page = np.array(get_page_image(path=path, page_id=page_num))  # noqa
+        image_page = np.array(get_page_image(path=path, page_id=page_num))
         image_page = np.array(image_page)
         if len(image_page.shape) == 2:
             image_page = cv2.cvtColor(image_page, cv2.COLOR_GRAY2BGR)
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar
index 2d22e7c2..1115c4d6 100644
Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ
diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py
index 9b70b87c..0e562bdc 100644
--- a/dedoc/readers/pdf_reader/utils/line_object_linker.py
+++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py
@@ -57,7 +57,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i
                 self.logger.warning(f"Unsupported page object type {page_object}")
                 if self.config.get("debug_mode", False):
                     raise Exception(f"Unsupported page object type {page_object}")
-            best_line.annotations.append(annotation)  # noqa
+            best_line.annotations.append(annotation)
         return lines

     def _add_lines(self, all_objects: List[Union[LineWithLocation, ScanTable, PdfImageAttachment]], lines_key: str, objects_with_line_candidate: dict) -> None:
diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py
index 2dfcb952..129ac3a3 100644
--- a/dedoc/readers/pptx_reader/paragraph.py
+++ b/dedoc/readers/pptx_reader/paragraph.py
@@ -1,7 +1,16 @@
 from bs4 import Tag

-from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
-    StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
+from dedoc.data_structures.concrete_annotations.alignment_annotation import AlignmentAnnotation
+from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
+from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
+from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
+from dedoc.data_structures.concrete_annotations.strike_annotation import StrikeAnnotation
+from dedoc.data_structures.concrete_annotations.subscript_annotation import SubscriptAnnotation
+from dedoc.data_structures.concrete_annotations.superscript_annotation import SuperscriptAnnotation
+from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation
+from dedoc.data_structures.hierarchy_level import HierarchyLevel
+from dedoc.data_structures.line_metadata import LineMetadata
+from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
 from dedoc.utils.annotation_merger import AnnotationMerger
diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py
index 2d68b850..b3f581e6 100644
--- a/dedoc/readers/pptx_reader/pptx_reader.py
+++ b/dedoc/readers/pptx_reader/pptx_reader.py
@@ -1,21 +1,12 @@
-import zipfile
 from typing import Dict, List, Optional

 from bs4 import BeautifulSoup, Tag

-from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
-from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation
-from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.data_structures.table import Table
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
-from dedoc.extensions import recognized_extensions, recognized_mimes
 from dedoc.readers.base_reader import BaseReader
-from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
-from dedoc.readers.pptx_reader.shape import PptxShape
-from dedoc.readers.pptx_reader.table import PptxTable
-from dedoc.utils.office_utils import get_bs_from_zip
-from dedoc.utils.parameter_utils import get_param_with_attachments


 class PptxReader(BaseReader):
@@ -25,6 +16,10 @@ class PptxReader(BaseReader):
     """

     def __init__(self, *, config: Optional[dict] = None) -> None:
+        from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
+        from dedoc.extensions import recognized_extensions, recognized_mimes
+        from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+
         super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format)
         self.attachments_extractor = PptxAttachmentsExtractor(config=self.config)
         self.numbering_extractor = NumberingExtractor()
@@ -34,6 +29,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         The method return document content with all document's lines, tables and attachments.
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
+        from dedoc.data_structures.line_metadata import LineMetadata
+        from dedoc.readers.pptx_reader.shape import PptxShape
+        from dedoc.utils.parameter_utils import get_param_with_attachments
+
         with_attachments = get_param_with_attachments(parameters)
         attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
         attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
@@ -71,6 +70,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])

     def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]:
+        import zipfile
+        from dedoc.utils.office_utils import get_bs_from_zip
+
         with zipfile.ZipFile(path) as document:
             xml_names = document.namelist()
             filtered_names = [file_name for file_name in xml_names if file_name.startswith(xml_prefix) and file_name.endswith(xml_postfix)]
@@ -94,6 +96,10 @@ def __get_slide_images_rels(self, path: str) -> Dict[str, str]:
         return images_rels

     def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None:
+        from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
+        from dedoc.data_structures.line_metadata import LineMetadata
+        from dedoc.readers.pptx_reader.table import PptxTable
+
         table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table()

         if len(lines) == 0:
@@ -102,6 +108,8 @@ def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: i
         tables.append(table)

     def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None:
+        from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
+
         try:
             image_name = images_rels[image_rel_id]
             image_uid = attachment_name2uid[image_name]
diff --git a/dedoc/readers/pptx_reader/properties_extractor.py b/dedoc/readers/pptx_reader/properties_extractor.py
index 67c0c919..213bafb8 100644
--- a/dedoc/readers/pptx_reader/properties_extractor.py
+++ b/dedoc/readers/pptx_reader/properties_extractor.py
@@ -1,11 +1,8 @@
-from copy import deepcopy
 from dataclasses import dataclass
 from typing import Dict, Optional

 from bs4 import Tag

-from dedoc.utils.office_utils import get_bs_from_zip


 @dataclass
 class Properties:
@@ -42,6 +39,8 @@ def get_properties(self, xml: Tag, level: int, properties: Optional[Properties]
         """
+        from copy import deepcopy
+
         properties = properties or self.lvl2default_properties.get(level, Properties())
         new_properties = deepcopy(properties)
         if not xml:
@@ -83,6 +82,8 @@ def __update_alignment(self, xml: Tag, properties: Properties) -> None:
             properties.alignment = self.alignment_mapping[alignment]

     def __get_default_properties_mapping(self, file_path: str) -> Dict[int, Properties]:
+        from dedoc.utils.office_utils import get_bs_from_zip
+
         lvl2properties = {}
         presentation_xml = get_bs_from_zip(file_path, "ppt/presentation.xml", remove_spaces=True)
diff --git a/dedoc/readers/pptx_reader/shape.py b/dedoc/readers/pptx_reader/shape.py
index b1c548d3..f0590cbb 100644
--- a/dedoc/readers/pptx_reader/shape.py
+++ b/dedoc/readers/pptx_reader/shape.py
@@ -3,7 +3,7 @@
 from bs4 import Tag

-from dedoc.data_structures import LineWithMeta
+from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.pptx_reader.paragraph import PptxParagraph
 from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
diff --git a/dedoc/readers/pptx_reader/table.py b/dedoc/readers/pptx_reader/table.py
index cbe7febb..d711d8ab 100644
--- a/dedoc/readers/pptx_reader/table.py
+++ b/dedoc/readers/pptx_reader/table.py
@@ -2,7 +2,9 @@
 from bs4 import Tag

-from dedoc.data_structures import CellWithMeta, Table, TableMetadata
+from dedoc.data_structures.cell_with_meta import CellWithMeta
+from dedoc.data_structures.table import Table
+from dedoc.data_structures.table_metadata import TableMetadata
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
 from dedoc.readers.pptx_reader.shape import PptxShape
diff --git a/dedoc/readers/reader_composition.py b/dedoc/readers/reader_composition.py
index 6fd23e06..f37ffe4d 100644
--- a/dedoc/readers/reader_composition.py
+++ b/dedoc/readers/reader_composition.py
@@ -1,10 +1,8 @@
-import os
 from typing import List, Optional

 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.readers.base_reader import BaseReader
-from dedoc.utils.utils import get_mime_extension


 class ReaderComposition(object):
@@ -31,6 +29,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None, extension: Opt
         :param mime: MIME type of file
         :return: intermediate representation of the document with lines, tables and attachments
         """
+        import os
+        from dedoc.utils.utils import get_mime_extension
+
         mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)

         for reader in self.readers:
diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py
index f834b371..2cb13f6d 100644
--- a/dedoc/readers/txt_reader/raw_text_reader.py
+++ b/dedoc/readers/txt_reader/raw_text_reader.py
@@ -1,20 +1,9 @@
-import codecs
-import gzip
-import re
-import time
 from typing import Iterable, List, Optional, Tuple
-from unicodedata import normalize

 from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
-from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
-from dedoc.data_structures.hierarchy_level import HierarchyLevel
-from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
-from dedoc.extensions import recognized_extensions, recognized_mimes
 from dedoc.readers.base_reader import BaseReader
-from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
-from dedoc.utils.utils import calculate_file_hash, get_encoding, get_mime_extension


 class RawTextReader(BaseReader):
@@ -23,6 +12,9 @@ class RawTextReader(BaseReader):
     """

     def __init__(self, *, config: Optional[dict] = None) -> None:
+        import re
+        from dedoc.extensions import recognized_extensions, recognized_mimes
+
         super().__init__(config=config, recognized_extensions=recognized_extensions.txt_like_format, recognized_mimes=recognized_mimes.txt_like_format)
         self.space_regexp = re.compile(r"^\s+")

@@ -31,6 +23,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
         Check if the document extension is suitable for this reader.
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
         """
+        from dedoc.utils.utils import get_mime_extension
+
         mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
         # this code differs from BaseReader because other formats can have text/plain mime type
         if extension:
@@ -50,12 +44,20 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         return self._postprocess(result)

     def __get_encoding(self, path: str, parameters: dict) -> str:
+        from dedoc.utils.utils import get_encoding
+
         if parameters.get("encoding"):
             return parameters["encoding"]
         else:
             return get_encoding(path, "utf-8")

     def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
+        import time
+        from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
+        from dedoc.data_structures.line_metadata import LineMetadata
+        from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+        from dedoc.utils.utils import calculate_file_hash
+
         lines = []
         file_hash = calculate_file_hash(path=path)
         number_of_empty_lines = 0
@@ -86,6 +88,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
         return lines

     def __get_lines(self, path: str, encoding: str) -> Iterable[Tuple[int, str]]:
+        import codecs
+        import gzip
+        from unicodedata import normalize
+
         if path.lower().endswith("txt"):
             with codecs.open(path, errors="ignore", encoding=encoding) as file:
                 for line_id, line in enumerate(file):
@@ -107,6 +113,8 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
         return space_this.end() - space_this.start()

     def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
+
         if not line.metadata.tag_hierarchy_level.can_be_multiline and \
                 line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown):
             return True
diff --git a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py
index 86e2522e..f33605b8 100644
--- a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py
+++ b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py
@@ -1,9 +1,6 @@
 from typing import Optional

-from dedoc.data_structures.document_content import DocumentContent
-from dedoc.data_structures.document_metadata import DocumentMetadata
 from dedoc.data_structures.parsed_document import ParsedDocument
-from dedoc.data_structures.tree_node import TreeNode
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_constructors.abstract_structure_constructor import AbstractStructureConstructor

@@ -19,6 +16,10 @@ def construct(self, document: UnstructuredDocument, parameters: Optional[dict] =
         Build the linear structure representation for the given document intermediate representation.
         To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`.
         """
+        from dedoc.data_structures.document_content import DocumentContent
+        from dedoc.data_structures.document_metadata import DocumentMetadata
+        from dedoc.data_structures.tree_node import TreeNode
+
         lines = document.lines
         tree = TreeNode.create(lines=[])
         for line in lines:
diff --git a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py
index 5c986c1b..14e00c80 100644
--- a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py
+++ b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py
@@ -1,12 +1,7 @@
 from typing import List, Optional, Tuple

-from dedoc.data_structures.document_content import DocumentContent
-from dedoc.data_structures.document_metadata import DocumentMetadata
-from dedoc.data_structures.hierarchy_level import HierarchyLevel
-from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.parsed_document import ParsedDocument
-from dedoc.data_structures.tree_node import TreeNode
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_constructors.abstract_structure_constructor import AbstractStructureConstructor

@@ -38,6 +33,11 @@ def construct(self, document: UnstructuredDocument, parameters: Optional[dict] =
         Build the tree structure representation for the given document intermediate representation.
         To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`.
         """
+        from dedoc.data_structures.document_content import DocumentContent
+        from dedoc.data_structures.document_metadata import DocumentMetadata
+
+        from dedoc.data_structures.tree_node import TreeNode
+
         document_name, not_document_name = self.__get_document_name(document.lines)
         not_document_name = self.__add_lists(not_document_name)
         tree = TreeNode.create(lines=document_name)
@@ -88,9 +88,12 @@ def __add_lists(self, not_document_name: List[LineWithMet
     @staticmethod
     def __create_list_line(line: LineWithMeta) -> LineWithMeta:
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
+        from dedoc.data_structures.line_metadata import LineMetadata
+
         hierarchy_level = HierarchyLevel(
             level_1=line.metadata.hierarchy_level.level_1,
-            level_2=line.metadata.hierarchy_level.level_2 - 0.5,  # noqa it is intentionaly for lists
+            level_2=line.metadata.hierarchy_level.level_2 - 0.5,
             line_type="list",
             can_be_multiline=False
         )
diff --git a/dedoc/structure_extractors/abstract_structure_extractor.py b/dedoc/structure_extractors/abstract_structure_extractor.py
index 54b9633a..e9e57bdd 100644
--- a/dedoc/structure_extractors/abstract_structure_extractor.py
+++ b/dedoc/structure_extractors/abstract_structure_extractor.py
@@ -1,12 +1,7 @@
-import logging
 from abc import ABC, abstractmethod
-from copy import deepcopy
 from typing import List, Optional

 from dedoc.data_structures.annotation import Annotation
-from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
-from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
-from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument

@@ -26,6 +21,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         """
         :param config: configuration of the extractor, e.g. logger for logging
         """
+        import logging
+
         self.config = {} if config is None else config
         self.logger = self.config.get("logger", logging.getLogger())

@@ -54,6 +51,9 @@ def _postprocess(self, lines: List[LineWithMeta], paragraph_type: List[str], reg
         :param excluding_regexps: list of filtering garbage regular pattern according to list of paragraph types
         :return: new post-processed list of LineWithMeta
         """
+        from copy import deepcopy
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
+
         if self.config.get("labeling_mode", False):
             return lines

@@ -95,6 +95,9 @@ def _postprocess(self, lines: List[LineWithMeta], paragraph_type: List[str], reg

     @staticmethod
     def _select_annotations(annotations: List[Annotation], start: int, end: int) -> List[Annotation]:
+        from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
+        from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
+
         assert start <= end
         res = []
         for annotation in annotations:
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
index 142982c2..16b7794d 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
@@ -1,16 +1,10 @@
-import os
 from abc import ABC, abstractmethod
 from typing import List, Optional, Tuple

-from dedoc.config import get_config
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
-from dedoc.extensions import recognized_mimes
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
-from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.stub_hierarchy_level_builder import StubHierarchyLevelBuilder
-from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier


 class AbstractLawStructureExtractor(AbstractStructureExtractor, ABC):
@@ -25,6 +19,12 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         :param config: some configuration for document parsing
         """
         super().__init__(config=config)
+        import os
+        from dedoc.config import get_config
+
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.stub_hierarchy_level_builder import StubHierarchyLevelBuilder
+        from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier
+
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
         self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config)
         self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config)
@@ -39,6 +39,8 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
         To get the information about the method's parameters look at the documentation of the class \
         :class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
         """
+        from dedoc.extensions import recognized_mimes
+
         if document.metadata.get("file_type") in recognized_mimes.txt_like_format:
             document.lines = self.__preprocess_lines(document.lines)
             predictions = self.txt_classifier.predict(document.lines)
@@ -173,8 +175,10 @@ def __get_result(self, application_start: int, labels: List[str], last_body_unit
         return result

     def _postprocess_roman(self, hierarchy_level: HierarchyLevel, line: LineWithMeta) -> LineWithMeta:
-        if hierarchy_level.line_type == "subsection" and LawTextFeatures.roman_regexp.match(line.line):
-            match = LawTextFeatures.roman_regexp.match(line.line)
+        from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import roman_regexp
+
+        if hierarchy_level.line_type == "subsection" and roman_regexp.match(line.line):
+            match = roman_regexp.match(line.line)
             prefix = line.line[match.start(): match.end()]
             suffix = line.line[match.end():]
             symbols = [("T", "I"), ("Т", "I"), ("У", "V"), ("П", "II"), ("Ш", "III"), ("Г", "I")]
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
index 8e6e4a50..a4a81dc7 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
@@ -1,8 +1,8 @@
 from typing import List, Optional

-from dedoc.data_structures import HierarchyLevel, UnstructuredDocument
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.structure_extractors import AbstractStructureExtractor
+from dedoc.data_structures.unstructured_document import UnstructuredDocument
+from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor


 class ArticleStructureExtractor(AbstractStructureExtractor):
@@ -22,6 +22,8 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
         To get the information about the method's parameters look at the documentation of the class \
         :class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
         """
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
+
         for line in document.lines:
             if line.metadata.tag_hierarchy_level is None or line.metadata.tag_hierarchy_level.is_unknown():
                 line.metadata.tag_hierarchy_level = HierarchyLevel.create_raw_text()
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py
index 324f4622..cef62b9e 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py
@@ -1,14 +1,10 @@
-import re
 from abc import ABC
-from collections import OrderedDict
 from enum import Enum
 from typing import Dict, Iterable, List, Optional

 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
-from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
-from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor


 class LawDocType(Enum):
@@ -129,6 +125,8 @@ def __type_detect(self, lines: List[str]) -> Optional[LawDocType]:
         Search for type N in first lines. Roud robin type search for each line batch.
         """
+        import re
+
         first_lines = self.__create_line_batches(lines, batch_size=self.hat_batch_size, batch_count=self.hat_batch_count)

         # Hack for ЗАКОН ... КОДЕКС ...
@@ -151,6 +149,9 @@ def __type_detect(self, lines: List[str]) -> Optional[LawDocType]:
         return None

     def __get_extractor_by_type(self, doc_type: Optional[LawDocType]) -> AbstractStructureExtractor:
+        from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
+        from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
+
         if doc_type is None:
             self.logger.info(f"Dynamic document type not found, using base: {LawStructureExtractor.document_type}")
             return self.extractors[LawStructureExtractor.document_type]
@@ -199,6 +200,8 @@ def __create_line_batches(self, lines: List[str], batch_size: int, batch_count:
         return batch_lines

     def __text_clean(self, text: str) -> str:
+        from collections import OrderedDict
+
         bad_characters = OrderedDict({"\u0438\u0306": "й", "\u0439\u0306": "й", "\u0418\u0306": "Й", "\u0419\u0306": "Й"})
         for bad_c, good_c in bad_characters.items():
             text = text.replace(bad_c, good_c)
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
index da6e40cf..3bfaeb21 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
@@ -4,12 +4,6 @@
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
-from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix


 class DefaultStructureExtractor(AbstractStructureExtractor):
@@ -18,9 +12,15 @@ class DefaultStructureExtractor(AbstractStructureExtractor):
     You can find the description of this type of structure in the section :ref:`other_structure`.
     """
+    from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
+    from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
+    from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
+    from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
+    from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix
+
     document_type = "other"
-    prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, LetterPrefix, BulletPrefix]
+    prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, AnyLetterPrefix, BulletPrefix]

     def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
@@ -62,6 +62,12 @@ def __get_hl_with_tag(self, line: LineWithMeta) -> HierarchyLevel:

     @staticmethod
     def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> HierarchyLevel:
+        from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix
+        from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
+        from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
+        from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
+        from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
+
         prefix = get_prefix(DefaultStructureExtractor.prefix_list, line)

         # TODO dotted list without space after numbering, like "1.Some text"
@@ -77,7 +83,7 @@ def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWit
                 return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item)  # here is russian and english letters
             return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item)

-        if prefix.name == LetterPrefix.name:  # list like a)
+        if prefix.name == AnyLetterPrefix.name:  # list like a)
             return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item)

         if prefix.name == BulletPrefix.name:  # bullet list
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
index c08674c6..28508d50 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
@@ -1,16 +1,8 @@
-import os
-import re
 from typing import List, Optional

-from dedoc.config import get_config
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
-from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
-from dedoc.structure_extractors.hierarchy_level_builders.diploma_builder.body_builder import DiplomaBodyBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.toc_builder.toc_builder import TocBuilder
-from dedoc.structure_extractors.line_type_classifiers.diploma_classifier import DiplomaLineTypeClassifier


 class DiplomaStructureExtractor(AbstractStructureExtractor):
@@ -26,6 +18,15 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         :param config: some configuration for document parsing
         """
         super().__init__(config=config)
+        import os
+        import re
+        from dedoc.config import get_config
+        from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
+        from dedoc.structure_extractors.hierarchy_level_builders.diploma_builder.body_builder import DiplomaBodyBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.toc_builder.toc_builder import TocBuilder
+        from dedoc.structure_extractors.line_type_classifiers.diploma_classifier import DiplomaLineTypeClassifier
+
         self.toc_extractor = TOCFeatureExtractor()
         self.header_builder = HeaderHierarchyLevelBuilder()
         self.toc_builder = TocBuilder()
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
index 0d78c783..a070ee06 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
@@ -1,15 +1,10 @@
-import os
-import re
 from typing import Dict, List, Optional, Tuple, Union

-import pandas as pd
+from pandas import DataFrame

-from dedoc.config import get_config
-from dedoc.data_structures import HierarchyLevel, LineWithMeta, UnstructuredDocument
-from dedoc.structure_extractors import AbstractStructureExtractor
-from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
-from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
-from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.data_structures.unstructured_document import UnstructuredDocument
+from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor


 class FintocStructureExtractor(AbstractStructureExtractor):
@@ -25,7 +20,14 @@ class FintocStructureExtractor(AbstractStructureExtractor):

     def __init__(self, *, config: Optional[dict] = None) -> None:
         super().__init__(config=config)
-        from dedoc.readers import PdfTxtlayerReader  # to exclude circular imports
+        import os
+        import re
+        from dedoc.config import get_config
+        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader  # to exclude circular imports
+        from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
+        from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
+        from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier
+
         self.pdf_reader = PdfTxtlayerReader(config=self.config)
         self.toc_extractor = TOCFeatureExtractor()
         self.features_extractor = FintocFeatureExtractor()
@@ -54,6 +56,8 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
         :param file_path: path to the file on disk.
         :return: document content with added additional information about title/non-title lines and hierarchy levels of titles.
         """
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
+
         parameters = {} if parameters is None else parameters
         language = self.__get_param_language(parameters=parameters)

@@ -87,7 +91,7 @@ def __get_param_language(self, parameters: dict) -> str:
             self.logger.warning(f"Language {language} is not supported by this extractor. Use default language (en)")
             return "en"

-    def get_features(self, documents_dict: Dict[str, List[LineWithMeta]]) -> Tuple[pd.DataFrame, List[List[LineWithMeta]]]:
+    def get_features(self, documents_dict: Dict[str, List[LineWithMeta]]) -> Tuple[DataFrame, List[List[LineWithMeta]]]:
         toc_lines, documents = [], []
         for file_path, document_lines in documents_dict.items():
             toc_lines.append(self.__get_toc(file_path=file_path))
@@ -109,6 +113,8 @@ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMe
         """
         Try to get TOC from PDF automatically. If TOC wasn't extracted automatically, it is extracted using regular expressions.
         """
+        import os
+
         if file_path is None or not file_path.lower().endswith(".pdf"):
             return []

@@ -122,6 +128,8 @@ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMe
         return self.toc_extractor.get_toc(lines)

     def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]:
+        import os
+
         result = []
         with os.popen(f'pdftocio -p "{path}"') as out:
             toc = out.readlines()
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py
index 549603dc..227ed3c7 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py
@@ -2,12 +2,6 @@
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor
-from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_foiv_hierarchy_level_builder import \
-    ApplicationFoivHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_foiv_hierarchy_level_builder import BodyFoivHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.cellar_builder import CellarHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_foiv_item


 class FoivLawStructureExtractor(AbstractLawStructureExtractor):
@@ -19,7 +13,16 @@ class FoivLawStructureExtractor(AbstractLawStructureExtractor):
     document_type = "foiv_law"

     def __init__(self, *, config: Optional[dict] = None) -> None:
+        from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number
         super().__init__(config=config)
+
+        from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_foiv_hierarchy_level_builder import \
+            ApplicationFoivHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_foiv_hierarchy_level_builder import \
+            BodyFoivHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.cellar_builder import CellarHierarchyLevelBuilder
+
         self.hierarchy_level_builders = [
             HeaderHierarchyLevelBuilder(),
             BodyFoivHierarchyLevelBuilder(),
@@ -33,6 +36,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         self.hl_type = "foiv"

     def _postprocess_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
+        from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_foiv_item
         return self._postprocess(lines=lines,
                                  paragraph_type=["item", "subitem", "subitem"],
                                  regexps=[regexps_foiv_item, self.regexps_subitem_with_number, self.regexps_subitem_with_char],
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py
index f360011a..f94bdedf 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py
@@ -1,14 +1,7 @@
-import re
 from typing import List, Optional

 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor
-from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_law_hierarchy_level_builder import \
-    ApplicationLawHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import BodyLawHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.cellar_builder import CellarHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_number


 class LawStructureExtractor(AbstractLawStructureExtractor):
@@ -21,6 +14,15 @@ class LawStructureExtractor(AbstractLawStructureExtractor):

     def __init__(self, *, config: Optional[dict] = None) -> None:
         super().__init__(config=config)
+
+        import re
+        from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_number
+        from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_law_hierarchy_level_builder import \
+            ApplicationLawHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import BodyLawHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.law_builders.cellar_builder import CellarHierarchyLevelBuilder
+
         self.hierarchy_level_builders = [
             HeaderHierarchyLevelBuilder(),
             BodyLawHierarchyLevelBuilder(),
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
index b48efa57..15cc3d86 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
@@ -1,16 +1,7 @@
-import os
 from typing import Optional

-from dedoc.config import get_config
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
-from dedoc.extensions import recognized_mimes
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
-from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.toc_builder.toc_builder import TocBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.tz_builder.body_builder import TzBodyBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_number, regexps_subitem
-from dedoc.structure_extractors.line_type_classifiers.tz_classifier import TzLineTypeClassifier


 class TzStructureExtractor(AbstractStructureExtractor):
@@ -26,6 +17,14 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         :param config: some configuration for document parsing
         """
         super().__init__(config=config)
+
+        import os
+        from dedoc.config import get_config
+        from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.toc_builder.toc_builder import TocBuilder
+        from dedoc.structure_extractors.hierarchy_level_builders.tz_builder.body_builder import TzBodyBuilder
+        from dedoc.structure_extractors.line_type_classifiers.tz_classifier import TzLineTypeClassifier
+
         self.header_builder = HeaderHierarchyLevelBuilder()
         self.body_builder = TzBodyBuilder()
         self.toc_builder = TocBuilder()
@@ -39,6 +38,10 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
         To get the information about the method's parameters look at the documentation of the class \
         :class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
         """
+        from dedoc.extensions import recognized_mimes
+        from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_number, regexps_subitem
+        from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
+
         if document.metadata.get("file_type") in recognized_mimes.txt_like_format:
             predictions = self.txt_classifier.predict(document.lines)
         else:
diff --git a/dedoc/structure_extractors/feature_extractors/law_text_features.py b/dedoc/structure_extractors/feature_extractors/law_text_features.py
index 8969af04..6e039119 100644
--- a/dedoc/structure_extractors/feature_extractors/law_text_features.py
+++ b/dedoc/structure_extractors/feature_extractors/law_text_features.py
@@ -8,7 +8,7 @@
 from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
 from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import ListFeaturesExtractor
 from dedoc.structure_extractors.feature_extractors.utils_feature_extractor import normalization_by_min_max
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_year
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_year, roman_regexp


 class LawTextFeatures(AbstractFeatureExtractor):
@@ -18,7 +18,6 @@ class LawTextFeatures(AbstractFeatureExtractor):
     named_regexp = [
         re.compile(r"^(Статья|(Г|г)лава|ГЛАВА|ЧАСТЬ|Часть|Раздел|РАЗДЕЛ|\$|§)\s*((\d+\.*)+|[IVXХxхviУП]{1,3}\.?)\s*")
     ]
-    roman_regexp = re.compile(r"\s*(I|Г|T|Т|II|П|III|Ш|ТУ|TУ|IV|V|У|VI|УТ|УT|VII|УТТ|VIII|I[XХ]|[XХ]|[XХ]I|[XХ]II)\.\s+")
     regexp_application_begin = re.compile(
         r"^(\'|\")?(((П|п)риложение)|((У|у)твержден)[оаы]?){1}(( )*([№nN]?( )*(\d){1,3})?( )*)"
         r"((к распоряжению)|(к постановлению)|(к приказу))?\s*$"
     )
@@ -37,7 +36,7 @@ class LawTextFeatures(AbstractFeatureExtractor):

     def __init__(self, text_features_only: bool = False) -> None:
         super().__init__()
-        self.regexps_start = self.regexps_items + self.regexps_subitem + [self.roman_regexp]
+        self.regexps_start = self.regexps_items + self.regexps_subitem + [roman_regexp]
         self.text_features_only = text_features_only

     def parameters(self) -> dict:
@@ -140,7 +139,7 @@ def _one_line_features(self, line: LineWithMeta, total_lines: int, start_page: i
             bracket_cnt = max(0, bracket_cnt - 1)
         yield "bracket_num", bracket_cnt

-        if self.roman_regexp.match(line.line) and len(line.line.strip()) > 3:
+        if roman_regexp.match(line.line) and len(line.line.strip()) > 3:
             yield "roman_regexp", 1
         else:
             yield "roman_regexp", 0
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py
index f7e4e457..38a1cd82 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py
@@ -20,7 +20,7 @@ class AnyLetterPrefix(LinePrefix):

     regexp = re.compile(r"^\s*\w\)")

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return isinstance(other, AnyLetterPrefix)

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py
index c9a67d90..eb712a1a 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py
@@ -18,7 +18,7 @@ def __init__(self, prefix: str, indent: float) -> None:
         super().__init__(prefix, indent=indent)
         self.prefix_num = int(self.prefix[:-1])

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return isinstance(other, BracketPrefix) and self.prefix_num == other.prefix_num + 1

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
index d71a6de9..8089d53e 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
@@ -22,7 +22,7 @@ def __init__(self, prefix: str, indent: float) -> None:
         super().__init__(prefix, indent=indent)
         self.prefix_num = roman.fromRoman(self.prefix[:-1].upper().strip())

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return isinstance(other, BracketRomanPrefix) and self.prefix_num == other.prefix_num + 1

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bullet_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bullet_prefix.py
index 6d5f6235..0c5402a3 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bullet_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bullet_prefix.py
@@ -19,7 +19,7 @@ class BulletPrefix(LinePrefix):

     regexp = re.compile(r"^\s*(-|—|−|–|®|\.|•|\,|‚|©|⎯|°|\*|>|\| -|●|♣|①|▪|\*|\+)")

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return isinstance(other, BulletPrefix) and self.prefix == other.prefix

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/empty_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/empty_prefix.py
index ed36d564..dcc1c416 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/empty_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/empty_prefix.py
@@ -11,7 +11,7 @@ class EmptyPrefix(LinePrefix):
     def __init__(self, prefix: str = None, indent: float = 0) -> None:
         super().__init__("", indent=indent)

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return False

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py
index 6ad602ab..fc198308 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py
@@ -29,7 +29,7 @@ def order(self) -> float:
         else:
             return ord(letter)

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return isinstance(other, LetterPrefix) and 1 >= (self.order - other.order) > 0

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
index e30c6a44..02264584 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
@@ -23,7 +23,7 @@ def __init__(self, prefix: str, indent: float) -> None:
         super().__init__(prefix, indent=indent)
         self.prefix_num = roman.fromRoman(self.prefix[:-1].upper().strip())

-    def predecessor(self, other: "LinePrefix") -> bool:
+    def predecessor(self, other: LinePrefix) -> bool:
         return isinstance(other, RomanPrefix) and self.prefix_num == other.prefix_num + 1

     @staticmethod
diff --git a/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
index 130a5560..8b656173 100755
--- a/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
@@ -54,7 +54,7 @@ def _handle_one_document(self, document: List[LineWithMeta], get_feature: method
         stack = []

         for line in document:
-            while len(stack) > 0 and self.__compare_lines(stack[-1], line, get_feature, std) <= 0:  # noqa
+            while len(stack) > 0 and self.__compare_lines(stack[-1], line, get_feature, std) <= 0:
                 stack.pop()
             result.append(len(stack))
             stack.append(line)
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py
index e4123fa4..d94a892d 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py
@@ -3,7 +3,7 @@
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import roman_regexp


 class AbstractHierarchyLevelBuilder(abc.ABC):
@@ -39,8 +39,8 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s

     @staticmethod
     def _postprocess_roman(hierarchy_level: HierarchyLevel, line: LineWithMeta) -> LineWithMeta:
-        if hierarchy_level.line_type == "subsection" and LawTextFeatures.roman_regexp.match(line.line):
-            match = LawTextFeatures.roman_regexp.match(line.line)
+        if hierarchy_level.line_type == "subsection" and roman_regexp.match(line.line):
+            match = roman_regexp.match(line.line)
         prefix = line.line[match.start(): match.end()]
         suffix = line.line[match.end():]
         symbols = [("T", "I"), ("Т", "I"), ("У", "V"), ("П", "II"), ("Ш", "III"), ("Г", "I")]
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
index 7f26fad1..c935d8b4 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
@@ -1,9 +1,9 @@
 from typing import List, Tuple

-from dedoc.data_structures import BoldAnnotation
+from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.structure_extractors import DefaultStructureExtractor
+from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
 from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
 from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py
index 1343e7d6..99dc8f2b 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py
@@ -8,7 +8,7 @@
 from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.abstract_structure_unit import AbstractStructureUnit
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_item_with_bracket, regexps_number
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_item_with_bracket, regexps_number, roman_regexp


 class AbstractApplicationHierarchyLevelBuilder(AbstractHierarchyLevelBuilder, abc.ABC):
@@ -16,6 +16,7 @@ class AbstractApplicationHierarchyLevelBuilder(AbstractHierarchyLevelBuilder, ab
     regexps_item = regexps_item_with_bracket
     regexps_part = regexps_number
     regexp_application_begin = LawTextFeatures.regexp_application_begin
+    roman_regexp = roman_regexp

     @property
     @abc.abstractmethod
@@ -63,7 +64,7 @@ def _line_2level(self, text: str, label: str, init_hl_depth: int, previous_hl: H
             label = "application"
         if label == "raw_text" and LawTextFeatures.regexp_application_begin.match(text):
             label = "application"
-        if (label == "application" or label == "raw_text") and LawTextFeatures.roman_regexp.match(text):
+        if (label == "application" or label == "raw_text") and roman_regexp.match(text):
             label = "structure_unit"

         if label == "structure_unit":
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py
index f7efcbae..eafd25a9 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py
@@ -6,10 +6,10 @@
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.abstract_structure_unit import AbstractStructureUnit
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_item_with_bracket, regexps_number, regexps_subitem
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_item_with_bracket, regexps_number, \
+    regexps_subitem, roman_regexp


 class AbstractBodyHierarchyLevelBuilder(AbstractHierarchyLevelBuilder, abc.ABC):
@@ -18,6 +18,7 @@ class AbstractBodyHierarchyLevelBuilder(AbstractHierarchyLevelBuilder, abc.ABC):
     regexps_part = regexps_number
     ends_of_number = regexps_ends_of_number
     regexps_subitem = regexps_subitem
+    roman_regexp = roman_regexp

     @property
     @abc.abstractmethod
@@ -68,7 +69,7 @@ def _line_2level(self, text: str, label: str, init_hl_depth: int, previous_hl: H
         if label == "header":
             label = "raw_text"

-        if (label in ("application", "raw_text", "cellar")) and LawTextFeatures.roman_regexp.match(text):
+        if (label in ("application", "raw_text", "cellar")) and self.roman_regexp.match(text):
             label = "structure_unit"

         if label == "structure_unit":
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py
index 53666e89..41ea9ad2 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py
@@ -1,9 +1,8 @@
 from typing import Optional, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
-from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.abstract_structure_unit import AbstractStructureUnit
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_foiv_item, regexps_item_with_bracket, regexps_subitem
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_foiv_item, regexps_item_with_bracket, regexps_subitem, roman_regexp


 class FoivStructureUnitBuilder(AbstractStructureUnit):
@@ -13,7 +12,7 @@ class FoivStructureUnitBuilder(AbstractStructureUnit):
     regexps_subitem_with_number = regexps_item_with_bracket

     def structure_unit(self, text: str, init_hl_depth: int, previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]:
-        if text.lower().startswith("глава") or LawTextFeatures.roman_regexp.match(text):
+        if text.lower().startswith("глава") or roman_regexp.match(text):
             hl = HierarchyLevel(init_hl_depth + 4, 0, True, "chapter")
             return hl, hl
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py
index e9a64a35..a700350f 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py
@@ -1,9 +1,9 @@
 from typing import Optional, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
-from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.abstract_structure_unit import AbstractStructureUnit
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_foiv_item, regexps_item_with_bracket, regexps_subitem
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_foiv_item, regexps_item_with_bracket, \
+    regexps_subitem, roman_regexp


 class LawStructureUnitBuilder(AbstractStructureUnit):
@@ -12,6 +12,7 @@ class LawStructureUnitBuilder(AbstractStructureUnit):
     regexps_part = regexps_foiv_item
     ends_of_number = regexps_ends_of_number
     regexps_subitem = regexps_subitem
+    roman_regexp = roman_regexp

     def structure_unit(self, text: str, init_hl_depth: int, previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]:
         if text.lower().startswith("часть"):
@@ -20,7 +21,7 @@ def structure_unit(self, text: str, init_hl_depth: int, previous_hl: Optional[Hi
         if text.lower().startswith("раздел"):
             hl = HierarchyLevel(init_hl_depth + 2, 0, True, "section")  # 4
             return hl, hl
-        if LawTextFeatures.roman_regexp.match(text):  # match roman numbers
+        if self.roman_regexp.match(text):  # match roman numbers
             hl = HierarchyLevel(init_hl_depth + 3, 0, True, "subsection")  # 5
             return hl, hl
         if text.lower().startswith("глава"):
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py b/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py
index 32ca7a41..046b2fe5 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py
@@ -17,3 +17,4 @@

 # others
 regexps_year = re.compile(r"(19\d\d|20\d\d)")
+roman_regexp = re.compile(r"\s*(I|Г|T|Т|II|П|III|Ш|ТУ|TУ|IV|V|У|VI|УТ|УT|VII|УТТ|VIII|I[XХ]|[XХ]|[XХ]I|[XХ]II)\.\s+")
diff --git a/dedoc/structure_extractors/line_type_classifiers/law_classifier.py b/dedoc/structure_extractors/line_type_classifiers/law_classifier.py
index b79e6be7..6f116fe3 100644
--- a/dedoc/structure_extractors/line_type_classifiers/law_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/law_classifier.py
@@ -23,10 +23,10 @@ def predict(self, lines: List[LineWithMeta]) -> List[str]:
             return []

         features = self.feature_extractor.transform([lines])
-        labels_probability = self.classifier.predict_proba(features)  # noqa
+        labels_probability = self.classifier.predict_proba(features)

         # mark lines inside quotes as raw_text
-        inside_quotes = np.array(LawTextFeatures()._inside_quotes(lines), dtype=bool)  # noqa
+        inside_quotes = np.array(LawTextFeatures()._inside_quotes(lines), dtype=bool)
         raw_text_id = list(self.classifier.classes_).index("raw_text")
         labels_probability[inside_quotes, raw_text_id] = 1
         labels = [self.classifier.classes_[label_id] for label_id in labels_probability.argmax(1)]
diff --git a/dedoc/utils/image_utils.py b/dedoc/utils/image_utils.py
index 4014e621..ea7d963a 100644
--- a/dedoc/utils/image_utils.py
+++ b/dedoc/utils/image_utils.py
@@ -1,12 +1,9 @@
 from copy import deepcopy
 from typing import List, Tuple

-import PIL
-import cv2
 import numpy as np
 from PIL import Image, ImageDraw
 from dedocutils.data_structures import BBox
-from scipy.ndimage import maximum_filter


 def get_highest_pixel_frequency(image: np.ndarray) -> int:
@@ -18,7 +15,7 @@ def get_highest_pixel_frequency(image: np.ndarray) -> int:
     return color


-def get_bbox_from_image(image: Image, bbox: BBox, resize: Tuple[int, int] =
(300, 15)) -> PIL: +def get_bbox_from_image(image: Image.Image, bbox: BBox, resize: Tuple[int, int] = (300, 15)) -> Image.Image: """ take image and bbox and crop bbox from this image, resize it if necessary and return. @param image: pil image @@ -29,7 +26,7 @@ def get_bbox_from_image(image: Image, bbox: BBox, resize: Tuple[int, int] = (300 """ rectangle = (bbox.x_top_left, bbox.y_top_left, bbox.x_bottom_right, bbox.y_bottom_right) if isinstance(image, np.ndarray): - image = PIL.Image.fromarray(image) + image = Image.fromarray(image) cropped = image.crop(rectangle) if resize is not None: cropped = cropped.resize((300, 15)).convert("RGB") @@ -41,6 +38,8 @@ def rotate_image(image: np.ndarray, angle: float, color_bound: Tuple[int, int, i Rotates an image (angle in degrees) and expands image to avoid cropping (do bounds of color_bound) Changes width and height of image (image.shape != rotated_image.shape) """ + import cv2 + height, width = image.shape[:2] image_center = (width / 2, height / 2) rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.) @@ -64,6 +63,9 @@ def crop_image_text(image: np.ndarray) -> BBox: @param image: original image @return: cropped image """ + import cv2 + from scipy.ndimage import maximum_filter + im_height, im_width = image.shape[0], image.shape[1] edges = cv2.Canny(image, 100, 200) edges = maximum_filter(edges, (10, 10)) @@ -79,7 +81,7 @@ def crop_image_text(image: np.ndarray) -> BBox: return BBox(x_top_left=0, y_top_left=0, width=im_width, height=im_height) -def draw_rectangle(image: PIL.Image, x_top_left: int, y_top_left: int, width: int, height: int, color: Tuple[int, int, int] = (0, 0, 0)) -> np.ndarray: +def draw_rectangle(image: Image.Image, x_top_left: int, y_top_left: int, width: int, height: int, color: Tuple[int, int, int] = (0, 0, 0)) -> np.ndarray: if color == "black": color = (0, 0, 0) source_img = deepcopy(image).convert("RGBA") @@ -94,7 +96,7 @@ def draw_rectangle(image: PIL.Image, x_top_left: int, y_top_left: int, width: in return np.array(source_img) -def get_concat_v(images: List[Image.Image]) -> Image: +def get_concat_v(images: List[Image.Image]) -> Image.Image: if len(images) == 1: return images[0] width = max((image.width for image in images)) diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index 0060e38e..461f3654 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -3,7 +3,7 @@ from logging import Logger from typing import Any, Dict, Optional, Tuple -from dedoc.config import RESOURCES_PATH, get_config +from dedoc.config import get_config def get_param_language(parameters: Optional[dict]) -> str: @@ -164,7 +164,7 @@ def get_path_param(parameters: Optional[dict], path_key: str) -> str: if path_value is None: default_config = get_config() - path_value = default_config.get(path_key, RESOURCES_PATH) + path_value = default_config.get(path_key, default_config["resources_path"]) os.makedirs(path_value, exist_ok=True) return path_value diff --git a/dedoc/utils/pdf_utils.py b/dedoc/utils/pdf_utils.py index ba574dfd..ddc4fe60 100644 --- a/dedoc/utils/pdf_utils.py +++ b/dedoc/utils/pdf_utils.py @@ -1,11 +1,10 @@ from typing import Optional from PIL.Image import Image -from pdf2image import convert_from_path -from pypdf import PdfReader def get_pdf_page_count(path: str) -> Optional[int]: + from pypdf import PdfReader try: reader = PdfReader(path) return len(reader.pages) @@ -20,5 +19,7 @@ def get_page_image(path: str, page_id: int) -> Optional[Image]: @param page_id: page id starts from 
zero @return: pil image if success None otherwise """ + from pdf2image import convert_from_path + images = convert_from_path(path, first_page=page_id + 1, last_page=page_id + 1) return images[0] if len(images) > 0 else None diff --git a/dedoc/utils/train_dataset_utils.py b/dedoc/utils/train_dataset_utils.py index fc29e8f4..7770fe26 100644 --- a/dedoc/utils/train_dataset_utils.py +++ b/dedoc/utils/train_dataset_utils.py @@ -2,13 +2,14 @@ import os from typing import List -import PIL import numpy as np -from PIL.Image import Image +from PIL import Image + +from dedoc.data_structures.line_with_meta import LineWithMeta def __to_pil(image: np.ndarray) -> Image: - return PIL.Image.fromarray(image) + return Image.fromarray(image) def __create_images_path(config: dict) -> None: @@ -30,7 +31,7 @@ def _get_images_path(config: dict, document_name: str) -> str: return images_path -def save_line_with_meta(lines: List["LineWithMeta"], original_document: str, *, config: dict) -> None: # noqa +def save_line_with_meta(lines: List[LineWithMeta], original_document: str, *, config: dict) -> None: __create_images_path(config) # merge lines with the same bbox @@ -45,7 +46,7 @@ def save_line_with_meta(lines: List["LineWithMeta"], original_document: str, *, out.write("\n") -def __postprocess_lines(lines: List["LineWithMeta"]) -> List["LineWithMeta"]: # noqa +def __postprocess_lines(lines: List[LineWithMeta]) -> List[LineWithMeta]: postprocessed_lines = [] prev_bbox = None for line in lines: diff --git a/dedoc/utils/utils.py b/dedoc/utils/utils.py index 8e745252..ec950f84 100644 --- a/dedoc/utils/utils.py +++ b/dedoc/utils/utils.py @@ -1,22 +1,12 @@ import datetime -import difflib -import gzip -import hashlib import json import mimetypes import os -import random import re import shutil import time from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple, TypeVar -import magic -import puremagic -import requests -from Levenshtein._levenshtein import ratio -from charset_normalizer import from_bytes -from dateutil.parser import parse from fastapi import UploadFile from dedoc.data_structures.document_content import DocumentContent @@ -106,6 +96,8 @@ def get_unique_name(filename: str) -> str: """ Return a unique name by template [timestamp]_[random number 0..1000][extension] """ + import random + _, ext = splitext_(filename) ts = int(time.time()) rnd = random.randint(0, 1000) @@ -145,6 +137,9 @@ def get_file_mime_type(path: str) -> str: def get_file_mime_by_content(path: str) -> str: + import magic + import puremagic + mime = magic.from_file(path, mime=True) if mime == "application/octet-stream": # for files with mime in {"image/x-sun-raster", "image/x-ms-bmp"} @@ -175,6 +170,8 @@ def special_match(strg: str, regular_pattern: str = r"[^.?!,:;'\"\n\r ]") -> boo def calculate_file_hash(path: str) -> str: + import hashlib + with open(path, "rb") as file: file_hash = hashlib.md5() chunk = file.read(8192) @@ -200,6 +197,9 @@ def get_encoding(path: str, default: str = None) -> Optional[str]: """ try to define encoding of the given file """ + import gzip + from charset_normalizer import from_bytes + try: if path.endswith(".gz"): with gzip.open(path, "r") as file: @@ -209,12 +209,14 @@ def get_encoding(path: str, default: str = None) -> Optional[str]: blob = file.read() dammit = from_bytes(blob) return dammit.best().encoding - except: # noqa ignore exception and return default encoding + except Exception: return default def similarity(s1: str, s2: str) -> float: """string similarity""" + import difflib + 
normalized1 = s1.lower() normalized2 = s2.lower() matcher = difflib.SequenceMatcher(None, normalized1, normalized2) @@ -222,6 +224,8 @@ def similarity(s1: str, s2: str) -> float: def similarity_levenshtein(str1: str, str2: str) -> float: + from Levenshtein._levenshtein import ratio + str1 = str1.lower() str2 = str2.lower() return ratio(str1, str2) @@ -233,6 +237,8 @@ def convert_datetime(time_string: str) -> int: :param time_str: string of time in ISO/IEC 8824 format (D:YYYYMMDDHHmmSSOHH'mm'). Example: "D:20210202145619+00'16'" :return: UnixTime (type: int) """ + from dateutil.parser import parse + # convert utc-part OHH'mm' into iso-format ±HHMM[SS[.ffffff]] 'D:20191028113639Z' # description of time-format can see # https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf, page 160 @@ -261,6 +267,8 @@ def check_filename_length(filename: str) -> str: def send_file(host: str, file_name: str, file_path: str, parameters: dict) -> Dict[str, Any]: + import requests + with open(file_path, "rb") as file: # file we want to parse files = {"file": (file_name, file)} diff --git a/docker-compose.yml b/docker-compose.yml index 85378db9..58d36ef2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,6 @@ services: mem_limit: 16G build: context: . - dockerfile: Dockerfile restart: always tty: true ports: @@ -22,7 +21,6 @@ services: - dedoc build: context: . - dockerfile: Dockerfile tty: true environment: DOC_READER_HOST: "dedoc" diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 2834f209..3d911775 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,19 @@ Changelog ========= +v2.2.4 (2024-06-20) +------------------- +Release note: `v2.2.4 `_ + +* Added page division and page numbers to the HTML output representation (API usage, return_format="html"). +* Made imports from the dedoc library faster. +* Added a tutorial on how to add a new language to dedoc (not finished yet). +* Added additional page_id metadata for multi-page nodes (structure_type="tree" in API, `TreeConstructor` in the library). +* Updated OCR and orientation/columns classification benchmarks. +* Minor edits of `README.md`. +* Fixed handling of empty cells in `CSVReader`. +* Fixed bounding box extraction for text in tables for `PdfTabbyReader`. + v2.2.3 (2024-06-05) ------------------- Release note: `v2.2.3 `_ diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index e380930d..c6d068b4 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -160,7 +160,7 @@ Below are the instructions for installing the package ``virtualenvwrapper``: Install trusted torch (verified version) ----------------------------------------------- +---------------------------------------- You can install a trusted library ``torch`` (as a verified version of the library, verified by tools developed by the Ivannikov Institute for System Programming of the Russian Academy of Sciences). diff --git a/docs/source/index.rst b/docs/source/index.rst index 779a6adb..0f2aed1d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -230,6 +230,7 @@ For a document of unknown or unsupported domain there is an option to use defaul tutorials/add_new_doc_format tutorials/add_new_structure_type tutorials/creating_document_classes + tutorials/add_new_language .. 
toctree:: diff --git a/docs/source/tutorials/add_new_language.rst b/docs/source/tutorials/add_new_language.rst new file mode 100644 index 00000000..94f5c66e --- /dev/null +++ b/docs/source/tutorials/add_new_language.rst @@ -0,0 +1,114 @@ +.. _add_language: + +Adding support for a new language to Dedoc +========================================== + +By default, dedoc supports handling Russian and English languages. +The most important part of language support is OCR (for images and PDF files). +If you don't need to parse images and PDF files, you don't need to do anything. + +To parse images with a new language, additional Tesseract language packages should be installed. +The list of languages supported by Tesseract is given `here `_ (see **Languages** section). + +.. seealso:: + Instructions for Tesseract installation can be found :ref:`here `. + +.. warning:: + Not all languages are fully supported by dedoc even with the Tesseract packages installed. More detailed information will appear soon. + + +Add new language in docker +-------------------------- + +Similar to the :ref:`installation tutorial `, one should first clone the dedoc repository and go to the `dedoc` directory: + +.. code-block:: bash + + git clone https://github.com/ispras/dedoc + cd dedoc + +Then one should decide which languages should be supported and look for them in the +`list of supported languages `_ (**Languages** section). +Each language is configured by its ``LangCode``. +For example, if we need to add French and Spanish, we should use the ``fra`` and ``spa`` language codes. + + +Using docker build +****************** + +To pass the list of languages while building the docker image, the ``LANGUAGES`` build argument is used. +Languages should be listed in a single string separated by spaces. +For example, to add French and Spanish we should use the following command: + +.. code-block:: bash + + docker build --build-arg LANGUAGES="fra spa" . + +One may also choose a tag for an image, e.g. ``dedocproject/dedoc_multilang:latest``, and run the container: + +.. code-block:: bash + + docker build -t dedocproject/dedoc_multilang:latest --build-arg LANGUAGES="fra spa" . + docker run -p 1231:1231 --rm dedocproject/dedoc_multilang python3 /dedoc_root/dedoc/main.py + + +Using docker-compose +******************** + +To pass the list of languages while building the docker image, the ``LANGUAGES`` argument is set in the ``docker-compose.yml`` file. +Languages should be listed in a single string separated by spaces. +For example, to add French and Spanish we should add the following lines to the ``docker-compose.yml`` file: + +.. code-block:: yaml + :emphasize-lines: 8-9 + + version: '2.4' + + services: + dedoc: + mem_limit: 16G + build: + context: . + args: + LANGUAGES: "fra spa" + restart: always + tty: true + ports: + - 1231:1231 + environment: + DOCREADER_PORT: 1231 + GROBID_HOST: "grobid" + GROBID_PORT: 8070 + +Then, the service can be run with the following command: + +.. code-block:: bash + + docker-compose up --build + + +Add new language locally +------------------------ + +Suppose Tesseract OCR 5 is already installed on the computer (otherwise see the :ref:`instruction `). +For each language, the following command should be executed (``lang`` is the language code): + +.. code-block:: bash + + apt install -y tesseract-ocr-$lang + +For example, to add French and Spanish we should use the following commands: + +.. 
code-block:: bash + + apt install -y tesseract-ocr-fra + apt install -y tesseract-ocr-spa + +Or we can install all packages with one command using the ``LANGUAGES`` variable: + +.. code-block:: bash + + export LANGUAGES="fra spa" + for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done + +Then the dedoc library can be used with the new languages, or the dedoc API can be run locally (see the :ref:`instruction ` for more details). diff --git a/labeling/tests/test_images_creators.py b/labeling/tests/test_images_creators.py index 8b37ba3d..fd2cb1b7 100644 --- a/labeling/tests/test_images_creators.py +++ b/labeling/tests/test_images_creators.py @@ -12,8 +12,10 @@ from dedoc.dedoc_manager import DedocManager from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition -from dedoc.readers import PdfImageReader, PdfTabbyReader, PdfTxtlayerReader from dedoc.readers.docx_reader.docx_reader import DocxReader +from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader from dedoc.readers.reader_composition import ReaderComposition from dedoc.readers.txt_reader.raw_text_reader import RawTextReader from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor diff --git a/labeling/train_dataset/api/api.py b/labeling/train_dataset/api/api.py index 917c9b65..14228f40 100644 --- a/labeling/train_dataset/api/api.py +++ b/labeling/train_dataset/api/api.py @@ -149,7 +149,7 @@ def handle_archive() -> Response: @app.post("/upload_archive") -def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParameters = Depends()) -> Response: # noqa +def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParameters = Depends()) -> Response: """ Run the whole pipeline of task making. 
""" diff --git a/pyproject.toml b/pyproject.toml index f0f8754c..307b4797 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,10 +52,11 @@ lint = [ "flake8-annotations==2.9.1", "flake8-bugbear==23.3.12", "flake8-builtins==2.1.0", - "flake8-fill-one-line>=0.4.0", + "flake8-fill-one-line>=0.4.2", "flake8-import-order==0.18.2", "flake8-multiline-containers==0.0.19", "flake8-print==5.0.0", + "flake8-tidy-imports==4.10.0", "flake8-quotes==3.3.2", "flake8-use-fstring==1.4", "pycodestyle==2.9.0", diff --git a/resources/benchmarks/orient_classifier_scores.txt b/resources/benchmarks/orient_classifier_scores.txt new file mode 100644 index 00000000..9fe55d01 --- /dev/null +++ b/resources/benchmarks/orient_classifier_scores.txt @@ -0,0 +1,25 @@ + +Orientation predictions: ++-------+-----------+--------+-------+-------+ +| Class | Precision | Recall | F1 | Count | ++=======+===========+========+=======+=======+ +| 0 | 0.998 | 1 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| 90 | 1 | 0.998 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| 180 | 1 | 0.998 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| 270 | 0.998 | 1 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| AVG | 0.999 | 0.999 | 0.999 | None | ++-------+-----------+--------+-------+-------+ +Column predictions: ++-------+-----------+--------+-------+-------+ +| Class | Precision | Recall | F1 | Count | ++=======+===========+========+=======+=======+ +| 1 | 1 | 0.999 | 0.999 | 1692 | ++-------+-----------+--------+-------+-------+ +| 2 | 0.996 | 1 | 0.998 | 456 | ++-------+-----------+--------+-------+-------+ +| AVG | 0.999 | 0.999 | 0.999 | None | ++-------+-----------+--------+-------+-------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark.txt b/resources/benchmarks/tesseract_benchmark.txt deleted file mode 100644 index fd980a45..00000000 --- a/resources/benchmarks/tesseract_benchmark.txt +++ /dev/null @@ -1,256 +0,0 @@ -Tesseract version is 5.0.0 -Table 1 - Accuracy for each file -+---------------+---------------------+-------+-----------------+--------------+ -| Dataset | Image name | --psm | Amount of words | Accuracy OCR | -+===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 85.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words2 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 525 | 83.800 | -| | oga_00 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 241 | 88.800 | -| | oga_01 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 86.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 99.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 696 | 99.700 | 
-+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 699 | 99.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 95.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_01 | 4 | 332 | 99.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | law_image | 4 | 182 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 97.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 99.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 95.700 | -| | 0 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 99.600 | -| | 1 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 165 | 98.800 | -| | 2 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.600 | -| | 3 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 97.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 98.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_02 | 4 | 309 | 98.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_03 | 4 | 337 | 98.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_04 | 4 | 257 | 96.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_05 | 4 | 238 | 98.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_06 | 4 | 219 | 93.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 98.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 97.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 97.500 | -+---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | -| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | -| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | -| | s | ols | | ars | bols | | | | -+========+========+========+========+========+========+========+=======+=======+ -| englis | 100 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 
97.06 | -| h- | | | | | | | | 0 | -| words | | | | | | | | | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.967 | 77.400 | 89.533 | 0 | 0 | 86.433 | 890 | 86.23 | -| | | | | | | | | 3 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 99.268 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.39 | -| | | | | | | | | 6 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ - -Table 3 -OCR error by symbol: -+--------+---------------------------------------------------------------------+ -| Symbol | Cnt Errors & Correct-Generated | -+========+=====================================================================+ -| | ['3 & -> ', '2 & < 6> -> <б>', '2 & < > -> <__>', "2 & | -| | <1 > -> <'>", '2 & <и > -> <н>'] | -+--------+---------------------------------------------------------------------+ -| . | ['5 & <.> -> <,>', '3 & <3.> -> < De>', '3 & -> ', '2 & | -| | <6.> -> ', '2 & <г.> -> <Г>'] | -+--------+---------------------------------------------------------------------+ -| , | ['66 & <,> -> <.>', '3 & <ва,> -> <нь>'] | -+--------+---------------------------------------------------------------------+ -| 1 | ['6 & <1> -> <|>', '4 & <1С> -> ', "3 & <1> -> <'>", '3 & <№1> | -| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | -| | ', '2 & <1C> -> <С>', '2 & <1> -> ', '1 & <1> -> <Г>', '1 & | -| | <1> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| е | ['6 & <е> -> <с>', '2 & <не> -> ', '2 & <ре> -> <с>', '1 & <е> | -| | -> <а>'] | -+--------+---------------------------------------------------------------------+ -| н | ['2 & <н> -> <и>', '2 & <не> -> ', '1 & <н> -> <й>', '1 & <н> | -| | -> <п>'] | -+--------+---------------------------------------------------------------------+ -| и | ['3 & <ти> -> < TH>', '3 & <тип> -> ', '2 & <и > -> <н>', '2 & | -| | <ис> -> <не>'] | -+--------+---------------------------------------------------------------------+ -| а | ['3 & <ва,> -> <нь>'] | -+--------+---------------------------------------------------------------------+ -| о | ['2 & <то> -> ', '1 & <о> -> <0>'] | -+--------+---------------------------------------------------------------------+ -| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ти> -> < TH>', '3 & | -| | <тип> -> ', '2 & <то> -> '] | -+--------+---------------------------------------------------------------------+ -| 2 | ['2 & <28> -> ', '2 & <28> -> <ИР>', '2 & <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| л | ['2 & <л> -> <п>'] | -+--------+---------------------------------------------------------------------+ -| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '4 & <ОС> -> ', '3 & | -| | <С> -> ', '2 & <СА> -> ', '1 & <С> -> <—>'] | -+--------+---------------------------------------------------------------------+ -| 3 | ['3 & <3.> -> < De>', '1 & <3> -> '] | -+--------+---------------------------------------------------------------------+ -| г | ['2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & <г> -> <т>', '2 & <гр> | -| | -> ', '2 & <гр> -> <тв>'] | -+--------+---------------------------------------------------------------------+ -| N | ['22 & -> <М>'] | -+--------+---------------------------------------------------------------------+ -| в | ['3 & <ва,> -> <нь>', '1 & <в> -> <В>', '1 & <в> -> <п>'] | -+--------+---------------------------------------------------------------------+ -| р | ['2 & <гр> -> ', '2 & <гр> -> 
<тв>', '2 & <ре> -> <с>'] | -+--------+---------------------------------------------------------------------+ -| Н | ['6 & <Н> -> <* П>', '6 & <Н> -> <° >', '3 & <Н> -> <¢ П>', '2 & | -| | <ЕН> -> <ек>', '2 & <Н> -> <. >', '2 & <Н> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| с | ['2 & <ис> -> <не>', '1 & <с> -> ', '1 & <с> -> <©>', '1 & <с> | -| | -> <е>'] | -+--------+---------------------------------------------------------------------+ -| А | ['2 & <СА> -> '] | -+--------+---------------------------------------------------------------------+ -| И | ['3 & <И> -> ', '1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> | -| | -> <П>'] | -+--------+---------------------------------------------------------------------+ -| д | ['3 & <д> -> <л>'] | -+--------+---------------------------------------------------------------------+ -| Е | ['2 & <ЕН> -> <ек>'] | -+--------+---------------------------------------------------------------------+ -| О | ['4 & <ОС> -> ', '2 & <ВО> -> <Ю>', '2 & <Об> -> <06>', '1 & | -| | <О> -> <о>'] | -+--------+---------------------------------------------------------------------+ -| П | ['1 & <П> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| Т | ['4 & <Т> -> <Г>', '3 & <МРТ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| п | ['3 & <тип> -> ', '2 & <п> -> <и>', '2 & <п> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| В | ['6 & <СЗВ> -> ', '2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | -+--------+---------------------------------------------------------------------+ -| 0 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| - | ['3 & <-> -> <=>', '1 & <-> -> <|>'] | -+--------+---------------------------------------------------------------------+ -| 6 | ['3 & <608> -> ', '2 & < 6> -> <б>', '2 & <6.> -> '] | -+--------+---------------------------------------------------------------------+ -| I | ['3 & -> ', '3 & -> <Ш>', '3 & -> <УП>', '1 | -| | & -> <|>'] | -+--------+---------------------------------------------------------------------+ -| М | ['3 & <МРТ> -> '] | -+--------+---------------------------------------------------------------------+ -| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| б | ['2 & <Об> -> <06>'] | -+--------+---------------------------------------------------------------------+ -| 5 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| ; | ['8 & <;> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| ь | ['2 & <ь> -> < Ь>'] | -+--------+---------------------------------------------------------------------+ -| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | -| | <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| E | ['6 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | -| | '2 & <ВЗ> -> <Ръ>'] | -+--------+---------------------------------------------------------------------+ -| 7 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| ц | ['1 & <ц> -> <щ>'] | 
-+--------+---------------------------------------------------------------------+ -| ч | ['1 & <ч> -> <з>'] | -+--------+---------------------------------------------------------------------+ -| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| Б | ['3 & <БЗ> -> <653>'] | -+--------+---------------------------------------------------------------------+ -| Д | ['1 & <Д> -> <З>'] | -+--------+---------------------------------------------------------------------+ -| й | ['1 & <й> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| Ц | ['1 & <Ц> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| P | ['6 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| R | ['6 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| a | ['4 & -> <на>', '1 & -> <а>'] | -+--------+---------------------------------------------------------------------+ -| G | ['2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| H | ['4 & -> <на>'] | -+--------+---------------------------------------------------------------------+ -| V | ['3 & -> <УП>'] | -+--------+---------------------------------------------------------------------+ -| m | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| | | ['1 & <|> -> <1>'] | -+--------+---------------------------------------------------------------------+ -| № | ['3 & <№1> -> ', '3 & <№1»> -> '] | -+--------+---------------------------------------------------------------------+ -| Ю | ['2 & <Ю> -> <1О>'] | -+--------+---------------------------------------------------------------------+ -| Y | ['1 & -> <У>'] | -+--------+---------------------------------------------------------------------+ -| _ | ['1 & <_> -> < >'] | -+--------+---------------------------------------------------------------------+ -| c | ['1 & -> <с>'] | -+--------+---------------------------------------------------------------------+ -| d | ['1 & -> <4>'] | -+--------+---------------------------------------------------------------------+ -| o | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| y | ['1 & -> <у>'] | -+--------+---------------------------------------------------------------------+ -| » | ['3 & <№1»> -> '] | -+--------+---------------------------------------------------------------------+ -| щ | ['1 & <щ> -> <ш>'] | -+--------+---------------------------------------------------------------------+ -| ‚ | ['2 & <‚> -> <_,>'] | -+--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt b/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt new file mode 100644 index 00000000..1cc5782f --- /dev/null +++ b/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt @@ -0,0 +1,473 @@ +Tesseract version is 5.0.0 +Correction step: Correction.SAGE_CORRECTION + +Table 1 - Accuracy for each file ++---------------+----------------+--------------+---------------+--------------+ +| Dataset | Image name | OCR language | Amount of | Accuracy OCR | +| | | | words | | ++===============+================+==============+===============+==============+ +| 
english-words | Kaspersky | rus+eng | 111 | 99.300 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | USB | rus+eng | 4 | 0 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words1 | rus+eng | 19 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words2 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words3 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| low_quality | VKR_5 | rus | 68 | 50.700 | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 525 | 83.200 | +| | evrologa_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 241 | 87.100 | +| | evrologa_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | napalm_doc_2_2 | rus | 124 | 85.100 | +| | _6 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 301 | 99.300 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 230 | 97.400 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-3 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 695 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 696 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 699 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 155 | 88.700 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 266 | 97.700 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 307 | 95.800 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 343 | 96.900 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-04 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 262 | 98.100 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-001 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 236 | 92.300 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-002 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 188 | 92.200 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 59 | 94.900 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | 
rus | 151 | 98.700 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 243 | 97.800 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 322 | 97.900 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | LAW_11 | rus | 194 | 88.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_11 | rus | 76 | 94.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_21 | rus | 61 | 97.500 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_22 | rus | 278 | 98.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_23 | rus | 277 | 98 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_24 | rus | 288 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_25 | rus | 347 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_26 | rus | 192 | 99 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_27 | rus | 173 | 98 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_28 | rus | 133 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_29 | rus | 182 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_30 | rus | 178 | 98.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_31 | rus | 37 | 97.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_32 | rus | 221 | 99.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_33 | rus | 312 | 95.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_34 | rus | 83 | 92.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_35 | rus | 355 | 97.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_1 | rus | 86 | 99.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_2 | rus | 87 | 98.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_3 | rus | 89 | 95.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_4 | rus | 89 | 90.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_6 | rus | 117 | 99.100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | article_multil | rus | 471 | 99.900 | +| | ine | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_00 | rus | 192 | 92.900 | 
++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_01 | rus | 332 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | law_image | rus | 182 | 99.100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | napalm_doc_13_ | rus | 243 | 96.900 | +| | 2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_00 | rus | 287 | 99 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_01 | rus | 340 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 146 | 94.700 | +| | ons_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 276 | 98.600 | +| | ons_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 93 | 99 | +| | ons_02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 54 | 99.600 | +| | ons_03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_00 | rus | 78 | 96.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_01 | rus | 296 | 96.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_02 | rus | 309 | 98.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_03 | rus | 337 | 96.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_04 | rus | 257 | 77.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_05 | rus | 238 | 97.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_06 | rus | 219 | 94.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_07 | rus | 233 | 95.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_08 | rus | 284 | 98.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_09 | rus | 154 | 95.900 | ++---------------+----------------+--------------+---------------+--------------+ + +Table 2 - AVG by each type of symbols: ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | +| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | +| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | +| | s | ols | | ars | bols | | | | ++========+========+========+========+========+========+========+=======+=======+ +| englis | 79.820 | 66 | 50 | 0 | 0 | 80 | 152 | 79.86 | +| h- | | | | | | | | 0 | +| words | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| low_qu | 92.600 | 60 | 46.100 | 0 | 0 | 78.200 | 68 | 50.70 | +| ality | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| others | 89.933 | 76.967 | 87.167 | 0 | 0 | 87.100 | 890 | 85.13 | +| | | | | | | 
| | 3 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| tz- | 97.920 | 91.678 | 94.608 | 0 | 0 | 99.100 | 14029 | 96.77 | +| npa- | | | | | | | | 3 | +| vkr | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['8 & <№ > -> ', '4 & <* > -> <4.>', '3 & <— 1> -> <19>', '3 & | +| | <— П> -> <И>', '2 & < 3> -> <З>', '2 & < г> -> <.>', '2 & < г> -> | +| | <К>', '2 & < г> -> <т>', '2 & < п> -> <тн>', '2 & < —> -> <0>', '2 | +| | & < ‚> -> <,>', "2 & <1 > -> <'>", '2 & <8 > -> <Р>', '2 & <; > -> | +| | <.>', '2 & <и > -> <н>', '2 & <й > -> <ст>', '1 & < > -> <(>', '1 & | +| | < > -> '] | ++--------+---------------------------------------------------------------------+ +| . | ['10 & <.> -> <,>', '3 & <3.1> -> <ЗЛА>', '3 & <3Г.> -> <5>', '2 & | +| | <8.> -> <$>', '2 & <8.> -> <5>', '2 & -> <9>', '2 & <г.> -> | +| | <ГТ>', '1 & <.> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| — | ['7 & <—> -> <->', '3 & <— 1> -> <19>', '3 & <— П> -> <И>', '2 & < | +| | —> -> <0>'] | ++--------+---------------------------------------------------------------------+ +| № | ['170 & <№> -> ', '8 & <№ > -> ', '3 & <№17> -> <ДК>', '3 & | +| | <№> -> ', '3 & <№> -> ', '1 & <№> -> <и>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['4 & <1C> -> ', '4 & <1> -> <3>', "3 & <1> -> <'>", '3 & <3.1> | +| | -> <ЗЛА>', '3 & <31"> -> < А>', '3 & <— 1> -> <19>', '3 & <№17> -> | +| | <ДК>', "2 & <1 > -> <'>", '2 & <11> -> <И>', '2 & <1C> -> <С>', '2 | +| | & <1> -> <5>', '1 & <1> -> ', '1 & <1> -> <(>', '1 & <1> -> | +| | <2>', '1 & <1> -> <4>', '1 & <1> -> <Г>', '1 & <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| , | ['69 & <,> -> <.>', '3 & <ва,> -> <нь>', '2 & <Ш,> -> <П.>', '1 & | +| | <,> -> <;>'] | ++--------+---------------------------------------------------------------------+ +| е | ['2 & <е-> -> <ав>', '2 & <е-> -> <им>', '2 & <е-> -> <уд>', '2 & | +| | <е> -> <га>', '2 & <е> -> <и>', '2 & <е> -> <ё>', '2 & <ле> -> | +| | <У>', '1 & <е> -> <й>', '1 & <е> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| и | ['3 & <и> -> <е>', '2 & <и > -> <н>', '2 & <и> -> <И>', '2 & <из> | +| | -> <по>', '2 & <ис> -> <не>', '2 & <си> -> <ен>', '1 & <и> -> <В>', | +| | '1 & <и> -> <Н>', '1 & <и> -> <а>', '1 & <и> -> <н>', '1 & <и> -> | +| | <ь>'] | ++--------+---------------------------------------------------------------------+ +| а | ['8 & <а> -> <о>', '3 & <ва,> -> <нь>', '2 & <ав> -> <ыс>', '2 & | +| | <па> -> <те>', '1 & <а> -> <Б>', '1 & <а> -> <б>', '1 & <а> -> | +| | <у>'] | ++--------+---------------------------------------------------------------------+ +| н | ['3 & <льн> -> <з>', '3 & <нс> -> <эро>', '2 & <н> -> <п>', '1 & | +| | <н> -> <й>'] | ++--------+---------------------------------------------------------------------+ +| о | ['4 & <по> -> <на>', '3 & <фок> -> <М>', '2 & <о-> -> <ым>', '2 & | +| | <от> -> <и>', '1 & <о> -> <у>', '1 & <о> -> <я>'] | ++--------+---------------------------------------------------------------------+ +| т | ['2 & <от> -> <и>', '2 & <рт> -> <й>', '2 & <т> -> < >', '2 & <т> | +| | 
-> <1>', '2 & <т> -> <Д>', '2 & <т> -> <г>', '2 & <т> -> <ин>', '2 | +| | & <эт> -> <Юг>', '1 & <т> -> <б>', '1 & <т> -> <л>', '1 & <т> -> | +| | <н>'] | ++--------+---------------------------------------------------------------------+ +| - | ['6 & <-> -> <мы>', '4 & <-> -> <го>', '3 & <-> -> < и >', '3 & <-> | +| | -> <м>', '3 & <-> -> <ния>', '3 & <-> -> <тов>', '3 & <-> -> | +| | <тых>', '2 & <-> -> <»>', '2 & <-> -> <ия>', '2 & <-> -> <ки>', '2 | +| | & <-> -> <ли>', '2 & <-> -> <ма>', '2 & <-> -> <мо>', '2 & <-> -> | +| | <ры>', '2 & <-> -> <сы>', '2 & <-> -> <ы>', '2 & <е-> -> <ав>', '2 | +| | & <е-> -> <им>', '2 & <е-> -> <уд>', '2 & <о-> -> <ым>', '1 & <-> | +| | -> <ь>'] | ++--------+---------------------------------------------------------------------+ +| р | ['3 & <гр> -> <тав>', '2 & <р> -> <ол>', '2 & <рт> -> <й>', '2 & | +| | <эр> -> <ци>', '1 & <р> -> <Р>', '1 & <р> -> <д>'] | ++--------+---------------------------------------------------------------------+ +| с | ['3 & <нс> -> <эро>', '2 & <(с> -> <С>', '2 & <ис> -> <не>', '2 & | +| | <с> -> <ез>', '2 & <с> -> <ец>', '2 & <си> -> <ен>', '1 & <с> -> | +| | <е>'] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.1> -> <ЗЛА>', '3 & <31"> -> < А>', '3 & <3Г.> -> <5>', '2 | +| | & < 3> -> <З>', '1 & <3> -> <1>', '1 & <3> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| И | ['2 & <И> -> <АН>', '1 & <И> -> <В>', '1 & <И> -> <Й>', '1 & <И> -> | +| | <Н>'] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> <ИР>', '2 & <28> -> <Я>', '1 & <2> -> <1>', '1 & <2> | +| | -> <3>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <ва,> -> <нь>', '2 & <ав> -> <ыс>', '2 & <в> -> <по>', '1 & | +| | <в> -> <м>'] | ++--------+---------------------------------------------------------------------+ +| л | ['3 & <льн> -> <з>', '2 & <ле> -> <У>', '1 & <л> -> <Д>', '1 & <л> | +| | -> <Л>', '1 & <л> -> <д>', '1 & <л> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['1 & <6> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| г | ['3 & <гр> -> <тав>', '2 & < г> -> <.>', '2 & < г> -> <К>', '2 & < | +| | г> -> <т>', '2 & <г.> -> <ГТ>', '2 & <г> -> <т>', '1 & <г> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| А | ['3 & <МАЯ> -> <сам>'] | ++--------+---------------------------------------------------------------------+ +| д | ['1 & <д> -> <з>', '1 & <д> -> <л>', '1 & <д> -> <п>', '1 & <д> -> | +| | <ц>'] | ++--------+---------------------------------------------------------------------+ +| E | ['36 & -> <ЕВР>', '6 & -> <ЕКР>', '6 & -> <УЕВ>', | +| | '3 & -> <ЕЕР>', '1 & -> <Е>'] | ++--------+---------------------------------------------------------------------+ +| О | ['2 & <О> -> <СЯ>', '2 & <ПО> -> <по>', '1 & <О> -> <Ю>', '1 & <О> | +| | -> <о>'] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <б> -> <6>', '1 & <б> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['2 & <Н> -> <ЕМ>', '1 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| N | ['23 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕМ> -> <Ш>', '1 & 
<Е> -> <в>'] | ++--------+---------------------------------------------------------------------+ +| 4 | ['1 & <4> -> <6>', '1 & <4> -> <7>'] | ++--------+---------------------------------------------------------------------+ +| у | ['5 & <у> -> <ы>'] | ++--------+---------------------------------------------------------------------+ +| п | ['4 & <по> -> <на>', '2 & < п> -> <тн>', '2 & <п> -> <нн>', '2 & | +| | <па> -> <те>', '1 & <п> -> <к>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['3 & <Т> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| P | ['36 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| Р | ['1 & <Р> -> <р>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['2 & <08> -> <9Ф>'] | ++--------+---------------------------------------------------------------------+ +| R | ['36 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| м | ['3 & <мы> -> <«СП>', '2 & <мы> -> <ру>', '1 & <м> -> <й>'] | ++--------+---------------------------------------------------------------------+ +| ы | ['3 & <мы> -> <«СП>', '2 & <мы> -> <ру>', '1 & <ы> -> <а>', '1 & | +| | <ы> -> <б>'] | ++--------+---------------------------------------------------------------------+ +| я | ['2 & <яз> -> <л>', '1 & <я> -> <а>'] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> <Ш>', '3 & -> <ТТХ>', '3 & -> <УП>', '2 & | +| | -> <1>', '2 & -> <”>'] | ++--------+---------------------------------------------------------------------+ +| C | ['7 & -> <С>', '6 & -> <С.>', '4 & <1C> -> ', '2 & | +| | <1C> -> <С>', '2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| й | ['3 & <й> -> <е>', '2 & <й > -> <ст>', '2 & <й> -> <го>', '2 & <й> | +| | -> <е >', '2 & <й> -> <е:>'] | ++--------+---------------------------------------------------------------------+ +| П | ['3 & <П> -> <ИР >', '3 & <— П> -> <И>', '2 & <ПО> -> <по>', '1 & | +| | <П> -> <И>', '1 & <П> -> <К>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['3 & <№17> -> <ДК>', '1 & <7> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МАЯ> -> <сам>', '2 & <ЕМ> -> <Ш>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['2 & <08> -> <9Ф>', '2 & <28> -> <ИР>', '2 & <28> -> <Я>', '2 & <8 | +| | > -> <Р>', '2 & <8.> -> <$>', '2 & <8.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| r | ['2 & -> <Ка>', '2 & -> <ки>', '2 & -> <н>', '1 & | +| | -> <г>'] | ++--------+---------------------------------------------------------------------+ +| ь | ['3 & <льн> -> <з>', '1 & <ь> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| o | ['3 & <(no> -> <по>', '3 & -> <2о>'] | ++--------+---------------------------------------------------------------------+ +| u | ['2 & -> <Ка>', '2 & -> <ки>', '2 & -> <н>'] | ++--------+---------------------------------------------------------------------+ +| з | ['2 & <из> -> <по>', '2 & <яз> -> <л>', '1 & <з> -> <3>'] | ++--------+---------------------------------------------------------------------+ +| к | ['3 & <фок> -> <М>', '1 & <к> -> <1>', '1 & <к> -> <с>'] | 
++--------+---------------------------------------------------------------------+ +| : | ['6 & -> <С.>', '5 & <:> -> <.>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['9 & <;> -> <:>', '2 & <; > -> <.>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['1 & <ч> -> <д>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '2 & -> <На>', '2 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| В | ['2 & <ВЗ> -> <РИ>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['1 & <ц> -> <«>', '1 & <ц> -> <С>', '1 & <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> <Х>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['2 & <БЗ> -> <53>'] | ++--------+---------------------------------------------------------------------+ +| w | ['3 & -> ', '3 & -> <ув>'] | ++--------+---------------------------------------------------------------------+ +| d | ['3 & -> <рар>', '1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| e | ['2 & -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| O | ['2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| З | ['2 & <БЗ> -> <53>', '2 & <ВЗ> -> <РИ>', '1 & <З> -> <У>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| Я | ['3 & <МАЯ> -> <сам>'] | ++--------+---------------------------------------------------------------------+ +| " | ['3 & <31"> -> < А>', '2 & <""> -> <с>'] | ++--------+---------------------------------------------------------------------+ +| D | ['3 & -> <ЕЕР>', '2 & -> <П>'] | ++--------+---------------------------------------------------------------------+ +| f | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| ( | ['3 & <(no> -> <по>', '2 & <(с> -> <С>'] | ++--------+---------------------------------------------------------------------+ +| A | ['2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>', '3 & -> <КНМ>', '2 & -> <На>', '2 & | +| | -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>', '2 & -> <”>'] | ++--------+---------------------------------------------------------------------+ +| b | ['1 & -> <Ь>'] | ++--------+---------------------------------------------------------------------+ +| g | ['3 & -> <2о>'] | ++--------+---------------------------------------------------------------------+ +| n | ['3 & <(no> -> <по>'] | ++--------+---------------------------------------------------------------------+ +| p | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| Г | ['3 & <3Г.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> 
-> <1 >'] | ++--------+---------------------------------------------------------------------+ +| ш | ['1 & <ш> -> <с>', '1 & <ш> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| * | ['4 & <* > -> <4.>'] | ++--------+---------------------------------------------------------------------+ +| B | ['6 & -> <УЕВ>'] | ++--------+---------------------------------------------------------------------+ +| F | ['3 & -> <ЕЕР>'] | ++--------+---------------------------------------------------------------------+ +| S | ['2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| э | ['2 & <эр> -> <ци>', '2 & <эт> -> <Юг>'] | ++--------+---------------------------------------------------------------------+ +| ю | ['1 & <ю> -> <у>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| M | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| c | ['2 & -> <9>'] | ++--------+---------------------------------------------------------------------+ +| v | ['3 & -> <2о>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <»> -> <22%>'] | ++--------+---------------------------------------------------------------------+ +| Х | ['1 & <Х> -> <Д>'] | ++--------+---------------------------------------------------------------------+ +| Ш | ['2 & <Ш,> -> <П.>'] | ++--------+---------------------------------------------------------------------+ +| ф | ['3 & <фок> -> <М>'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & < ‚> -> <,>'] | ++--------+---------------------------------------------------------------------+ +| L | ['2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| W | ['6 & -> <УЕВ>'] | ++--------+---------------------------------------------------------------------+ +| X | ['3 & -> <ТТХ>', '2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| y | ['2 & -> <П>', '1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| K | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| ₁ | ['1 & <₁> -> <1>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt b/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt new file mode 100644 index 00000000..e4b6a15a --- /dev/null +++ b/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt @@ -0,0 +1,443 @@ +Tesseract version is 5.0.0 +Correction step: Correction.WITHOUT_CORRECTION + +Table 1 - Accuracy for each file ++---------------+----------------+--------------+---------------+--------------+ +| Dataset | Image name | OCR language | Amount of | Accuracy OCR | +| | | | words | | ++===============+================+==============+===============+==============+ +| english-words | Kaspersky | rus+eng | 111 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | USB | rus+eng | 4 
| 0 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words1 | rus+eng | 19 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words2 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words3 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| low_quality | VKR_5 | rus | 68 | 51.600 | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 525 | 83.800 | +| | evrologa_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 241 | 88.600 | +| | evrologa_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | napalm_doc_2_2 | rus | 124 | 86.300 | +| | _6 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 301 | 99.600 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 230 | 97.700 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-3 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 695 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 696 | 99.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 699 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 155 | 92.500 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 266 | 99.300 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 307 | 97.400 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 343 | 99.600 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-04 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 262 | 99.900 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-001 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 236 | 94.100 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-002 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 188 | 95.100 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 59 | 95.200 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 151 | 99.400 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 
6d3e9329-9716- | rus | 243 | 98.100 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 322 | 98.700 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | LAW_11 | rus | 194 | 91.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_11 | rus | 76 | 95 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_21 | rus | 61 | 98.500 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_22 | rus | 278 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_23 | rus | 277 | 98 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_24 | rus | 288 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_25 | rus | 347 | 99.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_26 | rus | 192 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_27 | rus | 173 | 98.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_28 | rus | 133 | 99.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_29 | rus | 182 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_30 | rus | 178 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_31 | rus | 37 | 99.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_32 | rus | 221 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_33 | rus | 312 | 96.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_34 | rus | 83 | 92.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_35 | rus | 355 | 98.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_1 | rus | 86 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_2 | rus | 87 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_3 | rus | 89 | 95.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_4 | rus | 89 | 91.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_6 | rus | 117 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | article_multil | rus | 471 | 100 | +| | ine | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_00 | rus | 192 | 95.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_01 | rus | 332 | 99.500 | ++---------------+----------------+--------------+---------------+--------------+ +| 
tz-npa-vkr | law_image | rus | 182 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | napalm_doc_13_ | rus | 243 | 97.400 | +| | 2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_00 | rus | 287 | 99 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_01 | rus | 340 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 146 | 96.100 | +| | ons_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 276 | 99.400 | +| | ons_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 93 | 99.300 | +| | ons_02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 54 | 99.800 | +| | ons_03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_00 | rus | 78 | 97.100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_01 | rus | 296 | 97 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_02 | rus | 309 | 98.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_03 | rus | 337 | 97.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_04 | rus | 257 | 78.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_05 | rus | 238 | 98.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_06 | rus | 219 | 95.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_07 | rus | 233 | 95.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_08 | rus | 284 | 98.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_09 | rus | 154 | 95.700 | ++---------------+----------------+--------------+---------------+--------------+ + +Table 2 - AVG by each type of symbols: ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | +| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | +| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | +| | s | ols | | ars | bols | | | | ++========+========+========+========+========+========+========+=======+=======+ +| englis | 80 | 66 | 50 | 0 | 0 | 80 | 152 | 79.92 | +| h- | | | | | | | | 0 | +| words | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| low_qu | 89.700 | 70 | 46.100 | 0 | 0 | 75.900 | 68 | 51.60 | +| ality | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| others | 90.833 | 77.267 | 87.167 | 0 | 0 | 87.100 | 890 | 86.23 | +| | | | | | | | | 3 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| tz- | 98.292 | 93.183 | 94.602 | 0 | 0 | 99.164 | 14029 | 97.54 | +| npa- | | | | | | | | 1 | +| vkr | | | | | | | | | 
++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['4 & <* > -> <.>', '3 & <(с > -> <С>', '3 & <— 1> -> <19>', '3 & | +| | <— П> -> <И>', '3 & <— н> -> <и>', '2 & < 6> -> <б>', '2 & < > -> | +| | <__>', '2 & < г> -> <.т>', '2 & < г> -> <т>', "2 & <1 > -> <'>", '2 | +| | & <8 > -> <Р>', '2 & <; > -> <.>', '2 & -> <№>', '2 & <е > -> | +| | <в>', '2 & <и > -> <н>', '2 & <й > -> <ст>', '1 & < > -> <_>'] | ++--------+---------------------------------------------------------------------+ +| . | ['10 & <.> -> <,>', '3 & <3.1> -> <ЗЛА>', '3 & <3Г.> -> <5>', '2 & | +| | <.3> -> < >', '2 & <1.> -> <„>', '2 & <8.> -> <$>', '2 & <8.> -> | +| | <5>', '2 & -> <|9>', '2 & <г.> -> <Г>', '1 & <.> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| — | ['8 & <—> -> <_>', '7 & <—> -> <->', '4 & <—> -> <=>', '3 & <— 1> | +| | -> <19>', '3 & <— П> -> <И>', '3 & <— н> -> <и>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['4 & <1C> -> ', "3 & <1> -> <'>", '3 & <1> -> <|>', '3 & <3.1> | +| | -> <ЗЛА>', '3 & <— 1> -> <19>', "2 & <1 > -> <'>", '2 & <1.> -> | +| | <„>', '2 & <11> -> <И>', '2 & <1C> -> <С>', '2 & <1> -> <[>', '1 & | +| | <1> -> ', '1 & <1> -> <(>', '1 & <1> -> <4>', '1 & <1> -> <\\>', | +| | '1 & <1> -> <Г>', '1 & <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| , | ['69 & <,> -> <.>', '3 & <ва,> -> <нь>', '2 & <Ш,> -> <П.>', '1 & | +| | <,> -> <;>', '1 & <,> -> <‚>'] | ++--------+---------------------------------------------------------------------+ +| е | ['5 & <е> -> <с>', '3 & <ект> -> <тн>', '2 & <е > -> <в>', '2 & | +| | <е-> -> <.>', '2 & <е> -> <а>', '2 & <ем> -> <вы>', '2 & <ен> -> | +| | <ая>', '2 & <ле> -> <ыи>'] | ++--------+---------------------------------------------------------------------+ +| н | ['3 & <на-> -> ', '3 & <— н> -> <и>', '2 & <ен> -> <ая>', '2 & | +| | <н> -> <и>', '1 & <н> -> <в>', '1 & <н> -> <й>', '1 & <н> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| и | ['4 & <ис> -> <не>', '3 & <цио> -> <с>', '2 & <Ди> -> <по>', '2 & | +| | <и > -> <н>', '2 & <и> -> <н>', '2 & <из> -> <по>', '1 & <и> -> | +| | <И>', '1 & <и> -> <я>'] | ++--------+---------------------------------------------------------------------+ +| а | ['3 & <аво> -> <ыс>', '3 & <ва,> -> <нь>', '3 & <на-> -> ', '2 & | +| | <ав> -> <иы>'] | ++--------+---------------------------------------------------------------------+ +| о | ['3 & <аво> -> <ыс>', '3 & <ор> -> <ель>', '3 & <цио> -> <с>', '2 & | +| | <по> -> <иб>', '2 & <фо> -> <уп>'] | ++--------+---------------------------------------------------------------------+ +| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ект> -> <тн>', '2 & | +| | <рт> -> <й>', '2 & <т> -> <1>', '2 & <эт> -> <уг>'] | ++--------+---------------------------------------------------------------------+ +| с | ['7 & <с> -> <е>', '4 & <ис> -> <не>', '3 & <(с > -> <С>', '1 & <с> | +| | -> <©>', '1 & <с> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| р | ['3 & <ор> -> <ель>', '2 & <гр> -> <тв>', '2 & <рт> -> <й>', '2 & | +| | <эр> -> <ци>', '1 & <р> -> <Р>', '1 & <р> -> <й>'] | 
++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.1> -> <ЗЛА>', '3 & <3Г.> -> <5>', '2 & <.3> -> < >', '1 & | +| | <3> -> <5>', '1 & <3> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| И | ['1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> -> <П>', '1 & <И> -> | +| | <и>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <аво> -> <ыс>', '3 & <ва,> -> <нь>', '2 & <ав> -> <иы>', '1 & | +| | <в> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| л | ['5 & <л> -> <п>', '3 & <глу> -> <по>', '2 & <ле> -> <ыи>', '1 & | +| | <л> -> <д>', '1 & <л> -> <и>'] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> <ИР>', '2 & <28> -> <Я >', '1 & <2> -> <1>', '1 & <2> | +| | -> <3>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['2 & < 6> -> <б>', '1 & <6> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| д | ['4 & <д> -> <л>', '3 & <д> -> <; ц>', '2 & <д> -> <; >'] | ++--------+---------------------------------------------------------------------+ +| О | ['2 & <ВО> -> <Ю>', '2 & <ПО> -> <по>', '1 & <О> -> <С>', '1 & <О> | +| | -> <о>'] | ++--------+---------------------------------------------------------------------+ +| г | ['3 & <глу> -> <по>', '2 & < г> -> <.т>', '2 & < г> -> <т>', '2 & | +| | <г.> -> <Г>', '2 & <г> -> ', '2 & <гр> -> <тв>', '1 & <г> -> < | +| | >', '1 & <г> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| E | ['39 & -> <ЕВР>', '6 & -> <ЕКР>', '3 & -> <2ЕЮ>', | +| | '3 & -> <ЕЕР>', '1 & -> <Е>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['2 & <ЕН> -> <ек>', '2 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| п | ['5 & <п> -> <и>', '2 & <п> -> <н>', '2 & <п> -> <нн>', '2 & <по> | +| | -> <иб>', '1 & <п> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| N | ['23 & -> <М>', '2 & -> <№>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕН> -> <ек>', '1 & <Е> -> <в>'] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <б> -> <6>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['4 & <Т> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| P | ['39 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| Р | ['1 & <Р> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| R | ['39 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| у | ['3 & <глу> -> <по>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['2 & <08> -> <9Ф>'] | ++--------+---------------------------------------------------------------------+ +| П | ['3 & <— П> -> <И>', '2 & <П> -> <И>', '2 & <ПО> -> <по>', '1 & <П> | +| | -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| я | ['2 & <я> -> <го>', '1 & <я> -> <л>'] | 
++--------+---------------------------------------------------------------------+ +| I | ['3 & -> <Ш>', '3 & -> <130>', '3 & -> <УП>', '2 | +| | & -> <ТХ>', '2 & -> <1>', '1 & -> <|>'] | ++--------+---------------------------------------------------------------------+ +| м | ['2 & <ем> -> <вы>'] | ++--------+---------------------------------------------------------------------+ +| C | ['7 & -> <С>', '6 & -> <С.>', '4 & <1C> -> ', '2 & | +| | <1C> -> <С>', '2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| - | ['3 & <на-> -> ', '2 & <е-> -> <.>', '1 & <-> -> < >', '1 & <-> | +| | -> <|>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <#>', '1 & <7> -> <1>', '1 & <7> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <#>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['2 & <08> -> <9Ф>', '2 & <28> -> <ИР>', '2 & <28> -> <Я >', '2 & | +| | <8 > -> <Р>', '2 & <8.> -> <$>', '2 & <8.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| r | ['4 & -> <га>', '4 & -> <ги>', '1 & -> <г>'] | ++--------+---------------------------------------------------------------------+ +| й | ['2 & <й > -> <ст>', '2 & <й> -> <го>', '1 & <й> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| В | ['2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | ++--------+---------------------------------------------------------------------+ +| o | ['3 & <(no> -> <по>', '3 & -> <2оу>'] | ++--------+---------------------------------------------------------------------+ +| u | ['4 & -> <га>', '4 & -> <ги>'] | ++--------+---------------------------------------------------------------------+ +| з | ['2 & <из> -> <по>', '1 & <з> -> <3>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['9 & <;> -> <:>', '2 & <; > -> <.>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['2 & <ч> -> <пр>', '1 & <ч> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| : | ['6 & -> <С.>', '5 & <:> -> <.>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '2 & -> <На>', '2 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| к | ['3 & <ект> -> <тн>', '1 & <к> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['2 & <БЗ> -> <53>', '1 & <Б> -> <Ъ>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['3 & <цио> -> <с>', '1 & <ц> -> <п>', '1 & <ц> -> <ш>', '1 & <ц> | +| | -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| w | ['3 & -> <ууу>'] | ++--------+---------------------------------------------------------------------+ +| d | ['3 & -> <рар>', '1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| e | ['2 & -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| O | ['3 & -> <130>', '2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['2 & <Ди> -> <по>', '1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| З | ['2 
& <БЗ> -> <53>', '2 & <ВЗ> -> <Ръ>', '1 & <З> -> <У>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| D | ['3 & -> <2ЕЮ>', '3 & -> <ЕЕР>', '2 & -> <Пу>'] | ++--------+---------------------------------------------------------------------+ +| f | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| ( | ['3 & <(no> -> <по>', '3 & <(с > -> <С>'] | ++--------+---------------------------------------------------------------------+ +| A | ['2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>', '3 & -> <КНМ>', '2 & -> <На>', '2 & | +| | -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| b | ['1 & -> <Ь>'] | ++--------+---------------------------------------------------------------------+ +| g | ['3 & -> <2оу>'] | ++--------+---------------------------------------------------------------------+ +| n | ['3 & <(no> -> <по>'] | ++--------+---------------------------------------------------------------------+ +| p | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| Г | ['3 & <3Г.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1О>'] | ++--------+---------------------------------------------------------------------+ +| * | ['4 & <* > -> <.>'] | ++--------+---------------------------------------------------------------------+ +| F | ['3 & -> <2ЕЮ>', '3 & -> <ЕЕР>'] | ++--------+---------------------------------------------------------------------+ +| S | ['3 & -> <130>', '2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| « | ['1 & <«> -> <<>'] | ++--------+---------------------------------------------------------------------+ +| » | ['2 & <»> -> <2%>'] | ++--------+---------------------------------------------------------------------+ +| ш | ['1 & <ш> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| э | ['2 & <эр> -> <ци>', '2 & <эт> -> <уг>'] | ++--------+---------------------------------------------------------------------+ +| ю | ['1 & <ю> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| M | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> < >'] | ++--------+---------------------------------------------------------------------+ +| c | ['2 & -> <|9>'] | ++--------+---------------------------------------------------------------------+ +| v | ['3 & -> <2оу>'] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| ф | ['2 & <фо> -> <уп>'] | ++--------+---------------------------------------------------------------------+ +| L | ['2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| X | ['2 & -> <ТХ>', '2 & -> <ГХ>'] | 
++--------+---------------------------------------------------------------------+ +| y | ['2 & -> <Пу>', '1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| Ш | ['2 & <Ш,> -> <П.>'] | ++--------+---------------------------------------------------------------------+ +| щ | ['1 & <щ> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| № | ['1 & <№> -> <ы>'] | ++--------+---------------------------------------------------------------------+ +| K | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| ₁ | ['1 & <₁> -> <1>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_sage-correction.txt b/resources/benchmarks/tesseract_benchmark_sage-correction.txt deleted file mode 100644 index f75ea71e..00000000 --- a/resources/benchmarks/tesseract_benchmark_sage-correction.txt +++ /dev/null @@ -1,359 +0,0 @@ -Tesseract version is 5.0.0 -Correction step: _sage-correction - -Table 1 - Accuracy for each file -+---------------+---------------------+-------+-----------------+--------------+ -| Dataset | Image name | --psm | Amount of words | Accuracy OCR | -+===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 99.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 80.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words2 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 0 | 4 | 315 | 94.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 1 | 4 | 308 | 94.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 2 | 4 | 238 | 96.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 3 | 4 | 313 | 96.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 4 | 4 | 218 | 94.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 5 | 4 | 291 | 94 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 6 | 4 | 268 | 95.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 7 | 4 | 390 | 95.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 8 | 4 | 117 | 94 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 9 | 4 | 294 | 97.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 525 | 83 | -| | oga_00 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 241 | 87 | -| | 
oga_01 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 85 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 696 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 699 | 99.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 99.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 92.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_01 | 4 | 332 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | law_image | 4 | 182 | 99.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 96.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 98.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 99.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 94.400 | -| | 0 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 98.800 | -| | 1 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 165 | 98.500 | -| | 2 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.400 | -| | 3 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 97.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 98 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_02 | 4 | 309 | 98.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_03 | 4 | 337 | 98.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_04 | 4 | 257 | 96.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_05 | 4 | 238 | 97.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_06 | 4 | 219 | 93.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 98.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 95.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 97.600 | -+---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: 
-+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | -| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | -| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | -| | s | ols | | ars | bols | | | | -+========+========+========+========+========+========+========+=======+=======+ -| englis | 94.820 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 96.04 | -| h- | | | | | | | | 0 | -| words | | | | | | | | | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| low_qu | 99.190 | 75.340 | 94.544 | 0 | 0 | 97.640 | 2752 | 95.29 | -| ality | | | | | | | | 0 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 89.767 | 77.100 | 89.533 | 0 | 0 | 86.433 | 890 | 85 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 98.956 | 90.920 | 92.104 | 0 | 0 | 99.488 | 7483 | 97.92 | -| | | | | | | | | 0 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ - -Table 3 -OCR error by symbol: -+--------+---------------------------------------------------------------------+ -| Symbol | Cnt Errors & Correct-Generated | -+========+=====================================================================+ -| | ['3 & <. №> -> < No>', '2 & < 2> -> ', '2 & < г> -> <К>', '2 & < | -| | ‚> -> <,>', "2 & <1 > -> <'>", '2 & <и > -> <н>', '2 & <№ > -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| . | ['54 & <.> -> <,>', '3 & <. №> -> < No>', '3 & <3.> -> < De>', '3 & | -| | <В.В> -> ', '2 & <Г.> -> <С>', '2 & <г.> -> <ГТ>', '2 & <п.> -> | -| | <,>'] | -+--------+---------------------------------------------------------------------+ -| , | ['80 & <,> -> <.>', '3 & <ва,> -> <нь>', '1 & <,> -> <»>'] | -+--------+---------------------------------------------------------------------+ -| е | ['6 & <не> -> ', '4 & <е> -> <ё>', '3 & <все> -> <Ко>', '3 & | -| | <ге> -> <Кри>', '3 & <е-> -> <бов>', '3 & <е> -> <а>', '3 & <цев> | -| | -> ', '3 & <че-> -> <и»>', '2 & <е> -> <и>', '2 & <е> -> | -| | <ми>', '2 & <е> -> <с>', '2 & <ее> -> ', '2 & <ле> -> <У>', '1 | -| | & <е> -> <Е>', '1 & <е> -> <о>'] | -+--------+---------------------------------------------------------------------+ -| о | ['6 & <то> -> ', '3 & <По> -> ', '3 & <Про> -> <Ис>', '3 & | -| | <но> -> ', '3 & <она> -> ', '3 & <под> -> ', '3 & | -| | <фок> -> <М>', '2 & <во> -> <за>', '2 & <до> -> ', '2 & <до> -> | -| | ', '2 & <о> -> <ак>', '2 & <о> -> <у>', '2 & <об> -> <бы>', '2 | -| | & <по> -> <10>', '2 & <то> -> ', '1 & <о> -> <в>', '1 & <о> -> | -| | <я>'] | -+--------+---------------------------------------------------------------------+ -| а | ['5 & <а> -> <о>', '4 & <на> -> ', '3 & <Нам> -> ', '3 & | -| | <а> -> <ёту>', '3 & <ва,> -> <нь>', '3 & <на> -> <под>', '3 & <она> | -| | -> ', '3 & <рак> -> <Ли>', '3 & <сан> -> <еви>', '3 & <так> -> | -| | ', '2 & <Ла> -> <А>', '2 & <а> -> <ся>', '2 & <ва> -> <к>', '2 | -| | & <на> -> ', '1 & <а> -> <Б>', '1 & <а> -> <е>', '1 & <а> -> | -| | <у>', '1 & <а> -> <ы>', '1 & <а> -> <ь>'] | -+--------+---------------------------------------------------------------------+ -| н | ['6 & <не> -> ', '4 & <на> -> ', '3 & <на> -> <под>', '3 & | -| | <но> -> ', '3 & <она> -> ', '3 & <сан> -> <еви>', '2 & | -| | <йн> -> <ем>', '2 & <н> -> <п>', '2 & <на> -> ', '2 & <нк> -> | -| | <х>', '2 & <ны> -> <им>', '1 & <н> -> <Н>', '1 & 
<н> -> <и>', '1 & | -| | <н> -> <й>', '1 & <н> -> <л>', '1 & <н> -> <м>', '1 & <н> -> <ф>'] | -+--------+---------------------------------------------------------------------+ -| и | ['4 & <и> -> <е>', '3 & <ив> -> <ьюж>', '3 & <тип> -> ', '3 & | -| | <ции> -> <узы>', '2 & <и > -> <н>', '2 & <и> -> <10>', '2 & <и> -> | -| | <ей>', '2 & <и> -> <мм>', '2 & <ис> -> <не>', '2 & <их> -> ', | -| | '2 & <их> -> ', '2 & <си> -> <ен>', '1 & <и> -> <В>', '1 & <и> | -| | -> <а>', '1 & <и> -> <с>', '1 & <и> -> <ь>'] | -+--------+---------------------------------------------------------------------+ -| - | ['8 & <-> -> <но>', '6 & <-> -> <ния>', '5 & <-> -> <в>', '3 & <-> | -| | -> <жья>', '3 & <-> -> <ков>', '3 & <-> -> <нил>', '3 & <-> -> | -| | <щим>', '3 & <е-> -> <бов>', '3 & <че-> -> <и»>', '2 & <-> -> | -| | <ве>', '2 & <-> -> <да>', '2 & <-> -> <ие>', '2 & <-> -> <ко>', '2 | -| | & <-> -> <ли>', '2 & <-> -> <м">', '2 & <-> -> <м>', '2 & <-> -> | -| | <мо>', '2 & <-> -> <ны>', '2 & <-> -> <ры>', '2 & <-> -> <ых>', '2 | -| | & <-> -> <“>', '2 & <у-> -> <ем>', '2 & <ы-> -> <им>', '2 & <ы-> -> | -| | <ём>', '1 & <-> -> <">', '1 & <-> -> <»>', '1 & <-> -> <д>', '1 & | -| | <-> -> <л>', '1 & <-> -> <н>', '1 & <-> -> <ы>'] | -+--------+---------------------------------------------------------------------+ -| 1 | ["4 & <1> -> <'>", '4 & <1С> -> ', '3 & <1> -> <3>', '3 & <№1> | -| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | -| | ', '2 & <1C> -> <С>', '2 & <1> -> <2>', '2 & <1> -> ', '1 & | -| | <1> -> ', '1 & <1> -> <5>', '1 & <1> -> <Г>', '1 & <1> -> <С>', | -| | '1 & <1> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| № | ['94 & <№> -> ', '6 & <№> -> ', '3 & <. №> -> < No>', '3 & | -| | <№1> -> ', '3 & <№1»> -> ', '2 & <№ > -> '] | -+--------+---------------------------------------------------------------------+ -| в | ['4 & <в> -> <6>', '3 & <ва,> -> <нь>', '3 & <все> -> <Ко>', '3 & | -| | <ив> -> <ьюж>', '3 & <ств> -> <У н>', '3 & <цев> -> ', '2 & | -| | <в> -> <«В>', '2 & <в> -> <зм>', '2 & <в> -> <м>', '2 & <в> -> | -| | <по>', '2 & <ва> -> <к>', '2 & <во> -> <за>', '1 & <в> -> ', '1 | -| | & <в> -> <В>', '1 & <в> -> <г>', '1 & <в> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| с | ['3 & <все> -> <Ко>', '3 & <сан> -> <еви>', '3 & <ств> -> <У н>', | -| | '2 & <ис> -> <не>', '2 & <с> -> <Не>', '2 & <с> -> <От>', '2 & <си> | -| | -> <ен>', '1 & <с> -> ', '1 & <с> -> <б>', '1 & <с> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| т | ['6 & <то> -> ', '3 & <ств> -> <У н>', '3 & <так> -> ', '3 | -| | & <тип> -> ', '2 & <т> -> <г>', '2 & <то> -> ', '1 & <т> | -| | -> <Д>', '1 & <т> -> <Т>', '1 & <т> -> <м>'] | -+--------+---------------------------------------------------------------------+ -| л | ['2 & <зл> -> <им>', '2 & <ле> -> <У>', '1 & <л> -> ', '1 & <л> | -| | -> <Л>', '1 & <л> -> <д>', '1 & <л> -> <т>'] | -+--------+---------------------------------------------------------------------+ -| р | ['3 & <Про> -> <Ис>', '3 & <гр> -> <тав>', '3 & <рак> -> <Ли>', '2 | -| | & <гр> -> ', '2 & <р> -> <ал>'] | -+--------+---------------------------------------------------------------------+ -| 2 | ['2 & < 2> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | -| | <28> -> <Я>'] | -+--------+---------------------------------------------------------------------+ -| д | ['3 & <д> -> <Пен>', '3 & <под> -> ', '2 & <до> -> ', '2 & | -| | <до> -> ', '1 & <д> -> 
<Т>', '1 & <д> -> <Ц>'] | -+--------+---------------------------------------------------------------------+ -| г | ['3 & <ге> -> <Кри>', '3 & <гр> -> <тав>', '2 & < г> -> <К>', '2 & | -| | <г.> -> <ГТ>', '2 & <г> -> <т>', '2 & <гр> -> '] | -+--------+---------------------------------------------------------------------+ -| 3 | ['3 & <3.> -> < De>', '1 & <3> -> <">', '1 & <3> -> '] | -+--------+---------------------------------------------------------------------+ -| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '3 & <ОС> -> ', '3 | -| | & <С> -> ', '2 & <ОС> -> '] | -+--------+---------------------------------------------------------------------+ -| N | ['22 & -> <М>'] | -+--------+---------------------------------------------------------------------+ -| А | ['2 & <А> -> ', '2 & <А> -> <Ли>'] | -+--------+---------------------------------------------------------------------+ -| И | ['2 & <И> -> <АН>', '1 & <И> -> <В>', '1 & <И> -> <Й>'] | -+--------+---------------------------------------------------------------------+ -| п | ['3 & <под> -> ', '3 & <тип> -> ', '2 & <п.> -> <,>', '2 | -| | & <п> -> <и >', '2 & <п> -> <л>', '2 & <по> -> <10>', '1 & <п> -> | -| | <П>'] | -+--------+---------------------------------------------------------------------+ -| к | ['3 & <рак> -> <Ли>', '3 & <так> -> ', '3 & <фок> -> <М>', '2 | -| | & <нк> -> <х>'] | -+--------+---------------------------------------------------------------------+ -| у | ['3 & <у> -> <ы>', '2 & <у-> -> <ем>'] | -+--------+---------------------------------------------------------------------+ -| Н | ['3 & <Нам> -> ', '2 & <Н> -> <ЕМ>', '1 & <Н> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| Е | ['2 & <ЕМ> -> <Ш>'] | -+--------+---------------------------------------------------------------------+ -| О | ['3 & <ОС> -> ', '2 & <ОС> -> ', '2 & <Об> -> <06>', '1 & | -| | <О> -> ', '1 & <О> -> <Ю>', '1 & <О> -> <о>'] | -+--------+---------------------------------------------------------------------+ -| П | ['3 & <По> -> ', '3 & <Про> -> <Ис>', '2 & <П> -> <И>', '1 & | -| | <П> -> <К>', '1 & <П> -> <п>'] | -+--------+---------------------------------------------------------------------+ -| б | ['3 & <"б"> -> <“8”>', '2 & <Об> -> <06>', '2 & <б> -> <«Л>', '2 & | -| | <об> -> <бы>'] | -+--------+---------------------------------------------------------------------+ -| ы | ['2 & <ны> -> <им>', '2 & <ы-> -> <им>', '2 & <ы-> -> <ём>', '1 & | -| | <ы> -> <б>', '1 & <ы> -> <е>'] | -+--------+---------------------------------------------------------------------+ -| ; | ['9 & <;> -> <:>', '1 & <;> -> <,>', '1 & <;> -> <.>'] | -+--------+---------------------------------------------------------------------+ -| Т | ['3 & <МРТ> -> ', '3 & <Т> -> <Г>', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| м | ['3 & <Нам> -> '] | -+--------+---------------------------------------------------------------------+ -| В | ['6 & <СЗВ> -> ', '3 & <В.В> -> ', '2 & <ВЗ> -> <РИ>'] | -+--------+---------------------------------------------------------------------+ -| 0 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| I | ['3 & -> <Ш>', '3 & -> <УП>', '1 & -> '] | -+--------+---------------------------------------------------------------------+ -| М | ['3 & <МРТ> -> ', '2 & <ЕМ> -> <Ш>'] | -+--------+---------------------------------------------------------------------+ -| 6 | ['3 & <608> -> '] | 
-+--------+---------------------------------------------------------------------+ -| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| ц | ['3 & <цев> -> ', '3 & <ции> -> <узы>', '2 & <ц> -> <С>', '1 & | -| | <ц> -> <щ>'] | -+--------+---------------------------------------------------------------------+ -| Л | ['2 & <Ла> -> <А>'] | -+--------+---------------------------------------------------------------------+ -| 5 | ['2 & <75> -> <2>'] | -+--------+---------------------------------------------------------------------+ -| з | ['2 & <зл> -> <им>'] | -+--------+---------------------------------------------------------------------+ -| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | -| | <28> -> <Я>'] | -+--------+---------------------------------------------------------------------+ -| й | ['2 & <й> -> <е:>', '2 & <йн> -> <ем>'] | -+--------+---------------------------------------------------------------------+ -| " | ['3 & <"б"> -> <“8”>', '2 & <"> -> <“>', '1 & <"> -> <”>'] | -+--------+---------------------------------------------------------------------+ -| 7 | ['2 & <75> -> <2>'] | -+--------+---------------------------------------------------------------------+ -| E | ['3 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | -| | '2 & <ВЗ> -> <РИ>'] | -+--------+---------------------------------------------------------------------+ -| ч | ['3 & <че-> -> <и»>'] | -+--------+---------------------------------------------------------------------+ -| : | ['2 & <:> -> '] | -+--------+---------------------------------------------------------------------+ -| [ | ['2 & <[> -> <(>'] | -+--------+---------------------------------------------------------------------+ -| ] | ['2 & <]> -> <)>'] | -+--------+---------------------------------------------------------------------+ -| 4 | ['1 & <4> -> <“>'] | -+--------+---------------------------------------------------------------------+ -| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| Б | ['3 & <БЗ> -> <653>'] | -+--------+---------------------------------------------------------------------+ -| Д | ['1 & <Д> -> <З>'] | -+--------+---------------------------------------------------------------------+ -| | | ['1 & <|> -> <1>'] | -+--------+---------------------------------------------------------------------+ -| Ц | ['1 & <Ц> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| ш | ['2 & <ш> -> <«Ч>', '1 & <ш> -> <ч>'] | -+--------+---------------------------------------------------------------------+ -| P | ['3 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| R | ['3 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| a | ['4 & -> <на>', '1 & -> <а>'] | -+--------+---------------------------------------------------------------------+ -| х | ['2 & <их> -> ', '2 & <их> -> '] | -+--------+---------------------------------------------------------------------+ -| — | ['1 & <—> -> <->'] | -+--------+---------------------------------------------------------------------+ -| G | ['2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| H | ['4 & -> <на>', '2 & -> <Из>'] | 
-+--------+---------------------------------------------------------------------+ -| V | ['3 & -> <УП>'] | -+--------+---------------------------------------------------------------------+ -| m | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| ф | ['3 & <фок> -> <М>', '1 & <ф> -> <Ф>'] | -+--------+---------------------------------------------------------------------+ -| ю | ['1 & <ю> -> <у>'] | -+--------+---------------------------------------------------------------------+ -| c | ['2 & -> <со>', '1 & -> <с>'] | -+--------+---------------------------------------------------------------------+ -| o | ['2 & -> <со>', '2 & -> '] | -+--------+---------------------------------------------------------------------+ -| Ю | ['2 & <Ю> -> <1 >'] | -+--------+---------------------------------------------------------------------+ -| ‚ | ['2 & < ‚> -> <,>'] | -+--------+---------------------------------------------------------------------+ -| Y | ['1 & -> <У>'] | -+--------+---------------------------------------------------------------------+ -| _ | ['1 & <_> -> <Х>'] | -+--------+---------------------------------------------------------------------+ -| d | ['1 & -> <4>'] | -+--------+---------------------------------------------------------------------+ -| e | ['2 & -> <Из>'] | -+--------+---------------------------------------------------------------------+ -| x | ['1 & -> <х>'] | -+--------+---------------------------------------------------------------------+ -| y | ['1 & -> <у>'] | -+--------+---------------------------------------------------------------------+ -| » | ['3 & <№1»> -> '] | -+--------+---------------------------------------------------------------------+ -| Г | ['2 & <Г.> -> <С>'] | -+--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_textblob-correction.txt b/resources/benchmarks/tesseract_benchmark_textblob-correction.txt deleted file mode 100644 index 2de957a5..00000000 --- a/resources/benchmarks/tesseract_benchmark_textblob-correction.txt +++ /dev/null @@ -1,318 +0,0 @@ -Tesseract version is 4.1.1 -Correction step: _textblob-correction - -Table 1 - Accuracy for each file -+---------------+---------------------+-------+-----------------+--------------+ -| Dataset | Image name | --psm | Amount of words | Accuracy OCR | -+===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 73.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 47.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 66.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words2 | 6 | 9 | 72.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 61.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 525 | 80.200 | -| | oga_00 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 241 | 87 | -| | oga_01 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 84.400 | 
-+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 98.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 696 | 98.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 699 | 97.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 98.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 91.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_01 | 4 | 332 | 97.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | law_image | 4 | 182 | 99 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 95.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 97.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_00 | 4 | 287 | 98.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 97.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 94.900 | -| | 0 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 98.700 | -| | 1 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 165 | 98.700 | -| | 2 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.100 | -| | 3 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 91.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 94.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_02 | 4 | 309 | 96.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_03 | 4 | 337 | 95.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_04 | 4 | 257 | 94.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_05 | 4 | 238 | 96.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_06 | 4 | 219 | 95.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 96.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 94.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 93.700 | -+---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | -| t | Spacin | Specia | Digits | Upperc | _Speci | ic | 
t of | ccura | -| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | -| | s | ols | | ars | bols | | | | -+========+========+========+========+========+========+========+=======+=======+ -| englis | 100 | 99.333 | 100 | 0 | 0 | 60.680 | 152 | 64.48 | -| h- | | | | | | | | 0 | -| words | | | | | | | | | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.767 | 80.167 | 90.700 | 0 | 0 | 83.400 | 890 | 83.86 | -| | | | | | | | | 7 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 99.328 | 91.692 | 85.916 | 0 | 0 | 97.300 | 7483 | 96.42 | -| | | | | | | | | 4 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ - -Table 3 -OCR error by symbol: -+--------+---------------------------------------------------------------------+ -| Symbol | Cnt Errors & Correct-Generated | -+========+=====================================================================+ -| о | ['198 & <по> -> ', '118 & <от> -> ', '46 & <об> -> ', | -| | '12 & <во> -> ', '12 & <то> -> ', '10 & <до> -> ', '8 & | -| | <со> -> ', '4 & <По> -> ', '4 & <Со> -> ', '4 & <но> -> | -| | ', '4 & <он> -> ', '3 & <и о> -> ', '2 & <го> -> ', | -| | '2 & <по> -> '] | -+--------+---------------------------------------------------------------------+ -| н | ['246 & <на> -> ', '92 & <не> -> ', '4 & <но> -> ', '4 | -| | & <он> -> ', '2 & <нa> -> ', '2 & <н> -> <и>', '2 & <на> -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| а | ['246 & <на> -> ', '56 & <за> -> ', '6 & <На> -> ', '4 | -| | & <За> -> ', '2 & <на> -> ', '2 & <ра> -> '] | -+--------+---------------------------------------------------------------------+ -| е | ['92 & <не> -> ', '12 & <ед> -> ', '6 & <ее> -> ', '4 & | -| | <Не> -> ', '3 & <е> -> <с>', '3 & <пер> -> ', '2 & <же> -> | -| | ', '2 & <те> -> ', '1 & <е> -> <а>'] | -+--------+---------------------------------------------------------------------+ -| т | ['118 & <от> -> ', '36 & <ст> -> ', '12 & <то> -> ', '6 | -| | & <т> -> <г>', '6 & <ти> -> < of>', '3 & <От > -> ', '3 & <тип> | -| | -> ', '2 & <рт> -> ', '2 & <те> -> '] | -+--------+---------------------------------------------------------------------+ -| п | ['198 & <по> -> ', '3 & <Тип> -> ', '3 & <пер> -> ', | -| | '3 & <тип> -> ', '2 & <п> -> <и>', '2 & <по> -> ', '1 & | -| | <п> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| | ['3 & <От > -> ', '3 & <и о> -> ', '3 & <с 6> -> ', '2 | -| | & <. 
> -> < ‘>', "2 & <1 > -> <'>", '2 & -> <№>'] | -+--------+---------------------------------------------------------------------+ -| 1 | ['104 & <1С> -> ', '18 & <1C> -> ', '8 & <1С> -> ', '4 | -| | & <11> -> <И>', '4 & <1C> -> ', "3 & <1> -> <'>", "2 & <1 > -> | -| | <'>", '2 & <1C> -> ', '2 & <1C> -> <С>', '2 & <1> -> <|>', '2 & | -| | <31> -> ', '1 & <1> -> <\\>'] | -+--------+---------------------------------------------------------------------+ -| и | ['34 & <из> -> ', '32 & <их> -> ', '12 & <им> -> ', '6 | -| | & <ти> -> < of>', '3 & <Тип> -> ', '3 & <и о> -> ', '3 & | -| | <тип> -> ', '2 & <ис> -> <не>'] | -+--------+---------------------------------------------------------------------+ -| С | ['104 & <1С> -> ', '8 & <1С> -> ', '8 & <ОС> -> ', '4 & | -| | <Со> -> ', '3 & <НДС> -> ', '2 & <ДС> -> ', '2 & <ЮС> | -| | -> <1О>', '1 & <С> -> <—>'] | -+--------+---------------------------------------------------------------------+ -| , | ['64 & <,> -> <.>', '6 & <ПО,> -> ', '1 & <,> -> <;>'] | -+--------+---------------------------------------------------------------------+ -| . | ['3 & <.> -> <,>', '3 & <3.> -> < He>', '2 & <. > -> < ‘>', '2 & | -| | <г.> -> <Г>'] | -+--------+---------------------------------------------------------------------+ -| с | ['36 & <ст> -> ', '8 & <со> -> ', '4 & <см> -> ', '3 & | -| | <с 6> -> ', '3 & <ься> -> < by>', '2 & <ис> -> <не>', '1 & <с> | -| | -> ', '1 & <с> -> <©>', '1 & <с> -> <е>'] | -+--------+---------------------------------------------------------------------+ -| з | ['56 & <за> -> ', '34 & <из> -> '] | -+--------+---------------------------------------------------------------------+ -| О | ['20 & <ПО> -> ', '14 & <Об> -> ', '8 & <ДО> -> ', '8 & | -| | <ОС> -> ', '6 & <ПО,> -> ', '4 & <АО> -> ', '4 & <ЛО> | -| | -> ', '4 & <МО> -> ', '3 & <От > -> '] | -+--------+---------------------------------------------------------------------+ -| л | ['6 & <для> -> ', '6 & <мл> -> ', '3 & <для> -> <11>', '3 & | -| | <для> -> ', '3 & <л> -> <п>', '2 & <Эл> -> ', '2 & <ул> -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| б | ['46 & <об> -> ', '14 & <Об> -> '] | -+--------+---------------------------------------------------------------------+ -| д | ['12 & <ед> -> ', '10 & <до> -> ', '6 & <для> -> ', '3 & | -| | <д> -> <л>', '3 & <для> -> <11>', '3 & <для> -> ', '1 & <д> -> | -| | <2>'] | -+--------+---------------------------------------------------------------------+ -| З | ['56 & <ФЗ> -> ', '4 & <За> -> ', '3 & <БЗ> -> <653>', '3 & | -| | <ТЗР> -> ', '2 & <ВЗ> -> <Ръ>'] | -+--------+---------------------------------------------------------------------+ -| в | ['12 & <во> -> ', '1 & <в> -> ', '1 & <в> -> <В>', '1 & <в> | -| | -> <п>'] | -+--------+---------------------------------------------------------------------+ -| Ф | ['56 & <ФЗ> -> ', '12 & <РФ> -> ', '2 & <ФД> -> '] | -+--------+---------------------------------------------------------------------+ -| м | ['12 & <им> -> ', '12 & <мг> -> ', '6 & <мл> -> ', '4 & | -| | <см> -> ', '2 & <мм> -> ', '2 & <мы> -> '] | -+--------+---------------------------------------------------------------------+ -| г | ['12 & <мг> -> ', '2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & | -| | <г> -> <т>', '2 & <го> -> ', '2 & <гр> -> ', '2 & <гр> -> | -| | <тв>', '1 & <г> -> '] | -+--------+---------------------------------------------------------------------+ -| р | ['3 & <пер> -> ', '2 & <гр> -> ', '2 & <гр> -> <тв>', '2 & | -| | <ра> -> ', '2 & <рт> -> ', '2 & <ры> -> '] 
| -+--------+---------------------------------------------------------------------+ -| П | ['20 & <ПО> -> ', '6 & <ПО,> -> ', '4 & <По> -> ', '1 | -| | & <П> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| Н | ['6 & <Н> -> <* П>', '6 & <На> -> ', '4 & <Не> -> ', '3 & | -| | <Н> -> <И>', '3 & <НДС> -> ', '2 & <ЕН> -> <ек>', '2 & <НБ> -> | -| | ', '1 & <Н> -> <П>'] | -+--------+---------------------------------------------------------------------+ -| 2 | ['4 & <28> -> ', '2 & <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| N | ['22 & -> <М>', '2 & -> <№>'] | -+--------+---------------------------------------------------------------------+ -| E | ['45 & -> <ЕВР>', '3 & -> <ЕКР>'] | -+--------+---------------------------------------------------------------------+ -| А | ['4 & <АО> -> ', '2 & <АД> -> '] | -+--------+---------------------------------------------------------------------+ -| a | ['6 & -> ', '2 & -> ', '2 & <нa> -> ', '1 & | -| | -> <а>'] | -+--------+---------------------------------------------------------------------+ -| И | ['4 & <ИБ> -> ', '2 & <ИТ> -> ', '1 & <И> -> <Н>'] | -+--------+---------------------------------------------------------------------+ -| я | ['6 & <для> -> ', '3 & <для> -> <11>', '3 & <для> -> ', '3 & | -| | <ься> -> < by>'] | -+--------+---------------------------------------------------------------------+ -| 3 | ['3 & <3.> -> < He>', '2 & <31> -> ', '1 & <3> -> '] | -+--------+---------------------------------------------------------------------+ -| P | ['45 & -> <ЕВР>', '3 & -> <ЕКР>'] | -+--------+---------------------------------------------------------------------+ -| R | ['45 & -> <ЕВР>', '3 & -> <ЕКР>', '3 & -> <ОСК>'] | -+--------+---------------------------------------------------------------------+ -| Д | ['12 & <БД> -> ', '8 & <ДО> -> ', '4 & <ЕД> -> ', '3 & | -| | <НДС> -> ', '2 & <АД> -> ', '2 & <ДС> -> ', '2 & <ФД> | -| | -> ', '1 & <Д> -> <З>'] | -+--------+---------------------------------------------------------------------+ -| e | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| Е | ['4 & <ЕД> -> ', '2 & <ЕН> -> <ек>'] | -+--------+---------------------------------------------------------------------+ -| C | ['18 & <1C> -> ', '4 & <1C> -> ', '3 & -> <ОСК>', '2 | -| | & <1C> -> ', '2 & <1C> -> <С>', '2 & -> ', '2 & -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| Р | ['12 & <РФ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| х | ['32 & <их> -> '] | -+--------+---------------------------------------------------------------------+ -| I | ['3 & -> <Ш>', '3 & -> <130>', '3 & -> ', '2 | -| | & -> ', '2 & -> <1>', '1 & -> <|>'] | -+--------+---------------------------------------------------------------------+ -| Б | ['12 & <БД> -> ', '4 & <ИБ> -> ', '3 & <БЗ> -> <653>', '2 & | -| | <НБ> -> '] | -+--------+---------------------------------------------------------------------+ -| Т | ['3 & <ТЗР> -> ', '3 & <Тип> -> ', '2 & <ИТ> -> ', '1 | -| | & <Т> -> <Г>'] | -+--------+---------------------------------------------------------------------+ -| 0 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| М | ['4 & <МО> -> '] | -+--------+---------------------------------------------------------------------+ -| у | ['2 & <ул> -> '] | 
-+--------+---------------------------------------------------------------------+ -| 6 | ['3 & <608> -> ', '3 & <с 6> -> '] | -+--------+---------------------------------------------------------------------+ -| Л | ['4 & <ЛО> -> '] | -+--------+---------------------------------------------------------------------+ -| ь | ['3 & <ься> -> < by>'] | -+--------+---------------------------------------------------------------------+ -| - | ['1 & <-> -> <—>'] | -+--------+---------------------------------------------------------------------+ -| u | ['3 & -> '] | -+--------+---------------------------------------------------------------------+ -| ; | ['9 & <;> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| В | ['2 & <ВЗ> -> <Ръ>'] | -+--------+---------------------------------------------------------------------+ -| ы | ['2 & <мы> -> ', '2 & <ры> -> '] | -+--------+---------------------------------------------------------------------+ -| c | ['1 & -> <с>'] | -+--------+---------------------------------------------------------------------+ -| p | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| ц | ['1 & <ц> -> <щ>'] | -+--------+---------------------------------------------------------------------+ -| 5 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| 8 | ['4 & <28> -> ', '3 & <608> -> ', '2 & <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| O | ['3 & -> <130>', '3 & -> <ОСК>', '2 & -> '] | -+--------+---------------------------------------------------------------------+ -| S | ['3 & -> <130>'] | -+--------+---------------------------------------------------------------------+ -| ч | ['1 & <ч> -> <з>'] | -+--------+---------------------------------------------------------------------+ -| K | ['3 & -> <КНМ>'] | -+--------+---------------------------------------------------------------------+ -| d | ['2 & -> ', '1 & -> <4>'] | -+--------+---------------------------------------------------------------------+ -| й | ['1 & <й> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| 7 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| H | ['6 & -> ', '3 & -> <КНМ>', '2 & -> '] | -+--------+---------------------------------------------------------------------+ -| V | ['3 & -> '] | -+--------+---------------------------------------------------------------------+ -| Ц | ['1 & <Ц> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| M | ['3 & -> <КНМ>'] | -+--------+---------------------------------------------------------------------+ -| № | ['6 & <№> -> '] | -+--------+---------------------------------------------------------------------+ -| G | ['2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| | | ['1 & <|> -> <1>'] | -+--------+---------------------------------------------------------------------+ -| « | ['3 & <«_»> -> '] | -+--------+---------------------------------------------------------------------+ -| » | ['3 & <«_»> -> '] | -+--------+---------------------------------------------------------------------+ -| Э | ['2 & <Эл> -> '] | -+--------+---------------------------------------------------------------------+ -| Ю | ['2 & <ЮС> -> <1О>'] | 
-+--------+---------------------------------------------------------------------+ -| ж | ['2 & <же> -> '] | -+--------+---------------------------------------------------------------------+ -| X | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| Y | ['1 & -> <У>'] | -+--------+---------------------------------------------------------------------+ -| _ | ['3 & <«_»> -> '] | -+--------+---------------------------------------------------------------------+ -| — | ['1 & <—> -> <->'] | -+--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/scripts/benchmark.py b/scripts/benchmark.py index fe4d359d..a82b2131 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -80,9 +80,10 @@ def get_times(spend_page_times: List, total_size: int, total_time: int, total_fi Task("pdf_tables", "pdf_tables", {}, get_pdf_page_count) ] print(tasks) - header = ["Dataset", "total_file_size", "total_files", "total_pages", # noqa - "total_time_raw", "throughput_raw", "mean_time_on_file_raw", "mean_time_cpu_on_page_raw", # noqa - "total_time_indp_cpu", "throughput_indp_cpu", "mean_time_on_file_indp_cpu", "mean_time_cpu_on_page_indp_cpu"] # noqa + header = [ + "Dataset", "total_file_size", "total_files", "total_pages", "total_time_raw", "throughput_raw", "mean_time_on_file_raw", "mean_time_cpu_on_page_raw", + "total_time_indp_cpu", "throughput_indp_cpu", "mean_time_on_file_indp_cpu", "mean_time_cpu_on_page_indp_cpu" + ] df = pd.DataFrame(columns=header) for directory, name, parameters, page_func in tasks: diff --git a/scripts/benchmark_pdf_attachments.py b/scripts/benchmark_pdf_attachments.py index 411f1275..1962a2bd 100644 --- a/scripts/benchmark_pdf_attachments.py +++ b/scripts/benchmark_pdf_attachments.py @@ -8,10 +8,13 @@ import wget -from dedoc.attachments_extractors import AbstractAttachmentsExtractor, PDFAttachmentsExtractor +from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor +from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor from dedoc.config import get_config -from dedoc.data_structures import AttachedFile -from dedoc.readers import BaseReader, PdfTabbyReader, PdfTxtlayerReader +from dedoc.data_structures.attached_file import AttachedFile +from dedoc.readers.base_reader import BaseReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: str) -> dict: diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index 117ba3e3..1ce5fcd9 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -9,7 +9,7 @@ from dedoc.api.api_utils import table2html from dedoc.config import get_config -from dedoc.readers import PdfImageReader +from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer from scripts.benchmark_table.metric import TEDS diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index 9bd107ac..b1fec84f 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -11,7 +11,7 @@ # 
Source: https://github.com/ibm-aur-nlp/PubTabNet from collections import deque -from typing import Optional +from typing import Iterable, Optional import distance from apted import APTED, Config @@ -21,7 +21,8 @@ class TableTree(Tree): - def __init__(self, tag: str, colspan=None, rowspan=None, content=None, visible=None, *children): # noqa + def __init__(self, tag: str, colspan: Optional[int], rowspan: Optional[int], content: Optional[str], visible: Optional[bool], *children: "TableTree") \ + -> None: self.tag = tag self.colspan = colspan self.rowspan = rowspan @@ -44,13 +45,13 @@ def bracket(self) -> str: class CustomConfig(Config): @staticmethod - def maximum(*sequences): # noqa + def maximum(*sequences: Iterable[str]) -> int: """ Get maximum possible value """ return max(map(len, sequences)) - def normalized_distance(self, *sequences) -> float: # noqa + def normalized_distance(self, *sequences: Iterable[str]) -> float: """ Get distance from 0 to 1 """ diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index d959a1f4..2538cdef 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -14,6 +14,15 @@ os.makedirs(path_result, exist_ok=True) path_result = os.path.join(path_result, "benchmarks_tl_correctness.json") +""" +Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main : + * generating synthetic incorrect text + * comparing different classification models + * comparing different input textual features: TF-IDF and custom features + * comparing on real data of correct/incorrect texts with GT using Levenshtein distance (available on Confluence -> dataset page) +Here (in this script) we calculate the accuracy of the selected model (XGBoost on custom features) on real data without GT. 
The data are PDFs with a textual layer. +""" + host = "http://localhost:1231" param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed")) diff --git a/scripts/create_txtlayer_dataset.py b/scripts/create_txtlayer_dataset.py index ca2e196c..24f22098 100644 --- a/scripts/create_txtlayer_dataset.py +++ b/scripts/create_txtlayer_dataset.py @@ -12,7 +12,7 @@ from bs4 import BeautifulSoup from tqdm import tqdm -from dedoc.readers import PdfImageReader +from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader class CorrectTextGenerator: @@ -46,7 +46,7 @@ def get_random_text(self, lang: str) -> str: article_text_fixed = re.sub(self.meta, "", article_text_fixed) article_text_fixed = re.sub(self.symbols, "", article_text_fixed) article_text_fixed = re.sub(r"\n+", "\n", article_text_fixed) - except: # noqa + except Exception: article_text_fixed = "" return article_text_fixed diff --git a/scripts/fintoc2022/dataset_loader.py b/scripts/fintoc2022/dataset_loader.py index 9d9560e2..abe2e67f 100755 --- a/scripts/fintoc2022/dataset_loader.py +++ b/scripts/fintoc2022/dataset_loader.py @@ -13,7 +13,8 @@ from Levenshtein._levenshtein import ratio from dedoc.config import get_config -from dedoc.readers import PdfTabbyReader, PdfTxtlayerReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader from train_dataset.data_structures.line_with_label import LineWithLabel diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py deleted file mode 100644 index 89fb87a1..00000000 --- a/scripts/tesseract_benchmark/ocr_correction.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -from typing import Tuple - -import torch -from sage.spelling_correction import AvailableCorrectors -from sage.spelling_correction import RuM2M100ModelForSpellingCorrection -from sage.spelling_correction.corrector import Corrector - -""" -Install sage library (for ocr correction step): -git clone https://github.com/ai-forever/sage.git -cd sage -pip install . -pip install -r requirements.txt - -Note: sage use 5.2 Gb GPU ...... 
-""" -USE_GPU = True - - -def correction(model: Corrector, ocr_text: str) -> str: - - corrected_lines = [] - for line in ocr_text.split("\n"): - corrected_lines.append(model.correct(line)[0]) - corrected_text = "\n".join(corrected_lines) - - return corrected_text - - -def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: - - corrected_path = os.path.join(cache_dir, "result_corrected") - os.makedirs(corrected_path, exist_ok=True) - corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) - if torch.cuda.is_available() and USE_GPU: - corrector.model.to(torch.device("cuda:0")) - print("use CUDA") - else: - print("use CPU") - return corrector, corrected_path diff --git a/scripts/tesseract_benchmark/requirements.txt b/scripts/tesseract_benchmark/requirements.txt deleted file mode 100644 index 5ef9a438..00000000 --- a/scripts/tesseract_benchmark/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -textblob==0.17.1 \ No newline at end of file diff --git a/scripts/tesseract_benchmark/text_blob_correction.py b/scripts/tesseract_benchmark/text_blob_correction.py deleted file mode 100644 index 73e8d70e..00000000 --- a/scripts/tesseract_benchmark/text_blob_correction.py +++ /dev/null @@ -1,9 +0,0 @@ -from textblob import TextBlob - - -class TextBlobCorrector: - def __init__(self) -> None: - return - - def correct(self, text: str) -> str: - return str(TextBlob(text).correct()) diff --git a/scripts/text_extraction_benchmark/analyze_ocr_errors.py b/scripts/text_extraction_benchmark/analyze_ocr_errors.py new file mode 100644 index 00000000..4ffd4697 --- /dev/null +++ b/scripts/text_extraction_benchmark/analyze_ocr_errors.py @@ -0,0 +1,74 @@ +import os +import re +from typing import List, Tuple + +from texttable import Texttable + + +def __parse_ocr_errors(lines: List[str]) -> List: + ocr_errors = [] + matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] + for line in lines[matched_errors[0] + 1:]: + # example line: " 2 0 { 6}-{б}" + errors = re.findall(r"(\d+)", line)[0] + chars = re.findall(r"{(.*)}-{(.*)}", line)[0] + ocr_errors.append([errors, chars[0], chars[1]]) + + return ocr_errors + + +def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: + symbols_info = [] + matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] + start_block_line = matched_symbols[0] + + for line in lines[start_block_line + 1:]: + # example line: "1187 11 99.07 {<\n>}" + row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] + row_values[-1] = row_values[-1][1:-1] # get symbol value + symbols_info.append(row_values) + # Sort errors + symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), reverse=True) # by missed + + return symbols_info, start_block_line + + +def get_summary_symbol_error(path_reports: str) -> Texttable: + # 1 - call accsum for get summary of all reports + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "accsum")) + + if os.path.exists(f"{path_reports}/../accsum_report.txt"): + os.remove(f"{path_reports}/../accsum_report.txt") + + file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) + + command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" + os.system(command) + accsum_report_path = 
os.path.join(path_reports, "..", "accsum_report.txt") + + # 2 - parse report info + with open(accsum_report_path, "r") as f: + lines = f.readlines() + + symbols_info, start_symbol_block_line = __parse_symbol_info(lines) + ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1]) + + # 3 - calculate ocr errors for a symbol + ocr_errors_by_symbol = {} + for symbol_info in symbols_info: + ocr_errors_by_symbol[symbol_info[-1]] = [] + for ocr_err in ocr_errors: + if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text + continue + if symbol_info[-1] in ocr_err[-2]: + ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") + + # 4 - create table with OCR errors + ocr_err_by_symbol_table = Texttable() + title = [["Symbol", "Cnt Errors & Correct-Generated"]] + ocr_err_by_symbol_table.add_rows(title) + for symbol, value in ocr_errors_by_symbol.items(): + if len(value) != 0: + ocr_err_by_symbol_table.add_row([symbol, value]) + + return ocr_err_by_symbol_table diff --git a/scripts/text_extraction_benchmark/text_correction/sage_corrector.py b/scripts/text_extraction_benchmark/text_correction/sage_corrector.py new file mode 100644 index 00000000..58d28d2e --- /dev/null +++ b/scripts/text_extraction_benchmark/text_correction/sage_corrector.py @@ -0,0 +1,41 @@ +import os + +import torch +from sage.spelling_correction import AvailableCorrectors +from sage.spelling_correction import RuM2M100ModelForSpellingCorrection + + +""" +Install the sage library (for the OCR correction step): +git clone https://github.com/ai-forever/sage.git +cd sage +pip install . +pip install -r requirements.txt + +Note: sage uses about 5.2 GB of GPU memory. +""" + + +class SageCorrector: + + def __init__(self, cache_dir: str, use_gpu: bool = True) -> None: + self.corrected_path = os.path.join(cache_dir, "result_corrected") + os.makedirs(self.corrected_path, exist_ok=True) + + self.corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) + self._init_device(use_gpu) + + def _init_device(self, use_gpu: bool) -> None: + if torch.cuda.is_available() and use_gpu: + self.corrector.model.to(torch.device("cuda:0")) + print("use CUDA") + else: + print("use CPU") + + def correction(self, text: str) -> str: + corrected_lines = [] + for line in text.split("\n"): + corrected_lines.append(self.corrector.correct(line)[0]) + corrected_text = "\n".join(corrected_lines) + + return corrected_text diff --git a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py b/scripts/text_extraction_benchmark/text_extraction_benchmarks.py similarity index 60% rename from scripts/tesseract_benchmark/calc_tesseract_benchmarks.py rename to scripts/text_extraction_benchmark/text_extraction_benchmarks.py index 07895d0d..ea1a000c 100644 --- a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py +++ b/scripts/text_extraction_benchmark/text_extraction_benchmarks.py @@ -2,29 +2,28 @@ import re import time import zipfile +from enum import Enum from typing import Dict, List, Tuple -import cv2 import numpy as np import pytesseract import wget from texttable import Texttable from dedoc.config import get_config -from scripts.tesseract_benchmark.ocr_correction import correction, init_correction_step -from scripts.tesseract_benchmark.text_blob_correction import TextBlobCorrector +from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader +from 
scripts.text_extraction_benchmark.analyze_ocr_errors import get_summary_symbol_error -WITHOUT_CORRECTION = "" -SAGE_CORRECTION = "_sage-correction" -TEXT_BLOB_CORRECTION = "_textblob-correction" +correction = Enum("Correction", ["SAGE_CORRECTION", "WITHOUT_CORRECTION"]) -USE_CORRECTION_OCR = TEXT_BLOB_CORRECTION +USE_CORRECTION_OCR = correction.WITHOUT_CORRECTION +reader = PdfImageReader() -def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str: - config = f"--psm {psm}" - text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"] - return text + +def _get_text_from_image(path: str, language: str) -> str: + document = reader.read(file_path=path, parameters={"language": language}) + return document.get_text() def _init_statistics_by_dataset(statistics: Dict, dataset_name: str) -> Dict: @@ -60,7 +59,6 @@ def _update_statistics_by_dataset(statistics: Dict, dataset: str, accuracy_path: acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1] statistic["Accuracy"].append(float(acc_percent)) statistic["Amount of words"].append(word_cnt) - statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], "ASCII Spacing Characters", lines) statistic["ASCII_Special_Symbols"] = _update_statistics_by_symbol_kind(statistic["ASCII_Special_Symbols"], "ASCII Special Symbols", lines) statistic["ASCII_Digits"] = _update_statistics_by_symbol_kind(statistic["ASCII_Digits"], "ASCII Digits", lines) @@ -89,77 +87,8 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: ] -def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: - symbols_info = [] - matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] - start_block_line = matched_symbols[0] - - for line in lines[start_block_line + 1:]: - # example line: "1187 11 99.07 {<\n>}" - row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] - row_values[-1] = row_values[-1][1:-1] # get symbol value - symbols_info.append(row_values) - # Sort errors - symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), reverse=True) # by missed - - return symbols_info, start_block_line - - -def __parse_ocr_errors(lines: List[str]) -> List: - ocr_errors = [] - matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] - for line in lines[matched_errors[0] + 1:]: - # example line: " 2 0 { 6}-{б}" - errors = re.findall(r"(\d+)", line)[0] - chars = re.findall(r"{(.*)}-{(.*)}", line)[0] - ocr_errors.append([errors, chars[0], chars[1]]) - - return ocr_errors - - -def __get_summary_symbol_error(path_reports: str) -> Texttable: - # 1 - call accsum for get summary of all reports - accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "accsum")) - - if os.path.exists(f"{path_reports}/../accsum_report.txt"): - os.remove(f"{path_reports}/../accsum_report.txt") - - file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) - - command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" - os.system(command) - accsum_report_path = os.path.join(path_reports, "..", "accsum_report.txt") - - # 2 - parse report info - with open(accsum_report_path, "r") as f: - lines = f.readlines() - - symbols_info, start_symbol_block_line = __parse_symbol_info(lines) - ocr_errors = 
__parse_ocr_errors(lines[:start_symbol_block_line - 1]) - - # 3 - calculate ocr errors according to a symbol - ocr_errors_by_symbol = {} - for symbol_info in symbols_info: - ocr_errors_by_symbol[symbol_info[-1]] = [] - for ocr_err in ocr_errors: - if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text - continue - if symbol_info[-1] in ocr_err[-2]: - ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") - - # 4 - create table with OCR errors - ocr_err_by_symbol_table = Texttable() - title = [["Symbol", "Cnt Errors & Correct-Generated"]] - ocr_err_by_symbol_table.add_rows(title) - for symbol, value in ocr_errors_by_symbol.items(): - if len(value) != 0: - ocr_err_by_symbol_table.add_row([symbol, value]) - - return ocr_err_by_symbol_table - - def __create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: - accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] + accs = [["Dataset", "Image name", "OCR language", "Amount of words", "Accuracy OCR"]] accs_common = [ [ "Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", @@ -198,13 +127,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c result_dir = os.path.join(cache_dir, "result_ocr") os.makedirs(result_dir, exist_ok=True) - corrector, corrected_path = None, None - if USE_CORRECTION_OCR == SAGE_CORRECTION: - corrector, corrected_path = init_correction_step(cache_dir) - elif USE_CORRECTION_OCR == TEXT_BLOB_CORRECTION: - corrector = TextBlobCorrector() - corrected_path = os.path.join(cache_dir, "result_corrected") - os.makedirs(corrected_path, exist_ok=True) + if USE_CORRECTION_OCR == correction.SAGE_CORRECTION: + from scripts.text_extraction_benchmark.text_correction.sage_corrector import SageCorrector + corrector = SageCorrector(cache_dir=cache_dir, use_gpu=True) with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -228,10 +153,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c os.remove(accuracy_path) tmp_gt_path = os.path.join(result_dir, f"{img_name}_gt.txt") - tmp_ocr_path = os.path.join(result_dir, f"{img_name}_ocr.txt") + result_ocr_filepath = os.path.join(result_dir, f"{img_name}_ocr.txt") try: - with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(tmp_ocr_path, "w") as tmp_ocr_file: + with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(result_ocr_filepath, "w") as result_ocr_file: gt_text = gt_file.read().decode("utf-8") word_cnt = len(gt_text.split()) @@ -240,28 +165,27 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c tmp_gt_file.close() arch_file.extract(imgs_path, result_dir) - image = cv2.imread(result_dir + "/" + imgs_path) - # call ocr - psm = 6 if dataset_name == "english-words" else 4 - text = _call_tesseract(image, "rus+eng", psm=psm) - tmp_ocr_file.write(text) - tmp_ocr_file.close() + # 1 - call reader + language = "rus+eng" if dataset_name == "english-words" else "rus" + text = _get_text_from_image(path=os.path.join(result_dir, imgs_path), language=language) + result_ocr_file.write(text) + result_ocr_file.close() - # call correction step + # 2 - call correction step time_b = time.time() - if 
USE_CORRECTION_OCR in (SAGE_CORRECTION, TEXT_BLOB_CORRECTION): - corrected_text = correction(corrector, text) if USE_CORRECTION_OCR == SAGE_CORRECTION else corrector.correct(text) - tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt") - with open(tmp_corrected_path, "w") as tmp_corrected_file: - tmp_corrected_file.write(corrected_text) - calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path) - else: - calculate_accuracy_script(tmp_gt_path, tmp_ocr_path, accuracy_path) + if USE_CORRECTION_OCR == correction.SAGE_CORRECTION: + corrected_text = corrector.correction(text) + result_ocr_filepath = os.path.join(corrector.corrected_path, f"{img_name}_ocr.txt") + with open(result_ocr_filepath, "w") as tmp_corrected_file: + tmp_corrected_file.write(corrected_text) correction_times.append(time.time() - time_b) + + # 3 - calculate accuracy from GTs and result texts + calculate_accuracy_script(tmp_gt_path, result_ocr_filepath, accuracy_path) statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) - accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) + accuracy_values.append([dataset_name, base_name, language, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: print(ex) @@ -274,6 +198,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c if __name__ == "__main__": base_zip = "data_tesseract_benchmarks" + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") os.makedirs(cache_dir, exist_ok=True) @@ -282,7 +207,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): - wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT/download", benchmark_data_path) + wget.download("https://at.ispras.ru/owncloud/index.php/s/gByenPIMlo0K7Gf/download", benchmark_data_path) print(f"Benchmark data downloaded to {benchmark_data_path}") else: print(f"Use cached benchmark data from {benchmark_data_path}") @@ -290,9 +215,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) - table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy) + table_errors = get_summary_symbol_error(path_reports=cache_dir_accuracy) - with open(os.path.join(output_dir, f"tesseract_benchmark{USE_CORRECTION_OCR}.txt"), "w") as res_file: + with open(os.path.join(output_dir, f"tesseract_benchmark_{USE_CORRECTION_OCR}.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\n") res_file.write(f"Correction step: {USE_CORRECTION_OCR}\n") res_file.write("\nTable 1 - Accuracy for each file\n") diff --git a/scripts/train/train_diploma_line_classifier.py b/scripts/train/train_diploma_line_classifier.py index 71a4c900..dfc1695f 100644 --- a/scripts/train/train_diploma_line_classifier.py +++ b/scripts/train/train_diploma_line_classifier.py @@ -2,7 +2,7 @@ import os from typing import Optional -from dedoc.config import _config as config +from dedoc.config import get_config from dedoc.structure_extractors.feature_extractors.diploma_feature_extractor import DiplomaFeatureExtractor from 
scripts.train.trainers.xgboost_line_classifier_trainer import XGBoostLineClassifierTrainer @@ -16,6 +16,7 @@ def skip_labels(label: str) -> Optional[str]: classifier_name = "diploma_classifier" +config = get_config() clf_resources_path = os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources", "line_type_classifiers") os.makedirs(clf_resources_path, exist_ok=True) diff --git a/scripts/train/train_acc_orientation_classifier.py b/scripts/train/train_eval_orientation_classifier.py similarity index 67% rename from scripts/train/train_acc_orientation_classifier.py rename to scripts/train/train_eval_orientation_classifier.py index 05f36083..abd558b6 100644 --- a/scripts/train/train_acc_orientation_classifier.py +++ b/scripts/train/train_eval_orientation_classifier.py @@ -3,10 +3,14 @@ from time import time from typing import List +import numpy as np import torch +from sklearn.metrics import precision_recall_fscore_support +from texttable import Texttable from torch import nn from torch import optim from torch.utils.data import DataLoader +from tqdm import tqdm from dedoc.config import get_config from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier @@ -16,6 +20,7 @@ checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) checkpoint_path = "../../resources" +output_dir = os.path.abspath(os.path.join(checkpoint_path, "benchmarks")) parser.add_argument("-t", "--train", type=bool, help="run for train model", default=False) parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", default=checkpoint_path_save) @@ -25,10 +30,16 @@ args = parser.parse_args() BATCH_SIZE = 1 -ON_GPU = False +ON_GPU = True +""" +Input data are available from our Confluence (closed data). +First, you need to generate the full train/test data (all orientations of the source documents) using scripts/gen_dataset.py +Then, you can use this script. 
+""" -def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOrientationClassifier) -> None: + +def evaluation_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOrientationClassifier) -> None: """ Function calculates accuracy for the trained model :param data_executor: Extractor Data from path @@ -47,10 +58,22 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri print(f"GroundTruth: orientation {orientation}, columns {columns}") - calc_accuracy_by_classes(testloader, data_executor.classes, net_executor, batch_size=1) + evaluation(testloader, data_executor.classes, net_executor) + + +def print_metrics(precision: np.array, recall: np.array, f1: np.array, cnt: np.array, avg: np.array, classes: List[str]) -> Texttable: + table = Texttable() + + table.header(["Class", "Precision", "Recall", "F1", "Count"]) + for i, name_class in enumerate(classes): + table.add_row([name_class, precision[i], recall[i], f1[i], cnt[i]]) + table.add_row(["AVG", avg[0], avg[1], avg[2], "None"]) -def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: ColumnsOrientationClassifier, batch_size: int = 1) -> None: + return table + + +def evaluation(testloader: DataLoader, classes: List, classifier: ColumnsOrientationClassifier) -> None: """ Function calculates accuracy ba each class :param testloader: DataLoader @@ -59,43 +82,47 @@ def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: :param batch_size: size of batch :return: """ - class_correct = list(0. for _ in range(len(classes))) - class_total = list(0. for _ in range(len(classes))) + orientation_pred, orientation_true = [], [] + column_pred, column_true = [], [] + time_predict = 0 cnt_predict = 0 with torch.no_grad(): - for data in testloader: + for data in tqdm(testloader): images, orientation, columns = data["image"], data["orientation"], data["columns"] - time_begin = time() + time_begin = time() outputs = classifier.net(images.float().to(classifier.device)) time_predict += time() - time_begin cnt_predict += len(images) + # first 2 classes mean columns number # last 4 classes mean orientation columns_out, orientation_out = outputs[:, :2], outputs[:, 2:] _, columns_predicted = torch.max(columns_out, 1) _, orientation_predicted = torch.max(orientation_out, 1) - orientation_c = (orientation_predicted == orientation.to(classifier.device)).squeeze() - columns_c = (columns_predicted == columns.to(classifier.device)).squeeze() - - for i in range(batch_size): - orientation_i = orientation[i] - columns_i = columns[i] - orientation_bool_predict = orientation_c.item() if batch_size == 1 else orientation_c[i].item() - columns_bool_predict = columns_c.item() if batch_size == 1 else columns_c[i].item() - class_correct[2 + orientation_i] += orientation_bool_predict - class_total[2 + orientation_i] += 1 - class_correct[columns_i] += orientation_bool_predict - class_total[columns_i] += 1 - if not orientation_bool_predict or not columns_bool_predict: - print( # noqa - f'{data["image_name"][i]} predict as \norientation: {classes[2 + orientation_predicted[i]]} \ncolumns: {classes[columns_predicted[i]]}' - ) - - for i in range(len(classes)): - print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") + orientation_pred.append(classes[2 + orientation_predicted.squeeze().item()]) + orientation_true.append(classes[2 + orientation.to(classifier.device).squeeze().item()]) + + 
column_pred.append(classes[columns_predicted.squeeze().item()]) + column_true.append(classes[columns.to(classifier.device).squeeze().item()]) + + with open(os.path.join(output_dir, "orient_classifier_scores.txt"), "w") as benchmark_file: + orient_metrics = precision_recall_fscore_support(orientation_true, orientation_pred, average=None, labels=classes[2:]) + orient_avg = precision_recall_fscore_support(orientation_true, orientation_pred, average="weighted") + table = print_metrics(*orient_metrics, orient_avg, classes[2:]) + print(table.draw()) + benchmark_file.write("\nOrientation predictions:\n") + benchmark_file.write(table.draw()) + + column_metrics = precision_recall_fscore_support(column_true, column_pred, average=None, labels=classes[:2]) + column_avg = precision_recall_fscore_support(column_true, column_pred, average="weighted") + table = print_metrics(*column_metrics, column_avg, classes[:2]) + print(table.draw()) + benchmark_file.write("\nColumn predictions:\n") + benchmark_file.write(table.draw()) + print(f"=== AVG Time predict {time_predict / cnt_predict}") @@ -167,8 +194,8 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat if __name__ == "__main__": config = get_config() data_executor = DataLoaderImageOrient() - net = ColumnsOrientationClassifier(on_gpu=True, checkpoint_path=checkpoint_path if not args.train else "", config=config) + net = ColumnsOrientationClassifier(on_gpu=ON_GPU, checkpoint_path=checkpoint_path if not args.train else "", config=config) if args.train: train_step(data_executor, net) else: - accuracy_step(data_executor, net) + evaluation_step(data_executor, net) diff --git a/scripts/train/train_law_line_classifier.py b/scripts/train/train_law_line_classifier.py index 7b6dd416..c9fafcff 100644 --- a/scripts/train/train_law_line_classifier.py +++ b/scripts/train/train_law_line_classifier.py @@ -2,8 +2,9 @@ import os from typing import Optional -from dedoc.config import _config as config +from dedoc.config import get_config from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures +from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import roman_regexp from scripts.train.trainers.xgboost_line_classifier_trainer import XGBoostLineClassifierTrainer from train_dataset.data_structures.line_with_label import LineWithLabel @@ -36,6 +37,7 @@ def transform_labels(label: str) -> Optional[str]: path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") +config = get_config() feature_extractor = LawTextFeatures(text_features_only=txt_classifier) classifier_parameters = dict(learning_rate=0.8, n_estimators=300, @@ -49,7 +51,7 @@ def get_sample_weight(line: LineWithLabel) -> int: label = transform_labels(line.label) class_weight = {"structure_unit": 5, "header": 0.2, "raw_text": 0.5}.get(label, 1) text_with_upper = line.line.strip() - regexps = LawTextFeatures.named_regexp + [LawTextFeatures.roman_regexp] + regexps = LawTextFeatures.named_regexp + [roman_regexp] application_regexp = LawTextFeatures.regexp_application_begin regexp_weight = 50 if any([regexp.match(text_with_upper) for regexp in regexps]) else 1 application_weight = 3000 if application_regexp.match(text_with_upper.lower()) else 1 diff --git a/scripts/train/train_nn_line_classifier_law.py b/scripts/train/train_nn_line_classifier_law.py index 2aa6e0d5..45edfcdf 100644 --- 
a/scripts/train/train_nn_line_classifier_law.py
+++ b/scripts/train/train_nn_line_classifier_law.py
@@ -2,7 +2,7 @@ import os
 from typing import Optional
 
-from dedoc.config import _config as config
+from dedoc.config import get_config
 from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from scripts.train.trainers.logreg_line_classifier_trainer import LogRegLineClassifierTrainer
 from train_dataset.data_structures.line_with_label import LineWithLabel
@@ -54,6 +54,7 @@ def get_sample_weight(line: LineWithLabel) -> float:
 classifier_parameters_nn = dict(multi_class="auto")
 
+config = get_config()
 trainer = LogRegLineClassifierTrainer(
     data_url="https://at.ispras.ru/owncloud/index.php/s/nDxc7wPQzJxoUXY/download",
diff --git a/scripts/train/train_paragraph_classifier.py b/scripts/train/train_paragraph_classifier.py
index 0f2a6ba5..30fd30ca 100644
--- a/scripts/train/train_paragraph_classifier.py
+++ b/scripts/train/train_paragraph_classifier.py
@@ -2,7 +2,7 @@ import os
 from typing import Optional
 
-from dedoc.config import _config as config
+from dedoc.config import get_config
 from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.paragraph_features import ParagraphFeatureExtractor
 from scripts.train.trainers.xgboost_line_classifier_trainer import XGBoostLineClassifierTrainer
@@ -21,6 +21,7 @@ def skip_labels(label: str) -> Optional[str]:
 path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json")
 path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx")
 
+config = get_config()
 feature_extractor = ParagraphFeatureExtractor(config=config)
 
 classifier_parameters = dict(learning_rate=0.6,
diff --git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py
index da62ca7b..ad7da326 100644
--- a/scripts/train/train_tz_line_classifier.py
+++ b/scripts/train/train_tz_line_classifier.py
@@ -2,7 +2,7 @@ import os
 from typing import Optional
 
-from dedoc.config import _config as config
+from dedoc.config import get_config
 from dedoc.structure_extractors.feature_extractors.tz_feature_extractor import TzTextFeatures
 from scripts.train.trainers.xgboost_line_classifier_trainer import XGBoostLineClassifierTrainer
@@ -15,6 +15,7 @@ def skip_labels(label: str) -> Optional[str]:
 txt_classifier = True
 classifier_name = "tz_txt_classifier" if txt_classifier else "tz_classifier"
 
+config = get_config()
 resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))
 assert os.path.isdir(resources_path)
diff --git a/scripts/train/trainers/base_sklearn_line_classifier.py b/scripts/train/trainers/base_sklearn_line_classifier.py
index 0cdfe95d..eee046fb 100644
--- a/scripts/train/trainers/base_sklearn_line_classifier.py
+++ b/scripts/train/trainers/base_sklearn_line_classifier.py
@@ -7,7 +7,7 @@ import pickle
 from collections import Counter, OrderedDict
 from statistics import mean
-from typing import Any, Callable, List, Optional
+from typing import Callable, List, Optional
 
 import numpy as np
 from sklearn.metrics import accuracy_score
@@ -28,7 +28,7 @@ class BaseClassifier(XGBClassifier):
     Base class for a classifier.
     See documentation of `XGBClassifier `_ to get more details.
     """
-    def __init__(self, **kwargs: Any) -> None:  # noqa
+    def __init__(self, **kwargs: dict) -> None:
         super().__init__(**kwargs)
diff --git a/tests/api_tests/test_api_format_csv.py b/tests/api_tests/test_api_format_csv.py
index 1e8f0778..a1e948af 100644
--- a/tests/api_tests/test_api_format_csv.py
+++ b/tests/api_tests/test_api_format_csv.py
@@ -40,6 +40,7 @@ def test_csv_books(self) -> None:
         row3 = self._get_text_of_row(table[3])
         self.assertListEqual(["id", "cat", "name", "price", "inStock", "author", "series_t", "sequence_i", "genre_s"], row0)
         self.assertListEqual(["055357342X", "book", "A Storm of Swords", "7.99", "true", "George R.R. Martin", "A Song of Ice and Fire", "3", "fantasy"], row3)
+        self.assertEqual("", table[-1][5]["lines"][0]["text"])
 
     def test_csv_books2(self) -> None:
         file_name = "books_2.csv"
diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py
index ff42a1e8..ce7e2900 100644
--- a/tests/api_tests/test_api_format_pdf.py
+++ b/tests/api_tests/test_api_format_pdf.py
@@ -1,6 +1,6 @@
 import os
 
-from dedoc.data_structures import BBoxAnnotation
+from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
 from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
 from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py
index 1d194988..c3057368 100644
--- a/tests/api_tests/test_api_format_pdf_tabby_reader.py
+++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py
@@ -163,6 +163,7 @@ def test_pdf_with_tables(self) -> None:
         self.assertListEqual(["Государство", "Место", "ВВП (по ППС) за 2018 г."], self._get_text_of_row(table[0]))
         self.assertListEqual(["Китай", "1", "25362"], self._get_text_of_row(table[1]))
         self.assertListEqual(["США", "2", "20494"], self._get_text_of_row(table[2]))
+        self.assertEqual(6, len(table[0][2]["lines"][0]["annotations"]))
 
         table = tables[1]["cells"]
         self.assertListEqual(["Государство", "Место", "ВВП (по ППС) за 2018 г."], self._get_text_of_row(table[0]))
diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py
index 214265be..50c157cb 100644
--- a/tests/api_tests/test_api_format_pptx.py
+++ b/tests/api_tests/test_api_format_pptx.py
@@ -1,6 +1,6 @@
 import os
 
-from dedoc.data_structures import TableAnnotation
+from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
diff --git a/tests/api_tests/test_api_misc_structure.py b/tests/api_tests/test_api_misc_structure.py
index 978e8ec4..f762b026 100644
--- a/tests/api_tests/test_api_misc_structure.py
+++ b/tests/api_tests/test_api_misc_structure.py
@@ -31,6 +31,20 @@ def test_tree_structure(self) -> None:
         self.assertEqual("Пример документа", nodes[0]["text"].split("\n")[0])
         self.assertEqual("1.2.1. Поясним за непонятное", nodes[1]["subparagraphs"][0]["text"].strip())
 
+    def test_page_id_tree_structure(self) -> None:
+        file_name = os.path.join("..", "pdf_with_text_layer", "test_page_id.pdf")
+        result = self._send_request(file_name, data={"structure_type": "tree"})
+        node = result["content"]["structure"]["subparagraphs"][0]
+
+        page_change_positions = [2135, 4270, 6405, 8540, 10675, 12810, 13323]
+        for idx, additional_page_id in enumerate(node["metadata"]["additional_page_ids"], start=1):
+            self.assertEqual(idx, additional_page_id["page_id"])
+            start, end = page_change_positions[idx - 1], page_change_positions[idx]
+            self.assertEqual(start, additional_page_id["start"])
+            self.assertEqual(end, additional_page_id["end"])
+            self.assertFalse(node["text"][start:end].startswith("\n"))
+            self.assertTrue(node["text"][start:end].endswith("\n"))
+
     def test_incorrect_structure(self) -> None:
         file_name = "example.docx"
         _ = self._send_request(file_name, data={"structure_type": "bagel"}, expected_code=400)
diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py
index 35996d1a..737adfb8 100644
--- a/tests/api_tests/test_api_misc_with_images_refs.py
+++ b/tests/api_tests/test_api_misc_with_images_refs.py
@@ -1,6 +1,6 @@
 import os
 
-from dedoc.data_structures import AttachAnnotation
+from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
diff --git a/tests/data/csvs/books.csv b/tests/data/csvs/books.csv
index 8ccecbbe..486f8886 100644
--- a/tests/data/csvs/books.csv
+++ b/tests/data/csvs/books.csv
@@ -8,4 +8,4 @@ id,cat,name,price,inStock,author,series_t,sequence_i,genre_s
 0441385532,book,Jhereg,7.95,false,Steven Brust,Vlad Taltos,1,fantasy
 0380014300,book,Nine Princes In Amber,6.99,true,Roger Zelazny,the Chronicles of Amber,1,fantasy
 0805080481,book,The Book of Three,5.99,true,Lloyd Alexander,The Chronicles of Prydain,1,fantasy
-080508049X,book,The Black Cauldron,5.99,true,Lloyd Alexander,The Chronicles of Prydain,2,fantasy
+080508049X,book,The Black Cauldron,5.99,true,,The Chronicles of Prydain,2,fantasy
diff --git a/tests/data/pdf_with_text_layer/test_page_id.pdf b/tests/data/pdf_with_text_layer/test_page_id.pdf
new file mode 100644
index 00000000..c5de845e
Binary files /dev/null and b/tests/data/pdf_with_text_layer/test_page_id.pdf differ
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5764470e..118a4483 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,10 +1,11 @@
-import importlib
 import os
 import signal
+from copy import deepcopy
 from typing import Any, List, Optional, Union
 
 from dedocutils.data_structures import BBox
 
+from dedoc.config import get_config
 from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
@@ -24,11 +25,7 @@ def get_by_tree_path(tree: dict, path: Union[List[int], str]) -> dict:
 
 
 def get_test_config() -> dict:
-    config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dedoc/config.py"))
-    spec = importlib.util.spec_from_file_location("config_module", config_path)
-    config_module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(config_module)
-    config = config_module._config
+    config = deepcopy(get_config())
     return config
diff --git a/tests/unit_tests/test_doctype_law_text_features_regexps.py b/tests/unit_tests/test_doctype_law_text_features_regexps.py
index 688a6ad6..0f86c46d 100644
--- a/tests/unit_tests/test_doctype_law_text_features_regexps.py
+++ b/tests/unit_tests/test_doctype_law_text_features_regexps.py
@@ -1,16 +1,17 @@
 import unittest
 
 from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import roman_regexp
 
 
 class TestLawTextFeaturesRegexps(unittest.TestCase):
     features = LawTextFeatures()
 
     def test_roman_regexp(self) -> None:
-        self.assertTrue(self.features.roman_regexp.fullmatch(" XI. "))
-        self.assertTrue(self.features.roman_regexp.fullmatch(" ") is None)
-        self.assertTrue(self.features.roman_regexp.fullmatch(" XI.") is None)
-        self.assertTrue(self.features.roman_regexp.fullmatch("\tIII. "))
+        self.assertTrue(roman_regexp.fullmatch(" XI. "))
+        self.assertTrue(roman_regexp.fullmatch(" ") is None)
+        self.assertTrue(roman_regexp.fullmatch(" XI.") is None)
+        self.assertTrue(roman_regexp.fullmatch("\tIII. "))
 
     def test_application_beginnings_with_regexp(self) -> None:
         self.assertTrue(self.features.regexp_application_begin.fullmatch("приложение"))
diff --git a/tests/unit_tests/test_format_docx_reader.py b/tests/unit_tests/test_format_docx_reader.py
index f26c1e27..2497611d 100644
--- a/tests/unit_tests/test_format_docx_reader.py
+++ b/tests/unit_tests/test_format_docx_reader.py
@@ -4,7 +4,7 @@ from tempfile import TemporaryDirectory
 
 from dedoc.config import get_config
-from dedoc.data_structures import SizeAnnotation
+from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
 from dedoc.readers.docx_reader.docx_reader import DocxReader
diff --git a/tests/unit_tests/test_misc_line_object_linker.py b/tests/unit_tests/test_misc_line_object_linker.py
index f21a939a..7993170b 100644
--- a/tests/unit_tests/test_misc_line_object_linker.py
+++ b/tests/unit_tests/test_misc_line_object_linker.py
@@ -27,7 +27,7 @@ def test_line_spacing(self) -> None:
         line4 = create_line_by_coordinates(x_top_left=15, y_top_left=7, height=2, width=7, page=0)
         line5 = create_line_by_coordinates(x_top_left=2, y_top_left=1, height=2, width=9, page=1)
         lines = [line1, line2, line3, line4, line5]
-        self.metadata_extractor._LineMetadataExtractor__add_spacing_annotations(lines)  # noqa
+        self.metadata_extractor._LineMetadataExtractor__add_spacing_annotations(lines)
         self.assertEqual(self.metadata_extractor.default_spacing, self._get_spacing(line1))
         self.assertEqual(50, self._get_spacing(line2))
         self.assertEqual(self.metadata_extractor.default_spacing, self._get_spacing(line3))
diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py
index ac816cb6..c0b69968 100644
--- a/tests/unit_tests/test_module_attachment_extractor.py
+++ b/tests/unit_tests/test_module_attachment_extractor.py
@@ -7,8 +7,11 @@
 from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
 from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
 from dedoc.dedoc_manager import DedocManager
-from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader, PptxReader
+from dedoc.readers.archive_reader.archive_reader import ArchiveReader
 from dedoc.readers.docx_reader.docx_reader import DocxReader
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+from dedoc.readers.pptx_reader.pptx_reader import PptxReader
 from tests.test_utils import get_test_config
diff --git a/tests/unit_tests/test_module_font_classifier.py b/tests/unit_tests/test_module_font_classifier.py
index 513220bf..1ee040a0 100644
--- a/tests/unit_tests/test_module_font_classifier.py
+++ b/tests/unit_tests/test_module_font_classifier.py
@@ -4,7 +4,7 @@
 import cv2
 from dedocutils.data_structures import BBox
 
-from dedoc.data_structures import BoldAnnotation
+from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
 from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
 from dedoc.readers.pdf_reader.data_classes.word_with_bbox import WordWithBBox