diff --git a/.flake8 b/.flake8 index d0da5961..804c52e0 100644 --- a/.flake8 +++ b/.flake8 @@ -7,6 +7,19 @@ inline-quotes = " application-import-names = dedoc, tests, scripts, train_dataset import-order-style = pycharm +extend-immutable-calls = File, Depends + +banned-modules = + dedoc = Use full path + dedoc.data_structures = Use full path + dedoc.attachments_extractors = Use full path + dedoc.attachments_handler = Use full path + dedoc.converters = Use full path + dedoc.metadata_extractors = Use full path + dedoc.readers = Use full path + dedoc.structure_constructors = Use full path + dedoc.structure_extractors = Use full path + exclude = .git, __pycache__, @@ -28,9 +41,11 @@ exclude = # ANN202 - Missing return type annotation for protected function # ANN204 - Missing return type annotation for special method # N802 - function name should be lowercase +# I251 - Banned import (Use full path) ignore = ANN101 per-file-ignores = scripts/*:T201 scripts/benchmark_pdf_performance*:JS101 tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802 + docs/source/_static/code_examples/*:I251 diff --git a/.github/check_version.py b/.github/check_version.py index 9a107efa..06120405 100644 --- a/.github/check_version.py +++ b/.github/check_version.py @@ -23,7 +23,7 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern args = parser.parse_args() print(f"Old version: {args.old_version}, new version: {args.new_version}, " - f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}") # noqa + f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}") master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$") develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$") @@ -43,4 +43,4 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern) assert args.pre_release != "true", "Pre-releases are not allowed 
on master" - print("Version is correct") # noqa + print("Version is correct") diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f439368..09231202 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,7 @@ repos: flake8-import-order==0.18.2, flake8-multiline-containers==0.0.19, flake8-print==5.0.0, + flake8-tidy-imports==4.10.0, flake8-quotes==3.3.2, flake8-use-fstring==1.4, pycodestyle==2.9.0, diff --git a/Dockerfile b/Dockerfile index 779508df..3d00dea6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,7 @@ ARG REPOSITORY="docker.io" FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 +ARG LANGUAGES="" +RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root" ENV RESOURCES_PATH "/dedoc_root/resources" diff --git a/README.md b/README.md index 5293342e..ab022482 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,18 @@ # Dedoc +[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) [![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc) +[![PyPI downloads](https://pepy.tech/badge/dedoc)](https://pepy.tech/project/dedoc) +[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls") [![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) -[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest) -[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) [![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space) -[![Docker 
Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls") +[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest) [![CI tests](https://github.com/ispras/dedoc/workflows/CI/badge.svg)](https://github.com/ispras/dedoc/actions) ![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png) Dedoc is an open universal system for converting documents to a unified output format. -It extracts a document’s logical structure and content, its tables, text formatting and metadata. +It extracts a document’s logical structure and content: tables, text formatting and metadata. The document’s content is represented as a tree storing headings and lists of any level. Dedoc can be integrated in a document contents and structure analysis system as a separate module. @@ -22,14 +23,14 @@ Dedoc can be integrated in a document contents and structure analysis system as Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow) ## Features and advantages -Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and none-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. +Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and unstructured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. Document structure extraction is fully automatic regardless of input data type. Metadata and text formatting are also extracted automatically. In 2022, the system won a grant to support the development of promising AI projects from the [Innovation Assistance Foundation (Фонд содействия инновациям)](https://fasie.ru/). 
## Dedoc provides: -* Extensibility due to a flexible addition of new document formats and to an easy change of an output data format. +* Extensibility due to flexible addition of new document formats and easy change of an output data format. * Support for extracting document structure out of nested documents having different formats. * Extracting various text formatting features (indentation, font type, size, style etc.). * Working with documents of various origin (statements of work, legal documents, technical reports, scientific papers) allowing flexible tuning for new domains. @@ -68,7 +69,7 @@ The system processes different document formats. The main formats are listed bel ## Impact -This project may be useful as a first step of automatic document analysis pipeline (e.g. before the NLP part). +This project may be useful as a first step of an automatic document analysis pipeline (e.g. before the NLP part). Dedoc is in demand for information analytic systems, information leak monitoring systems, as well as for natural language processing systems. The library is intended for application use by developers of systems for automatic analysis and structuring of electronic documents, including for further search in electronic documents. @@ -92,7 +93,7 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io # Installation instructions -This project has REST Api and you can run it in Docker container. +This project has a REST API and you can run it in a Docker container. Also, dedoc can be installed as a library via `pip`. There are two ways to install and run dedoc as a web application or a library that are described below. @@ -149,7 +150,7 @@ If you need to change some application settings, you may update `config.py` acco If you don't want to use docker for running the application, it's possible to run dedoc locally. 
However, it isn't suitable for any operating system (`Ubuntu 20+` is recommended) and -there may be not enough machine's resources for its work. +there may be not enough machine resources for its work. You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed. Installation instructions via pip are available [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-dedoc-using-pip). diff --git a/VERSION b/VERSION index 6b4d1577..04761555 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.3 \ No newline at end of file +2.2.4 \ No newline at end of file diff --git a/dedoc/__init__.py b/dedoc/__init__.py index 82dbebad..e9841fc0 100644 --- a/dedoc/__init__.py +++ b/dedoc/__init__.py @@ -1,2 +1,2 @@ -from .dedoc_manager import DedocManager # noqa -from .version import __version__ # noqa +from .dedoc_manager import DedocManager +from .version import __version__ diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 20d01db1..c77ec19a 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -28,7 +28,7 @@ class QueryParameters: # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], description="Extract text from a text layer of PDF or using OCR methods for image-like documents") - language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language") + language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')") pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') is_one_column_document: str = Form("auto", enum=["auto", "true", "false"], description='One or multiple column document, "auto" - predict number of page columns automatically') diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index c942fefa..ad91f2d8 100644 --- a/dedoc/api/api_utils.py +++ 
b/dedoc/api/api_utils.py @@ -120,12 +120,20 @@ def json2html(text: str, attachments: Optional[List[ParsedDocument]], tabs: int = 0, table2id: Dict[str, int] = None, - attach2id: Dict[str, int] = None) -> str: + attach2id: Dict[str, int] = None, + prev_page_id: Optional[List[int]] = None) -> str: + if prev_page_id is None: + prev_page_id = [0] + tables = [] if tables is None else tables attachments = [] if attachments is None else attachments table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)} if table2id is None else table2id attach2id = {attachment.metadata.uid: attachment_id for attachment_id, attachment in enumerate(attachments)} if attach2id is None else attach2id + if paragraph.metadata.page_id != prev_page_id[0]: + text += f"
-
diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py index 21a4497b..d55859bf 100644 --- a/dedoc/attachments_extractors/abstract_attachment_extractor.py +++ b/dedoc/attachments_extractors/abstract_attachment_extractor.py @@ -1,12 +1,7 @@ -import logging -import os -import uuid from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple from dedoc.data_structures.attached_file import AttachedFile -from dedoc.utils.parameter_utils import get_param_attachments_dir -from dedoc.utils.utils import get_mime_extension, save_data_to_unique_file class AbstractAttachmentsExtractor(ABC): @@ -19,6 +14,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param recognized_mimes: set of supported MIME types of files """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions @@ -39,6 +36,7 @@ def can_extract(self, :param parameters: any additional parameters for the given document :return: the indicator of possibility to get attachments of this file """ + from dedoc.utils.utils import get_mime_extension mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes @@ -66,7 +64,13 @@ def with_attachments(parameters: dict) -> bool: return str(parameters.get("with_attachments", "false")).lower() == "true" def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool, parameters: dict) -> List[AttachedFile]: + import os + import uuid + from dedoc.utils.parameter_utils import get_param_attachments_dir + from dedoc.utils.utils 
import save_data_to_unique_file + attachments = [] + attachments_dir = get_param_attachments_dir(parameters, tmpdir) for original_name, contents in content: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py index eaac7cf5..d267f4ff 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py @@ -1,14 +1,8 @@ -import os -import zipfile from abc import ABC from typing import List, Optional, Set, Tuple -import olefile -from charset_normalizer import from_bytes - from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.utils.parameter_utils import get_param_need_content_analysis class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC): @@ -25,6 +19,8 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: :param stream: binary content of olefile :return: tuple of (name of original file and binary file content) """ + from charset_normalizer import from_bytes + # original filename in ANSI starts at byte 7 and is null terminated stream = stream[6:] @@ -65,6 +61,11 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: return filename, contents def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachments_dir: str) -> List[AttachedFile]: + import olefile + import os + import zipfile + from dedoc.utils.parameter_utils import get_param_need_content_analysis + result = [] with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py 
b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index ea5da542..8c054d0d 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -1,17 +1,9 @@ -import hashlib -import os -import re -import tempfile -import zipfile from typing import List, Optional - -from bs4 import BeautifulSoup, Tag +from zipfile import BadZipFile, ZipFile from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.parameter_utils import get_param_need_content_analysis class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -19,6 +11,7 @@ class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): Extract attachments from docx files. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -28,11 +21,14 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + import os + from dedoc.utils.parameter_utils import get_param_need_content_analysis + parameters = {} if parameters is None else parameters tmpdir, filename = os.path.split(file_path) result = [] try: - with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: + with ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diagram_attachments = self.__extract_diagrams(zfile) need_content_analysis = get_param_need_content_analysis(parameters) result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis, @@ -40,17 +36,23 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word") - except zipfile.BadZipFile: + except BadZipFile: raise BadFileFormatError(f"Bad docx file:\n file_name = {filename}. Seems docx is broken") return result - def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: + def __extract_diagrams(self, document: ZipFile) -> List[tuple]: """ Creates files for diagram: separate file for each paragraph with diagram. 
:param document: archive with docx document :returns: list of files with diagrams """ + import hashlib + import os + import re + import tempfile + from bs4 import BeautifulSoup, Tag + result = [] try: content = document.read("word/document.xml") @@ -85,7 +87,7 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: with open(f"{tmpdir}/word/document.xml", "w") as f: f.write(doc_text) diagram_name = f"{uid}.docx" - with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d: + with ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d: for filename in namelist: new_d.write(os.path.join(tmpdir, filename), arcname=filename) with open(os.path.join(tmpdir, diagram_name), "rb") as f: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py index 7097e5d3..70d64ce8 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py @@ -1,9 +1,7 @@ -import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -11,6 +9,7 @@ class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): Extracts attachments from xlsx files. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.excel_like_format, recognized_mimes=recognized_mimes.excel_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -20,6 +19,8 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + import os + parameters = {} if parameters is None else parameters tmpdir, filename = os.path.split(file_path) return self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="xl") diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py index a68c5848..a1f47bad 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py @@ -1,11 +1,7 @@ -import json -import os from typing import List, Optional from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.parameter_utils import get_param_need_content_analysis class JsonAttachmentsExtractor(AbstractAttachmentsExtractor): @@ -13,6 +9,7 @@ class JsonAttachmentsExtractor(AbstractAttachmentsExtractor): Extract attachments from json files. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -32,6 +29,10 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + import json + import os + from dedoc.utils.parameter_utils import get_param_need_content_analysis + parameters = {} if parameters is None else parameters tmpdir, filename = os.path.split(file_path) attachments = [] diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index b152ca63..fc58af36 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -1,17 +1,9 @@ -import json -import os -import uuid from typing import List, Optional, Tuple -import PyPDF2 -from PyPDF2.pdf import PageObject -from PyPDF2.utils import PdfReadError +from PyPDF2.pdf import PageObject, PdfFileReader from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis -from dedoc.utils.utils import convert_datetime, get_unique_name class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): @@ -19,6 +11,7 @@ class 
PDFAttachmentsExtractor(AbstractAttachmentsExtractor): Extract attachments from pdf files. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: @@ -28,12 +21,16 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + import os + from PyPDF2.utils import PdfReadError + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + parameters = {} if parameters is None else parameters filename = os.path.basename(file_path) with open(file_path, "rb") as handler: try: - reader = PyPDF2.PdfFileReader(handler) + reader = PdfFileReader(handler) except Exception as e: self.logger.warning(f"can't handle {filename}, get {e}") return [] @@ -52,6 +49,8 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att return self._content2attach_file(content=attachments, tmpdir=attachments_dir, need_content_analysis=need_content_analysis, parameters=parameters) def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: + from dedoc.utils.utils import convert_datetime + attachments = [] if "/Annots" in page.keys(): for annot in page["/Annots"]: @@ -72,7 +71,7 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: attachments.append((name, bytes(content))) return attachments - def __get_page_level_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str, bytes]]: + def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]: cnt_page = reader.getNumPages() attachments 
= [] for i in range(cnt_page): @@ -82,12 +81,14 @@ def __get_page_level_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tup return attachments - def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str, bytes]]: + def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]: """ Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring. :return: dictionary of filenames and bytestrings """ + import uuid + attachments = [] catalog = reader.trailer["/Root"] if "/Names" in catalog.keys() and "/EmbeddedFiles" in catalog["/Names"].keys() and "/Names" in catalog["/Names"]["/EmbeddedFiles"].keys(): @@ -104,6 +105,9 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str return attachments def __create_note(self, content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: + import json + from dedoc.utils.utils import get_unique_name + filename = get_unique_name("note.json") note_dict = { "content": content, diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py index f0fd8c9f..bd26456f 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py @@ -3,7 +3,6 @@ from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile -from dedoc.extensions import recognized_extensions, recognized_mimes class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -11,6 +10,7 @@ class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): Extract attachments from pptx files. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format) def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index b657dd88..8d88f9e6 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -1,14 +1,11 @@ -import copy -import logging -import os -import time from typing import List, Optional from dedoc.common.exceptions.dedoc_error import DedocError -from dedoc.data_structures import AttachedFile, DocumentMetadata, ParsedDocument +from dedoc.data_structures.attached_file import AttachedFile +from dedoc.data_structures.document_metadata import DocumentMetadata +from dedoc.data_structures.parsed_document import ParsedDocument from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.utils.parameter_utils import get_param_with_attachments -from dedoc.utils.utils import get_empty_content +from dedoc.dedoc_manager import DedocManager class AttachmentsHandler: @@ -26,10 +23,12 @@ def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: configuration of the handler, e.g. logger for logging """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) - def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa + def handle_attachments(self, document_parser: DedocManager, document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: """ Handle attachments of the document in the intermediate representation. 
@@ -39,6 +38,11 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct are important, look to the API parameters documentation for more details). :return: list of parsed document attachments """ + import copy + import os + import time + from dedoc.utils.parameter_utils import get_param_with_attachments + attachments = [] recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1 @@ -76,7 +80,8 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct attachments.append(parsed_file) return attachments - def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa + def __get_empty_document(self, document_parser: DedocManager, attachment: AttachedFile, parameters: dict) -> ParsedDocument: + from dedoc.utils.utils import get_empty_content metadata = document_parser.document_metadata_extractor.extract( file_path=attachment.get_filename_in_path(), original_filename=attachment.get_original_filename(), diff --git a/dedoc/common/exceptions/dedoc_error.py b/dedoc/common/exceptions/dedoc_error.py index 78426e39..f91c8bd0 100644 --- a/dedoc/common/exceptions/dedoc_error.py +++ b/dedoc/common/exceptions/dedoc_error.py @@ -1,6 +1,6 @@ from typing import Optional -import dedoc +import dedoc.version class DedocError(Exception): @@ -14,7 +14,7 @@ def __init__(self, self.msg = msg self.msg_api = msg if msg_api is None else msg_api self.filename = filename - self.version = version if version is not None else dedoc.__version__ + self.version = version if version is not None else dedoc.version.__version__ self.metadata = metadata def __str__(self) -> str: diff --git a/dedoc/config.py b/dedoc/config.py index 06d98894..58ebc88a 100644 --- a/dedoc/config.py +++ b/dedoc/config.py @@ -1,55 +1,3 @@ -import logging -import os -import sys - -logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - 
%(levelname)s - %(message)s") - -DEBUG_MODE = False -RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) - -_config = dict( - # -----------------------------------------RESOURCES PATH SETTINGS---------------------------------------------------- - resources_path=RESOURCES_PATH, - intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"), - table_path="/tmp/tables", - - # -----------------------------------------COMMON DEBUG SETTINGS---------------------------------------------------- - debug_mode=DEBUG_MODE, - path_debug=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc"), - - # --------------------------------------------JOBLIB SETTINGS------------------------------------------------------- - # number of parallel jobs in some tasks as OCR - n_jobs=1, - - # --------------------------------------------GPU SETTINGS---------------------------------------------------------- - # set gpu in XGBoost and torch models - on_gpu=False, - - # ---------------------------------------------API SETTINGS--------------------------------------------------------- - # max file size in bytes - max_content_length=512 * 1024 * 1024, - # application port - api_port=int(os.environ.get("DOCREADER_PORT", "1231")), - static_files_dirs={}, - # log settings - logger=logging.getLogger(), - import_path_init_api_args="dedoc.api.api_args", - - # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS------------------------------------------- - # path to save debug images for tables recognizer - path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"), - - # -------------------------------------------RECOGNIZE SETTINGS----------------------------------------------------- - # TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value) - ocr_conf_threshold=40.0, - # max depth of document structure tree - 
recursion_deep_subparagraphs=30, - - # -------------------------------------------EXTERNAL SERVICES SETTINGS--------------------------------------------- - grobid_max_connection_attempts=3 -) - - class Configuration(object): """ Pattern Singleton for configuration service @@ -70,7 +18,54 @@ def get_instance(cls: "Configuration") -> "Configuration": def get_config(self) -> dict: if self.__config is None: - self.__config = _config + import logging + import os + import sys + + logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") + + resources_path = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) + self.__config = dict( + # -----------------------------------------RESOURCES PATH SETTINGS---------------------------------------------------- + resources_path=resources_path, + intermediate_data_path=os.path.join(resources_path, "datasets"), + table_path="/tmp/tables", + + # -----------------------------------------COMMON DEBUG SETTINGS---------------------------------------------------- + debug_mode=False, + path_debug=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc"), + + # --------------------------------------------JOBLIB SETTINGS------------------------------------------------------- + # number of parallel jobs in some tasks as OCR + n_jobs=1, + + # --------------------------------------------GPU SETTINGS---------------------------------------------------------- + # set gpu in XGBoost and torch models + on_gpu=False, + + # ---------------------------------------------API SETTINGS--------------------------------------------------------- + # max file size in bytes + max_content_length=512 * 1024 * 1024, + # application port + api_port=int(os.environ.get("DOCREADER_PORT", "1231")), + static_files_dirs={}, + # log settings + logger=logging.getLogger(), + import_path_init_api_args="dedoc.api.api_args", + + # 
----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS------------------------------------------- + # path to save debug images for tables recognizer + path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"), + + # -------------------------------------------RECOGNIZE SETTINGS----------------------------------------------------- + # TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value) + ocr_conf_threshold=40.0, + # max depth of document structure tree + recursion_deep_subparagraphs=30, + + # -------------------------------------------EXTERNAL SERVICES SETTINGS--------------------------------------------- + grobid_max_connection_attempts=3 + ) return self.__config diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py index d4385fe4..0e9f5310 100644 --- a/dedoc/converters/concrete_converters/abstract_converter.py +++ b/dedoc/converters/concrete_converters/abstract_converter.py @@ -1,11 +1,7 @@ -import logging -import os -import subprocess from abc import ABC, abstractmethod from typing import List, Optional, Set from dedoc.common.exceptions.conversion_error import ConversionError -from dedoc.utils.utils import get_mime_extension class AbstractConverter(ABC): @@ -18,6 +14,8 @@ def __init__(self, *, config: Optional[dict] = None, converted_extensions: Optio :param converted_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param converted_mimes: set of supported MIME types of files """ + import logging + self.timeout = 60 self.period_checking = 0.05 self.config = {} if config is None else config @@ -40,6 +38,8 @@ def can_convert(self, :param parameters: any additional parameters for the given document :return: the indicator of possibility to convert this file """ + from dedoc.utils.utils import get_mime_extension + mime, extension = 
get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in self._converted_extensions or mime in self._converted_mimes @@ -58,6 +58,9 @@ def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: pass def _run_subprocess(self, command: List[str], filename: str, expected_path: str) -> None: + import os + import subprocess + try: conversion_results = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=self.timeout) error_message = conversion_results.stderr.decode().strip() diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py index ba7741cf..bfb00cb4 100644 --- a/dedoc/converters/concrete_converters/binary_converter.py +++ b/dedoc/converters/concrete_converters/binary_converter.py @@ -1,9 +1,6 @@ from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.converters.concrete_converters.png_converter import PNGConverter -from dedoc.utils import supported_image_types -from dedoc.utils.utils import get_mime_extension class BinaryConverter(AbstractConverter): @@ -12,6 +9,7 @@ class BinaryConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.converters.concrete_converters.png_converter import PNGConverter super().__init__(config=config) self.png_converter = PNGConverter(config=self.config) @@ -23,6 +21,9 @@ def can_convert(self, """ Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`. 
""" + from dedoc.utils import supported_image_types + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return mime == "application/octet-stream" and extension in supported_image_types diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py index 539a5d14..3422e400 100644 --- a/dedoc/converters/concrete_converters/docx_converter.py +++ b/dedoc/converters/concrete_converters/docx_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class DocxConverter(AbstractConverter): @@ -12,12 +9,16 @@ class DocxConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.docx_like_format, converted_mimes=converted_mimes.docx_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the docx-like documents into files with .docx extension using the soffice application. 
""" + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", file_dir, file_path] diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py index 2d3d2b59..351e5312 100644 --- a/dedoc/converters/concrete_converters/excel_converter.py +++ b/dedoc/converters/concrete_converters/excel_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class ExcelConverter(AbstractConverter): @@ -12,12 +9,16 @@ class ExcelConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.excel_like_format, converted_mimes=converted_mimes.excel_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the xlsx-like documents into files with .xlsx extension using the soffice application. 
""" + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", file_dir, file_path] diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index 44b9fba1..306890ed 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class PDFConverter(AbstractConverter): @@ -12,12 +9,16 @@ class PDFConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.pdf_like_format, converted_mimes=converted_mimes.pdf_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pdf-like documents into files with .pdf extension using the ddjvu application. 
""" + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pdf") diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py index fc04f876..fe616299 100644 --- a/dedoc/converters/concrete_converters/png_converter.py +++ b/dedoc/converters/concrete_converters/png_converter.py @@ -1,13 +1,7 @@ -import os from typing import Optional -import cv2 -from PIL import Image, UnidentifiedImageError - from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class PNGConverter(AbstractConverter): @@ -16,12 +10,18 @@ class PNGConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.image_like_format, converted_mimes=converted_mimes.image_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the image-like documents into files with .png extension. 
""" + import os + import cv2 + from PIL import Image, UnidentifiedImageError + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, extension = splitext_(file_name) converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.png") diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py index 3eef1f61..c6eef47d 100644 --- a/dedoc/converters/concrete_converters/pptx_converter.py +++ b/dedoc/converters/concrete_converters/pptx_converter.py @@ -1,9 +1,6 @@ -import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class PptxConverter(AbstractConverter): @@ -12,12 +9,16 @@ class PptxConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.pptx_like_format, converted_mimes=converted_mimes.pptx_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pptx-like documents into files with .pptx extension using the soffice application. 
""" + import os + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", file_dir, file_path] diff --git a/dedoc/converters/concrete_converters/txt_converter.py b/dedoc/converters/concrete_converters/txt_converter.py index f0b71147..c8cfaf6c 100644 --- a/dedoc/converters/concrete_converters/txt_converter.py +++ b/dedoc/converters/concrete_converters/txt_converter.py @@ -1,10 +1,6 @@ -import os -import shutil from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_extensions, converted_mimes -from dedoc.utils.utils import splitext_ class TxtConverter(AbstractConverter): @@ -13,12 +9,17 @@ class TxtConverter(AbstractConverter): Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import converted_extensions, converted_mimes super().__init__(config=config, converted_extensions=converted_extensions.txt_like_format, converted_mimes=converted_mimes.txt_like_format) def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the txt-like documents into files with .txt extension by renaming it. 
""" + import os + import shutil + from dedoc.utils.utils import splitext_ + file_dir, file_name = os.path.split(file_path) name_wo_ext, _ = splitext_(file_name) converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.txt") diff --git a/dedoc/converters/converter_composition.py b/dedoc/converters/converter_composition.py index d6d24f91..d9f7533b 100644 --- a/dedoc/converters/converter_composition.py +++ b/dedoc/converters/converter_composition.py @@ -1,9 +1,6 @@ -import os -from stat import S_IREAD, S_IRGRP, S_IROTH from typing import List, Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.utils.utils import get_mime_extension class ConverterComposition: @@ -30,6 +27,10 @@ def convert(self, file_path: str, parameters: Optional[dict] = None, extension: :param mime: MIME type of file :return: path of converted file if conversion was executed else path of the original file """ + import os + from stat import S_IREAD, S_IRGRP, S_IROTH + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, extension=extension, mime=mime) converted_file_path = file_path diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index f544e9e7..c2ff7cac 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -1,7 +1,5 @@ from typing import List -import numpy as np - from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_with_meta import LineWithMeta @@ -40,9 +38,11 @@ def get_annotations(self) -> List[Annotation]: return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations @staticmethod - def create_from_cell(cell: "Cell") -> "CellWithMeta": # noqa + def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta": return CellWithMeta(lines=cell.lines, colspan=cell.colspan, 
rowspan=cell.rowspan, invisible=cell.invisible) def to_api_schema(self) -> ApiCellWithMeta: + import numpy as np + lines = [line.to_api_schema() for line in self.lines] return ApiCellWithMeta(lines=lines, colspan=int(np.int8(self.colspan)), rowspan=int(np.int8(self.rowspan)), invisible=self.invisible) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index bd453d24..303c9eb0 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -1,4 +1,3 @@ -import json from typing import Tuple from dedocutils.data_structures import BBox @@ -20,6 +19,8 @@ def __init__(self, start: int, end: int, value: BBox, page_width: int, page_heig :param page_width: width of original image with this bbox :param page_height: height of original image with this bbox """ + import json + if not isinstance(value, BBox): raise ValueError("the value of bounding box annotation should be instance of BBox") @@ -30,6 +31,8 @@ def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]: """ Returns: BBox object, page_width, page_height """ + import json + bbox_dict = json.loads(value) bbox = BBox(x_top_left=int(bbox_dict["x_top_left"] * bbox_dict["page_width"]), y_top_left=int(bbox_dict["y_top_left"] * bbox_dict["page_height"]), diff --git a/dedoc/data_structures/concrete_annotations/color_annotation.py b/dedoc/data_structures/concrete_annotations/color_annotation.py index 4b6983d6..8ddd2479 100644 --- a/dedoc/data_structures/concrete_annotations/color_annotation.py +++ b/dedoc/data_structures/concrete_annotations/color_annotation.py @@ -1,6 +1,3 @@ -import json -from collections import OrderedDict - from dedoc.data_structures.annotation import Annotation @@ -18,6 +15,9 @@ def __init__(self, start: int, end: int, red: float, green: float, blue: float) :param green: mean value of the green color component in the pixels that are 
not white in the given bounding box :param blue: mean value of the blue color component in the pixels that are not white in the given bounding box """ + import json + from collections import OrderedDict + assert red >= 0 assert green >= 0 assert blue >= 0 diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index e93b2c16..ec51d143 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,5 +1,4 @@ -import uuid -from typing import Any, Dict, Union +from typing import Dict, Union from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -30,6 +29,8 @@ def __init__(self, :param access_time: time of the last access to the file in unixtime :param file_type: mime type of the file """ + import uuid + self.file_name = file_name self.temporary_file_name = temporary_file_name self.size = size @@ -41,7 +42,7 @@ def __init__(self, self.add_attribute(key, value) self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid - def add_attribute(self, key: str, value: Any) -> None: # noqa + def add_attribute(self, key: str, value: Union[str, int, float]) -> None: setattr(self, key, value) def to_api_schema(self) -> ApiDocumentMetadata: diff --git a/dedoc/data_structures/hierarchy_level.py b/dedoc/data_structures/hierarchy_level.py index 9df3304c..ab2ea053 100644 --- a/dedoc/data_structures/hierarchy_level.py +++ b/dedoc/data_structures/hierarchy_level.py @@ -1,8 +1,6 @@ from functools import total_ordering from typing import Optional -import numpy as np - @total_ordering class HierarchyLevel: @@ -89,6 +87,8 @@ def __str__(self) -> str: return f"HierarchyLevel(level_1={self.level_1}, level_2={self.level_2}, can_be_multiline={self.can_be_multiline}, line_type={self.line_type})" def __to_number(self, x: Optional[int]) -> int: + import numpy as np + return np.inf if x is None else x def 
is_raw_text(self) -> bool: diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 798a1712..a16a7dd2 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -1,13 +1,9 @@ -import re -from copy import deepcopy from typing import List, Optional, Sized, Union -from uuid import uuid1 from dedoc.api.schema.line_with_meta import LineWithMeta as ApiLineWithMeta from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.serializable import Serializable -from dedoc.utils.annotation_merger import AnnotationMerger class LineWithMeta(Sized, Serializable): @@ -24,6 +20,7 @@ def __init__(self, line: str, metadata: Optional[LineMetadata] = None, annotatio :param annotations: metadata that refers to some part of the text, for example, font size, font type, etc. :param uid: unique identifier of the line """ + from uuid import uuid1 self._line = line self._metadata = LineMetadata(page_id=0, line_id=None) if metadata is None else metadata @@ -43,6 +40,8 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta": :param delimiter: delimiter to insert between lines :return: merged line """ + from copy import deepcopy + if len(lines) == 0: return LineWithMeta("") @@ -65,6 +64,8 @@ def split(self, sep: str) -> List["LineWithMeta"]: :param sep: separator for splitting :return: list of split lines """ + import re + if not sep: raise ValueError("empty separator") borders = set() @@ -140,6 +141,8 @@ def __repr__(self) -> str: f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})") def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta": + from dedoc.utils.annotation_merger import AnnotationMerger + assert isinstance(other, (LineWithMeta, str)) if len(other) == 0: return self diff --git 
a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 9b671e04..862b87d0 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -1,6 +1,5 @@ from typing import List, Optional -import dedoc from dedoc.api.schema.parsed_document import ParsedDocument as ApiParsedDocument from dedoc.data_structures.document_content import DocumentContent from dedoc.data_structures.document_metadata import DocumentMetadata @@ -36,7 +35,9 @@ def set_metadata(self, metadata: DocumentMetadata) -> None: self.metadata = metadata def to_api_schema(self) -> ApiParsedDocument: + import dedoc.version + content = self.content.to_api_schema() metadata = self.metadata.to_api_schema() attachments = [attachment.to_api_schema() for attachment in self.attachments] if self.attachments is not None else [] - return ApiParsedDocument(content=content, metadata=metadata, version=dedoc.__version__, warnings=self.warnings, attachments=attachments) + return ApiParsedDocument(content=content, metadata=metadata, version=dedoc.version.__version__, warnings=self.warnings, attachments=attachments) diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index fc934d9a..e85c747e 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -1,4 +1,3 @@ -import uuid from typing import Optional from dedoc.api.schema.table_metadata import TableMetadata as ApiTableMetadata @@ -16,6 +15,8 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_an :param rotated_angle: value of the rotation angle by which the table was rotated during recognition :param title: table's title """ + import uuid + self.page_id = page_id self.uid = str(uuid.uuid4()) if not uid else uid self.rotated_angle = rotated_angle diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 9d8ba676..6cde3554 100644 --- 
a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -6,7 +6,6 @@ from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.serializable import Serializable -from dedoc.utils.annotation_merger import AnnotationMerger class TreeNode(Serializable): @@ -52,20 +51,18 @@ def create(lines: List[LineWithMeta] = None) -> "TreeNode": """ page_id = 0 if len(lines) == 0 else min((line.metadata.page_id for line in lines)) line_id = 0 if len(lines) == 0 else min((line.metadata.line_id for line in lines)) + metadata = LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root()) texts = (line.line for line in lines) annotations = [] text_length = 0 for line in lines: annotations.extend(TreeNode.__shift_annotations(line=line, text_length=text_length)) + TreeNode.__add_additional_page_id(start=text_length, metadata=metadata, other_line=line) + text_length += len(line.line) text = "".join(texts) - return TreeNode("0", - text, - annotations=annotations, - metadata=LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root()), - subparagraphs=[], - parent=None) + return TreeNode("0", text, annotations=annotations, metadata=metadata, subparagraphs=[], parent=None) def add_child(self, line: LineWithMeta) -> "TreeNode": """ @@ -94,6 +91,7 @@ def add_text(self, line: LineWithMeta) -> None: text_length = len(self.text) new_annotations = self.__shift_annotations(line, text_length) + self.__add_additional_page_id(start=len(self.text), metadata=self.metadata, other_line=line) self.text += line.line self.annotations.extend(new_annotations) @@ -115,6 +113,8 @@ def get_root(self) -> "TreeNode": return node def merge_annotations(self) -> None: + from dedoc.utils.annotation_merger import AnnotationMerger + root = self.get_root() stack = [root] merger = AnnotationMerger() @@ -123,3 +123,30 @@ def 
merge_annotations(self) -> None: node.annotations = merger.merge_annotations(node.annotations, node.text) for sub_node in node.subparagraphs: stack.append(sub_node) + + @staticmethod + def __add_additional_page_id(start: int, metadata: LineMetadata, other_line: LineWithMeta) -> None: + """ + Adds additional page_id metadata for multi-page nodes. + + If node is located on several pages, its metadata will contain "additional_page_id" attribute with list of dicts: + { + start: start index of the text on the next page, + end: end index (not included), + page_id: page id, where this textual part (node_text[start:end]) is located + } + """ + if metadata.page_id == other_line.metadata.page_id: + return + + if hasattr(metadata, "additional_page_ids"): + last_page_id = metadata.additional_page_ids[-1]["page_id"] + if last_page_id == other_line.metadata.page_id: + metadata.additional_page_ids[-1]["end"] = start + len(other_line.line) + return + + additional_page_id = {"start": start, "end": start + len(other_line.line), "page_id": other_line.metadata.page_id} + if hasattr(metadata, "additional_page_ids"): + metadata.additional_page_ids.append(additional_page_id) + else: + metadata.additional_page_ids = [additional_page_id] diff --git a/dedoc/data_structures/unstructured_document.py b/dedoc/data_structures/unstructured_document.py index 29e82917..94197e2e 100644 --- a/dedoc/data_structures/unstructured_document.py +++ b/dedoc/data_structures/unstructured_document.py @@ -28,3 +28,6 @@ def __init__(self, self.attachments = attachments self.warnings = warnings if warnings else [] self.metadata = metadata if metadata is not None else {} + + def get_text(self) -> str: + return LineWithMeta.join(self.lines).line diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index 668bc07f..62b7302e 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -1,20 +1,11 @@ -import logging -import os.path -import shutil -import tempfile from typing import Dict, Optional, Tuple 
from dedoc.api.api_args import QueryParameters from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.common.exceptions.dedoc_error import DedocError -from dedoc.config import get_config -from dedoc.data_structures import ParsedDocument, UnstructuredDocument -from dedoc.extensions import mime2extension -from dedoc.manager_config import get_manager_config -from dedoc.metadata_extractors import BaseMetadataExtractor -from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta -from dedoc.utils.utils import get_file_mime_by_content, get_mime_extension, get_unique_name +from dedoc.data_structures.parsed_document import ParsedDocument +from dedoc.data_structures.unstructured_document import UnstructuredDocument class DedocManager: @@ -42,6 +33,11 @@ def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] - document_metadata_extractor (:class:`~dedoc.metadata_extractors.MetadataExtractorComposition`) - attachments_handler (:class:`~dedoc.attachments_handler.AttachmentsHandler`) """ + import logging + + from dedoc.config import get_config + from dedoc.manager_config import get_manager_config + self.config = get_config() if config is None else config self.logger = self.config.get("logger", logging.getLogger()) manager_config = get_manager_config(self.config) if manager_config is None else manager_config @@ -69,12 +65,16 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> :param parameters: any parameters, specify how to parse file, see :ref:`parameters_description` for more details :return: parsed document """ + import os.path + parameters = self.__init_parameters(file_path, parameters) self.logger.info(f"Get file {os.path.basename(file_path)} with parameters {parameters}") try: return self.__parse_no_error_handling(file_path=file_path, parameters=parameters) except DedocError as e: + from 
dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + file_dir, file_name = os.path.split(file_path) e.filename = file_name e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name) @@ -88,6 +88,11 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) :param parameters: any parameters, specify how to parse file :return: parsed document """ + import os.path + import shutil + import tempfile + from dedoc.utils.utils import get_unique_name + if not os.path.isfile(path=file_path): raise FileNotFoundError(file_path) self.logger.info(f"Start handle {file_path}") @@ -124,6 +129,8 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) return parsed_document def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict: + import os.path + parameters = {} if parameters is None else parameters result_parameters = {} @@ -136,6 +143,10 @@ def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict: return result_parameters def __read_with_mime_auto_detection(self, file_path: str, file_name: str, parameters: Optional[dict]) -> Tuple[str, UnstructuredDocument]: + import os.path + from dedoc.extensions import mime2extension + from dedoc.utils.utils import get_file_mime_by_content, get_mime_extension + # firstly, try to read file using its original extension mime, extension = get_mime_extension(file_path=file_path) try: @@ -156,6 +167,9 @@ def __read_with_mime_auto_detection(self, file_path: str, file_name: str, parame return converted_file_path, document def __parse_file(self, file_path: str, file_name: str, parameters: Optional[dict], extension: str, mime: str) -> Tuple[str, UnstructuredDocument]: + import os.path + from dedoc.utils.utils import get_mime_extension + converted_file_path = self.converter.convert(file_path, parameters=parameters, mime=mime, 
extension=extension) if converted_file_path != file_path: mime, extension = get_mime_extension(file_path=converted_file_path) @@ -168,6 +182,10 @@ def __parse_file(self, file_path: str, file_name: str, parameters: Optional[dict return converted_file_path, unstructured_document def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None: + import os.path + import shutil + from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta + self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}') save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path)) shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path))) diff --git a/dedoc/download_models.py b/dedoc/download_models.py index b520a7df..7fa611bd 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -1,10 +1,4 @@ """Downloading models in advance inside the docker container.""" -import os -import shutil - -from huggingface_hub import hf_hub_download - -from dedoc.config import get_config """ These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc. 
@@ -21,12 +15,18 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str) -> None: + import os + import shutil + from huggingface_hub import hf_hub_download + os.makedirs(out_dir, exist_ok=True) path = os.path.realpath(hf_hub_download(repo_id=f"dedoc/{repo_name}", filename=hub_name, revision=model_hash_dict[repo_name])) shutil.move(path, os.path.join(out_dir, out_name)) def download(resources_path: str) -> None: + import os + download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz") download_from_hub(out_dir=resources_path, @@ -53,5 +53,7 @@ def download(resources_path: str) -> None: if __name__ == "__main__": + from dedoc.config import get_config + resources_path = get_config()["resources_path"] download(resources_path) diff --git a/dedoc/extensions.py b/dedoc/extensions.py index 7037eedb..069642e0 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -1,7 +1,6 @@ from collections import namedtuple from typing import List -from dedoc.utils.utils import get_extensions_by_mimes Extensions = namedtuple("Parts", [ "excel_like_format", @@ -125,6 +124,8 @@ def get_image_extensions() -> List[str]: + from dedoc.utils.utils import get_extensions_by_mimes + image_extensions = get_extensions_by_mimes(converted_mimes.image_like_format) image_extensions.extend(get_extensions_by_mimes(recognized_mimes.image_like_format)) image_extensions.extend(converted_extensions.image_like_format) diff --git a/dedoc/main.py b/dedoc/main.py index 9c97125c..1d650a1a 100644 --- a/dedoc/main.py +++ b/dedoc/main.py @@ -1,7 +1,7 @@ -from dedoc.api.dedoc_api import get_api, run_api # noqa from dedoc.config import Configuration if __name__ == "__main__": + from dedoc.api.dedoc_api import get_api, run_api Configuration.get_instance().get_config() run_api(get_api()) diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py 
b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 5fca1063..dfcd451d 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -1,10 +1,6 @@ -import logging -import os from abc import ABC, abstractmethod from typing import Optional, Set, Tuple -from dedoc.utils.utils import get_mime_extension - class AbstractMetadataExtractor(ABC): """ @@ -16,6 +12,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param recognized_mimes: set of supported MIME types of files """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions @@ -41,6 +39,9 @@ def can_extract(self, :param extension: file extension, for example .doc or .pdf :return: True if the extractor can handle the given file and False otherwise """ + import os + from dedoc.utils.utils import get_mime_extension + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) converted_file_path = os.path.join(file_dir, converted_filename) mime, extension = get_mime_extension(file_path=converted_file_path, mime=mime, extension=extension) @@ -66,6 +67,8 @@ def extract(self, pass def _get_names(self, file_path: str, converted_filename: Optional[str], original_filename: Optional[str]) -> Tuple[str, str, str, str]: + import os + file_dir, file_name = os.path.split(file_path) converted_filename = file_name if converted_filename is None else converted_filename original_filename = file_name if original_filename is None else original_filename diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py 
b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index 7986aaf2..c75681df 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -1,9 +1,6 @@ -import os -from base64 import b64encode from typing import Optional from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor -from dedoc.utils.utils import get_file_mime_type class BaseMetadataExtractor(AbstractMetadataExtractor): @@ -42,6 +39,9 @@ def extract(self, Gets the basic meta-information about the file. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + from base64 import b64encode + import os + parameters = {} if parameters is None else parameters file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) meta_info = self._get_base_meta_information(file_dir, file_name, original_filename) @@ -54,6 +54,9 @@ def extract(self, @staticmethod def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict: + import os + from dedoc.utils.utils import get_file_mime_type + (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) = os.stat(os.path.join(directory, filename)) meta = { "file_name": name_actual, diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 6234cd67..dc0bd949 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -1,11 +1,6 @@ -import os from datetime import datetime from typing import Optional -import docx -from docx.opc.exceptions import PackageNotFoundError - 
-from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -26,6 +21,8 @@ class DocxMetadataExtractor(AbstractMetadataExtractor): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) self.base_extractor = BaseMetadataExtractor(config=config) @@ -38,6 +35,8 @@ def extract(self, Add the predefined list of metadata for the docx documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + import os + parameters = {} if parameters is None else parameters file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) @@ -53,6 +52,10 @@ def __convert_date(self, date: Optional[datetime]) -> Optional[int]: return None if date is None else int(date.timestamp()) def _get_docx_fields(self, file_path: str) -> dict: + import docx + from docx.opc.exceptions import PackageNotFoundError + import os + assert os.path.isfile(file_path) try: doc = docx.Document(file_path) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index bbf2e3a1..9ec09bb1 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -1,12 +1,5 @@ -import math -import os from typing import Optional, Union -import piexif -from PIL import ExifTags, Image -from 
dateutil import parser - -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -31,6 +24,7 @@ class ImageMetadataExtractor(AbstractMetadataExtractor): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.image_like_format, recognized_mimes=recognized_mimes.image_like_format) self.keys = { "DateTime": ("date_time", self.__parse_date), @@ -60,6 +54,8 @@ def extract(self, Add the predefined list of metadata for images. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + import os + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) base_fields = self.base_extractor.extract( file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters @@ -85,6 +81,8 @@ def __parse_int(self, exif: Union[str, bytes]) -> Optional[int]: return None def __parse_date(self, date_str: Union[str, bytes]) -> Optional[int]: + from dateutil import parser + try: date_str = self.__encode_exif(date_str) date = parser.parse(date_str.replace(": ", ":")) @@ -93,6 +91,8 @@ def __parse_date(self, date_str: Union[str, bytes]) -> Optional[int]: return None def __parse_float(self, exif: Union[str, bytes]) -> Optional[float]: + import math + try: exif = self.__encode_exif(exif) result = float(exif) @@ -101,6 +101,9 @@ def __parse_float(self, exif: Union[str, bytes]) -> Optional[float]: return None def _get_exif(self, path: str) -> dict: + from PIL import ExifTags, Image + import piexif + try: image = 
Image.open(path) exif_dict = piexif.load(image.info["exif"]).get("Exif", {}) if "exif" in image.info else {} @@ -109,6 +112,6 @@ def _get_exif(self, path: str) -> dict: encoded_dict = {k: v for k, v in encoded_dict.items() if k is not None if v is not None} image.close() return encoded_dict - except Exception as e: # noqa + except Exception as e: self.logger.debug(e) return {"broken_image": True} diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index f52608c1..cd1eec39 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -1,5 +1,3 @@ -import os -import pickle from typing import Optional from dedoc.common.exceptions.bad_file_error import BadFileFormatError @@ -40,6 +38,9 @@ def extract(self, Add the predefined list of metadata for the .note.pickle documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" + import os + import pickle + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) try: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index d3217a7b..f8b8e65a 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -1,13 +1,7 @@ -import os from typing import Optional -from PyPDF2 import PdfFileReader -from PyPDF2.utils import PdfReadError - -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor -from dedoc.utils.utils import convert_datetime class PdfMetadataExtractor(AbstractMetadataExtractor): @@ -27,6 +21,7 @@ class PdfMetadataExtractor(AbstractMetadataExtractor): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) self.base_extractor = BaseMetadataExtractor(config=config) self.keys = { @@ -52,6 +47,8 @@ def extract(self, Add the predefined list of metadata for the pdf documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" + import os + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) base_fields = self.base_extractor.extract( file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters @@ -61,6 +58,9 @@ def extract(self, return result def _get_pdf_info(self, path: str) -> dict: + from PyPDF2 import PdfFileReader + from PyPDF2.utils import PdfReadError + try: with open(path, "rb") as file: document = PdfFileReader(file) @@ -76,6 +76,8 @@ def _get_pdf_info(self, path: str) -> dict: return {"broken_pdf": True} def __prettify_metadata(self, document_info: dict) -> dict: + from dedoc.utils.utils import convert_datetime + result = {} for key, value in document_info.items(): if isinstance(value, str) and len(value) > 0: @@ -84,7 +86,7 @@ def __prettify_metadata(self, document_info: dict) -> dict: elif key in self.keys_date: try: date = convert_datetime(value) - except: # noqa + except Exception: date = None if date is not None: result[self.keys_date[key]] = date diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index f76ea6a3..165cd538 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -1,4 +1,3 @@ -import os.path from typing import List, Optional from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor @@ -37,6 +36,8 @@ def extract(self, :param mime: MIME type of file :return: dict with metadata information about the document """ + import os.path + for extractor in self.extractors: if extractor.can_extract( file_path=file_path, diff --git a/dedoc/readers/__init__.py b/dedoc/readers/__init__.py index 2d96fdae..357ab534 100644 --- a/dedoc/readers/__init__.py +++ b/dedoc/readers/__init__.py @@ -19,5 +19,5 @@ from .txt_reader.raw_text_reader import 
RawTextReader __all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader', - 'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader', - 'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', 'PdfAutoReader'] + 'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader', 'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', + 'PdfAutoReader'] diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index 1598c403..af9554ec 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -1,20 +1,9 @@ -import os -import tarfile -import uuid -import zipfile -import zlib from typing import IO, Iterator, List, Optional -import py7zlib -import rarfile - from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments -from dedoc.utils.utils import get_file_mime_type, save_data_to_unique_file class ArchiveReader(BaseReader): @@ -23,6 +12,7 @@ class ArchiveReader(BaseReader): Documents with the following extensions can be parsed: .zip, .tar, .tar.gz, .rar, .7z. 
""" def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.archive_like_format, recognized_mimes=recognized_mimes.archive_like_format) def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -30,6 +20,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The method return empty content of archive, all content will be placed inside attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments + parameters = {} if parameters is None else parameters with_attachments = get_param_with_attachments(parameters) @@ -42,6 +34,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=[], tables=[], attachments=attachments) def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool) -> List[AttachedFile]: + import rarfile + import tarfile + import zipfile + from dedoc.utils.utils import get_file_mime_type + mime = get_file_mime_type(path) if zipfile.is_zipfile(path) and mime == "application/zip": return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis)) @@ -55,6 +52,9 @@ def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool raise BadFileFormatError(f"bad archive {path}") def __read_zip_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import zipfile + import zlib + try: with zipfile.ZipFile(path, "r") as arch_file: names = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -66,6 +66,8 @@ def __read_zip_archive(self, path: 
str, tmp_dir: str, need_content_analysis: boo raise BadFileFormatError(f"Can't read file {path} ({e})") def __read_tar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import tarfile + with tarfile.open(path, "r") as arch_file: names = [member.name for member in arch_file.getmembers() if member.isfile()] for name in names: @@ -74,6 +76,8 @@ def __read_tar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo file.close() def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import rarfile + with rarfile.RarFile(path, "r") as arch_file: names = [item.filename for item in arch_file.infolist() if item.compress_size > 0] for name in names: @@ -81,6 +85,8 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis) def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: + import py7zlib + with open(path, "rb") as content: arch_file = py7zlib.Archive7z(content) names = arch_file.getnames() @@ -89,6 +95,10 @@ def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis) def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile: + import os + import uuid + from dedoc.utils.utils import save_data_to_unique_file + file_name = os.path.basename(file_name) binary_data = file.read() if isinstance(binary_data, str): diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py index c85a72d1..e1065fb8 100644 --- a/dedoc/readers/article_reader/article_reader.py +++ b/dedoc/readers/article_reader/article_reader.py @@ -1,23 +1,14 
@@ -import math -import os -import time -import uuid from typing import Dict, List, Optional, Tuple -import cv2 -import numpy as np -import requests -from bs4 import BeautifulSoup, Tag -from pdf2image import convert_from_path +from bs4 import Tag +from numpy import ndarray -from dedoc.data_structures import Annotation, AttachAnnotation, AttachedFile, CellWithMeta, HierarchyLevel, LineMetadata, Table, TableAnnotation, TableMetadata -from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation +from dedoc.data_structures.annotation import Annotation +from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_document_type, get_param_need_content_analysis, get_param_with_attachments class ArticleReader(BaseReader): @@ -26,7 +17,11 @@ class ArticleReader(BaseReader): """ def __init__(self, config: Optional[dict] = None) -> None: + import os + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) + grobid_url = os.environ.get("GROBID_URL", "") if grobid_url: self.grobid_url = grobid_url @@ -47,6 +42,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + import requests + from bs4 import BeautifulSoup + with open(file_path, "rb") as file: files = {"input": file} try: @@ -87,6 +85,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.parameter_utils import get_param_document_type + if get_param_document_type(parameters) != "article": return False @@ -97,6 +97,9 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, return super().can_read(file_path=file_path, mime=mime, extension=extension) def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None: + import time + import requests + if self.grobid_is_alive: return @@ -127,6 +130,9 @@ def __get_tag_by_hierarchy_path(self, source: Tag, hierarchy_path: List[str]) -> def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, paragraph_type: Optional[str] = None, annotations: Optional[List[Annotation]] = None, other_fields: Optional[Dict] = None) -> LineWithMeta: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + # TODO check on improve if other_fields is None: other_fields = {} @@ -214,6 +220,10 @@ def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]: return lines def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict, attachment2uid: dict) -> LineWithMeta: + from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation + from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + text = "" start = 0 annotations = [] @@ -258,6 +268,8 @@ def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict, attachment2uid return lines def 
__parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict, attachment2uid: dict) -> List[LineWithMeta]: + from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth + lines = [] number = section_tag.head.get("n") if section_tag.head else "" number = number + " " if number else "" @@ -299,6 +311,9 @@ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]: """ + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table_metadata import TableMetadata + tables = [] table2uid = {} @@ -349,6 +364,11 @@ def __parse_images(self, soup: Tag, file_path: str, parameters: Optional[dict]) Documentation: https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/ """ + import os + import uuid + import cv2 + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments + if not get_param_with_attachments(parameters): return [], {} @@ -379,12 +399,16 @@ def __parse_images(self, soup: Tag, file_path: str, parameters: Optional[dict]) return attachments, attachment2uid - def __get_image(self, figure_tag: Tag, file_path: str, page_sizes: List[Tuple[float, float]]) -> Optional[np.ndarray]: + def __get_image(self, figure_tag: Tag, file_path: str, page_sizes: List[Tuple[float, float]]) -> Optional[ndarray]: """ Crop the PDF page according to the figure's coordinates. Figure can consist of multiple sub-figures: we crop the union of all sub-figures. 
Example of the figure's coordinates: coords="3,151.56,211.52,312.23,7.89;3,136.68,115.84,338.92,75.24" """ + import math + import numpy as np + from pdf2image import convert_from_path + if figure_tag.graphic is None: return None diff --git a/dedoc/readers/base_reader.py b/dedoc/readers/base_reader.py index d4adff80..b351ae43 100644 --- a/dedoc/readers/base_reader.py +++ b/dedoc/readers/base_reader.py @@ -1,9 +1,7 @@ -import logging from abc import ABC, abstractmethod from typing import Optional, Set from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.utils.utils import get_mime_extension class BaseReader(ABC): @@ -22,6 +20,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf} :param recognized_mimes: set of supported MIME types of files """ + import logging + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions @@ -39,6 +39,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, :return: True if this reader can handle the file, False otherwise """ + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py index 73c2d1d2..4048bd7b 100644 --- a/dedoc/readers/csv_reader/csv_reader.py +++ b/dedoc/readers/csv_reader/csv_reader.py @@ -1,15 +1,7 @@ from typing import List, Optional, Tuple -import pandas as pd - -from dedoc.data_structures import LineMetadata, LineWithMeta -from dedoc.data_structures.cell_with_meta import CellWithMeta -from dedoc.data_structures.table 
import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_encoding class CSVReader(BaseReader): @@ -18,6 +10,7 @@ class CSVReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.csv_like_format, recognized_mimes=recognized_mimes.csv_like_format) self.default_separator = "," @@ -27,19 +20,26 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The lines and attachments remain empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + import pandas as pd + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.data_structures.line_with_meta import LineWithMeta + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table import Table + from dedoc.data_structures.table_metadata import TableMetadata + parameters = {} if parameters is None else parameters delimiter = parameters.get("delimiter") if delimiter is None: delimiter = "\t" if file_path.endswith(".tsv") else self.default_separator encoding, encoding_warning = self.__get_encoding(file_path, parameters) - df = pd.read_csv(file_path, sep=delimiter, header=None, encoding=encoding) + df = pd.read_csv(file_path, sep=delimiter, header=None, encoding=encoding, dtype="string", keep_default_na=False) table_metadata = TableMetadata(page_id=0) cells_with_meta = [] line_id = 0 for ind in df.index: row_lines = [] for cell in df.loc[ind]: - row_lines.append(CellWithMeta(lines=[LineWithMeta(line=str(cell), metadata=LineMetadata(page_id=0, 
line_id=line_id))])) + row_lines.append(CellWithMeta(lines=[LineWithMeta(line=cell, metadata=LineMetadata(page_id=0, line_id=line_id))])) line_id += 1 cells_with_meta.append(row_lines) @@ -49,6 +49,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=[], tables=tables, attachments=[], warnings=warnings) def __get_encoding(self, path: str, parameters: dict) -> Tuple[str, List[str]]: + from dedoc.utils.utils import get_encoding + if parameters.get("encoding"): return parameters["encoding"], [] else: diff --git a/dedoc/readers/docx_reader/data_structures/base_props.py b/dedoc/readers/docx_reader/data_structures/base_props.py index c439c3d0..f6154d1c 100644 --- a/dedoc/readers/docx_reader/data_structures/base_props.py +++ b/dedoc/readers/docx_reader/data_structures/base_props.py @@ -3,7 +3,7 @@ class BaseProperties: - def __init__(self, properties: Optional["BaseProperties"] = None) -> None: # noqa + def __init__(self, properties: Optional["BaseProperties"] = None) -> None: """ Contains style properties for paragraphs and runs. 
:param properties: Paragraph or Run for copying its properties diff --git a/dedoc/readers/docx_reader/data_structures/table.py b/dedoc/readers/docx_reader/data_structures/table.py index b86855cb..1f11fdac 100644 --- a/dedoc/readers/docx_reader/data_structures/table.py +++ b/dedoc/readers/docx_reader/data_structures/table.py @@ -3,8 +3,8 @@ from bs4 import Tag -from dedoc.data_structures import LineWithMeta from dedoc.data_structures.cell_with_meta import CellWithMeta +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index 76f6c8e5..fc136a14 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -1,13 +1,8 @@ from typing import List, Optional -from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor -from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument -from dedoc.utils.parameter_utils import get_param_with_attachments class DocxReader(BaseReader): @@ -17,6 +12,9 @@ class DocxReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, 
recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) self.attachment_extractor = DocxAttachmentsExtractor(config=self.config) @@ -26,6 +24,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument + from dedoc.utils.parameter_utils import get_param_with_attachments + with_attachments = get_param_with_attachments(parameters) attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] @@ -34,6 +35,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[]) def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + for i, line in enumerate(lines[1:]): if lines[i].metadata.tag_hierarchy_level != line.metadata.tag_hierarchy_level \ or lines[i].metadata.tag_hierarchy_level.line_type != HierarchyLevel.unknown \ diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 97c8d259..8d3ec876 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -1,24 +1,10 @@ -import email -import json -import mimetypes -import os -import re -import uuid -from email.header import decode_header from email.message import Message -from tempfile import NamedTemporaryFile from typing import List, Optional -from dedoc.data_structures.attached_file import AttachedFile -from 
dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis -from dedoc.utils.utils import get_mime_extension, get_unique_name, save_data_to_unique_file class EmailReader(BaseReader): @@ -27,6 +13,8 @@ class EmailReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.html_reader.html_reader import HtmlReader super().__init__(config=config, recognized_extensions=recognized_extensions.eml_like_format, recognized_mimes=recognized_mimes.eml_like_format) self.html_reader = HtmlReader(config=self.config) @@ -35,6 +23,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Check if the document extension or mime is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.utils import get_mime_extension mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) # this code differs from BaseReader because .eml and .mhtml files have the same mime type if extension: @@ -50,6 +39,14 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + import email + import json + import os + import uuid + from dedoc.data_structures.attached_file import AttachedFile + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + from dedoc.utils.utils import get_unique_name + parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) @@ -108,6 +105,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments) def __add_attachment(self, message: Message, attachments_dir: str, attachments: list, need_content_analysis: bool) -> None: + import mimetypes + import os + import uuid + from dedoc.data_structures.attached_file import AttachedFile + from dedoc.utils.utils import save_data_to_unique_file + content_type = message.get_content_type() payload = message.get_payload(decode=True) @@ -133,6 +136,8 @@ def __add_attachment(self, message: Message, attachments_dir: str, attachments: need_content_analysis=need_content_analysis)) def __add_content_from_html(self, message: Message, lines: list, tables: list, parameters: dict) -> None: + from tempfile import NamedTemporaryFile + payload = message.get_payload(decode=True) if payload is None: return @@ -153,6 +158,8 @@ def __add_content_from_html(self, message: Message, lines: list, tables: list, p file.close() def __add_text_content(self, message: Message, lines: list) -> None: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + payload = message.get_payload(decode=True) if payload is None: return @@ -168,11 +175,15 @@ def __add_text_content(self, message: Message, lines: list) -> None: annotations=[])) def __fix_filename(self, filename: str) -> str: + import re + filename = re.sub(r"[<>:\"/\\|?*]", "_", filename) filename = re.sub(r"\s+", " ", filename) return filename def __get_decoded(self, text: str) -> str: + from email.header import 
decode_header + part = [] for letter, encode in decode_header(text): if isinstance(letter, bytes): @@ -188,6 +199,8 @@ def __get_field(self, message: Message, key: str, line_metadata: LineMetadata) - return LineWithMeta(line=text, metadata=line_metadata) def __get_main_fields(self, message: Message) -> List[LineWithMeta]: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + lines = list() line_metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel(0, 0, False, "root"), page_id=0, line_id=0) lines.append(self.__get_field(message, "subject", line_metadata)) diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py index db3a104e..a68a92e5 100644 --- a/dedoc/readers/excel_reader/excel_reader.py +++ b/dedoc/readers/excel_reader/excel_reader.py @@ -1,20 +1,10 @@ from typing import Optional -import xlrd from xlrd.sheet import Sheet -from dedoc.attachments_extractors.concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor -from dedoc.data_structures import LineMetadata, LineWithMeta -from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.parameter_utils import get_param_with_attachments - -xlrd.xlsx.ensure_elementtree_imported(False, None) -xlrd.xlsx.Element_has_iter = True class ExcelReader(BaseReader): @@ -22,8 +12,13 @@ class ExcelReader(BaseReader): This class is used for parsing documents with .xlsx extension. Please use :class:`~dedoc.converters.ExcelConverter` for getting xlsx file from similar formats. 
""" + import xlrd + xlrd.xlsx.ensure_elementtree_imported(False, None) + xlrd.xlsx.Element_has_iter = True def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.excel_like_format, recognized_mimes=recognized_mimes.excel_like_format) self.attachment_extractor = ExcelAttachmentsExtractor(config=self.config) @@ -32,6 +27,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This method extracts tables and attachments from the document, `lines` attribute remains empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + import xlrd + from dedoc.utils.parameter_utils import get_param_with_attachments + with xlrd.open_workbook(file_path) as book: sheets_num = book.nsheets tables = [] @@ -45,6 +43,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=[], tables=tables, attachments=attachments, warnings=[]) def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table: + from dedoc.data_structures.line_with_meta import LineWithMeta + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.table_metadata import TableMetadata + n_rows = sheet.nrows n_cols = sheet.ncols res = [] diff --git a/dedoc/readers/html2pdf_reader/html2pdf_reader.py b/dedoc/readers/html2pdf_reader/html2pdf_reader.py index f18cbf16..b8f83ed1 100644 --- a/dedoc/readers/html2pdf_reader/html2pdf_reader.py +++ b/dedoc/readers/html2pdf_reader/html2pdf_reader.py @@ -1,28 +1,25 @@ -import os -import re -from copy import deepcopy -from tempfile import 
TemporaryDirectory from typing import Dict, Optional, Tuple -from uuid import uuid1 from bs4 import BeautifulSoup -from weasyprint import HTML -from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from dedoc.utils.utils import calculate_file_hash class Html2PdfReader(HtmlReader): def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader self.pdf_reader = PdfTxtlayerReader(config=self.config) def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + import os + from copy import deepcopy + from tempfile import TemporaryDirectory + from weasyprint import HTML + parameters = {} if parameters is None else parameters with TemporaryDirectory() as tmp_dir: modified_path, tables = self._modify_html(file_path, tmp_dir) @@ -36,6 +33,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return self._add_tables(document=unstructured_document, tables=tables) def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table]) -> UnstructuredDocument: + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + lines = [] tables_result = [] previous_line = None @@ -54,6 +53,8 @@ def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table]) return UnstructuredDocument(lines=lines, tables=tables_result, attachments=document.attachments) def _handle_tables(self, soup: BeautifulSoup, path_hash: str) -> dict: + from uuid import uuid1 + tables = {} for table_tag in soup.find_all("table"): table_uid = f"table_{uuid1()}" 
@@ -75,6 +76,8 @@ def _handle_super_elements(self, soup: BeautifulSoup) -> None: html-code: 1.1) lalala view: "1.1) lalala" """ + import re + supers = soup.find_all(["span", "p"], {"style": re.compile("vertical-align:super")}) for super_element in supers: @@ -86,6 +89,9 @@ def _handle_super_elements(self, soup: BeautifulSoup) -> None: super_element.decompose() def _modify_html(self, path: str, tmp_dir: str) -> Tuple[str, dict]: + import os + from dedoc.utils.utils import calculate_file_hash + with open(path, encoding="utf-8") as f: soup = BeautifulSoup(f.read(), "html.parser") diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index 5cd8b76a..11d815ba 100644 --- a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -1,22 +1,11 @@ -import hashlib -import string from typing import List, Optional, Tuple, Union from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag -from dedoc.data_structures.cell_with_meta import CellWithMeta -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing -from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser -from dedoc.readers.html_reader.html_tags import HtmlTags -from dedoc.utils.utils import calculate_file_hash class HtmlReader(BaseReader): @@ -25,6 +14,10 @@ class HtmlReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import 
recognized_extensions, recognized_mimes + from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing + from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser + super().__init__(config=config, recognized_extensions=recognized_extensions.html_like_format, recognized_mimes=recognized_mimes.html_like_format) self.postprocessor = HtmlLinePostprocessing() self.tag_annotation_parser = HtmlTagAnnotationParser() @@ -35,6 +28,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from dedoc.utils.utils import calculate_file_hash + parameters = {} if parameters is None else parameters with open(file_path, "rb") as f: soup = BeautifulSoup(f.read(), "html.parser") @@ -52,6 +47,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure def __handle_block(self, tag: Union[Tag], filepath_hash: str, handle_invisible_table: bool, table: Optional[bool] = False, uid: Optional[str] = "") -> List[LineWithMeta]: + import hashlib + from dedoc.readers.html_reader.html_tags import HtmlTags + tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest() assert isinstance(tag, (Tag, str)) if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table): @@ -80,6 +78,10 @@ def __handle_block(self, tag: Union[Tag], filepath_hash: str, handle_invisible_t return block_lines def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]: + import hashlib + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.readers.html_reader.html_tags import HtmlTags + text = self.__get_text(tag, table) if not text or text.isspace(): @@ 
-95,6 +97,8 @@ def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Opt def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False, uid: Optional[str] = "") -> List[LineWithMeta]: + import hashlib + tag_uid = hashlib.md5((filepath_hash + uid + str(block.name)).encode()).hexdigest() if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table): return [] @@ -108,6 +112,9 @@ def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_ta return lines def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_space: bool = True) -> List[LineWithMeta]: + import hashlib + from dedoc.data_structures.hierarchy_level import HierarchyLevel + if not block.strip() and ignore_space: return [] tag_uid = hashlib.md5((uid + block).encode()).hexdigest() @@ -116,6 +123,9 @@ def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_spa def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, filepath_hash: str = None, annotations: List = None) -> LineWithMeta: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + if annotations is None: annotations = [] @@ -126,6 +136,10 @@ def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str return LineWithMeta(line=line, metadata=metadata, annotations=annotations, uid=uid) def __get_li_header(self, list_type: str, index: int) -> LineWithMeta: + import string + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + end = ") " if list_type in ["a", "A"] else ". 
" if list_type == "": header = "" @@ -146,6 +160,9 @@ def __get_li_header(self, list_type: str, index: int) -> LineWithMeta: return header_line def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]: + import hashlib + from dedoc.readers.html_reader.html_tags import HtmlTags + tag_uid = hashlib.md5((uid + str(lst.name)).encode()).hexdigest() lines = [] list_type = lst.get("type", "1" if lst.name in HtmlTags.ordered_list else "") @@ -164,6 +181,8 @@ def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_t return lines def __handle_list_item(self, item: Tag, item_index: int, list_type: str, filepath_hash: str, uid: str, handle_invisible_table: bool) -> List[LineWithMeta]: + import hashlib + tag_uid = hashlib.md5((uid + str(item.name)).encode()).hexdigest() lines = [] header_line = self.__get_li_header(list_type=list_type, index=item_index) @@ -195,6 +214,8 @@ def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bo @param handle_invisible_table: is invisibly table should be handled as table @return: True if tag is a content tag False otherwise. 
""" + from dedoc.readers.html_reader.html_tags import HtmlTags + if tag.name in HtmlTags.service_tags: return False if tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table): @@ -202,6 +223,9 @@ def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bo return not isinstance(tag, Doctype) and not isinstance(tag, Comment) def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> List[LineWithMeta]: + import hashlib + from dedoc.data_structures.hierarchy_level import HierarchyLevel + result = [] rows = self._read_table(block, filepath_hash).cells for row in rows: @@ -213,6 +237,8 @@ def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> return result def __clone_cell(self, el: Tuple[Tag, NavigableString]) -> Tuple[Tag, NavigableString]: + from dedoc.readers.html_reader.html_tags import HtmlTags + if isinstance(el, NavigableString): return type(el)(el) @@ -228,6 +254,8 @@ def __clone_cell(self, el: Tuple[Tag, NavigableString]) -> Tuple[Tag, NavigableS return copy def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None: + from dedoc.readers.html_reader.html_tags import HtmlTags + for row_index, row in enumerate(table.find_all(HtmlTags.table_rows)): for cell_index, cell in enumerate(row.find_all(HtmlTags.table_cells)): cell_rowspan = int(cell.attrs.get("rowspan", 1)) @@ -239,6 +267,8 @@ def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None: table_list[index][cell_index:cell_index] = [cell_copy] * cell_colspan def __fix_table(self, table: Tag) -> List[List[Tag]]: + from dedoc.readers.html_reader.html_tags import HtmlTags + table_list = [] # create table list @@ -253,6 +283,9 @@ def __fix_table(self, table: Tag) -> List[List[Tag]]: return table_list def _read_table(self, table: Tag, filepath_hash: str) -> Table: + from dedoc.data_structures.cell_with_meta import CellWithMeta + from 
dedoc.data_structures.table_metadata import TableMetadata + cells_with_meta = [] fixed_table = self.__fix_table(table) diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py index b83bb7cb..4afb768d 100644 --- a/dedoc/readers/json_reader/json_reader.py +++ b/dedoc/readers/json_reader/json_reader.py @@ -1,16 +1,9 @@ -from json import JSONDecodeError from typing import Any, List, Optional -import ujson as json - -from dedoc.attachments_extractors.concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.common.exceptions.bad_parameters_error import BadParametersError -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader @@ -20,6 +13,9 @@ class JsonReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format) self.attachment_extractor = JsonAttachmentsExtractor(config=self.config) @@ -31,6 +27,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The dictionaries are processed by creating key line with type `key` and value line as a child. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + from json import JSONDecodeError + import ujson as json + from dedoc.data_structures.hierarchy_level import HierarchyLevel + parameters = {} if parameters is None else parameters with open(file_path) as file: try: @@ -84,6 +84,8 @@ def __exclude_key(self, json_data: dict, keys: List[str]) -> None: del data[key] def __handle_list(self, depth: int, element: list, result: list, stack: list) -> None: + from dedoc.data_structures.hierarchy_level import HierarchyLevel + for _ in range(len(element)): sub_element = element.pop(0) line = self.__handle_one_element(depth=depth, value=sub_element, line_type=HierarchyLevel.list_item, line_type_meta=HierarchyLevel.list_item) @@ -106,6 +108,9 @@ def __handle_dict(self, depth: int, element: dict, result: list, stack: list) -> break def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type_meta: str) -> LineWithMeta: # noqa + from dedoc.data_structures.hierarchy_level import HierarchyLevel + from dedoc.data_structures.line_metadata import LineMetadata + if depth == 1 and line_type == "title": level1, level2 = 0, 0 else: diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index d96ed0ec..7073ee54 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -1,20 +1,8 @@ -import email -import gzip -import os -import uuid from typing import List, Optional, Tuple -from urllib.parse import urlparse - -from bs4 import BeautifulSoup from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.utils import supported_image_types -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis -from dedoc.utils.utils 
import check_filename_length, get_encoding, get_mime_extension, save_data_to_unique_file class MhtmlReader(BaseReader): @@ -23,6 +11,9 @@ class MhtmlReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.html_reader.html_reader import HtmlReader + super().__init__(config=config, recognized_extensions=recognized_extensions.mhtml_like_format, recognized_mimes=recognized_mimes.mhtml_like_format) self.html_reader = HtmlReader(config=self.config) @@ -31,6 +22,8 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.utils import get_mime_extension + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) # this code differs from BaseReader because .eml and .mhtml files have the same mime type if extension: @@ -43,6 +36,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) @@ -70,6 +65,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments) def __extract_files(self, path: str, save_dir: str) -> Tuple[List[str], List[str]]: + import email + import gzip + import os + from urllib.parse import urlparse + from dedoc.utils.utils import check_filename_length, save_data_to_unique_file + names_list = [] original_names_list = [] if path.endswith(".gz"): @@ -97,6 +98,10 @@ def __extract_files(self, path: str, save_dir: str) -> Tuple[List[str], List[str return names_list, original_names_list def __find_html(self, names_list: List[str]) -> List[str]: + from bs4 import BeautifulSoup + from dedoc.utils import supported_image_types + from dedoc.utils.utils import get_encoding + html_list = [] for file_name in names_list: extension = file_name.split(".")[-1] @@ -114,6 +119,10 @@ def __find_html(self, names_list: List[str]) -> List[str]: return html_list def __get_attachments(self, save_dir: str, tmp_names_list: List[str], original_names_list: List[str], need_content_analysis: bool) -> List[AttachedFile]: + import os + import uuid + from dedoc.utils import supported_image_types + attachments = [] for tmp_file_name, original_file_name in zip(tmp_names_list, original_names_list): *_, extension = tmp_file_name.rsplit(".", maxsplit=1) diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py index 902386d1..2f6f4617 100644 --- a/dedoc/readers/note_reader/note_reader.py +++ b/dedoc/readers/note_reader/note_reader.py @@ -1,9 +1,5 @@ -import os -import pickle from typing import Optional -from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from 
dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader @@ -21,6 +17,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The method return document content with all document's lines. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + import os + import pickle + from dedoc.common.exceptions.bad_file_error import BadFileFormatError + from dedoc.data_structures.line_with_meta import LineWithMeta try: with open(file_path, "rb") as infile: diff --git a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py index fd6c178b..1880701d 100644 --- a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py +++ b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py @@ -1,6 +1,6 @@ from typing import List, Optional -import numpy as np +from numpy import ndarray from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox @@ -8,7 +8,7 @@ class PageWithBBox: - def __init__(self, image: np.ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None, + def __init__(self, image: ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None, pdf_page_width: Optional[int] = None, pdf_page_height: Optional[int] = None) -> None: self.image = image self.bboxes = bboxes diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 5df58258..9308fded 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -1,9 +1,7 @@ -import uuid from typing import List, Optional from 
dedocutils.data_structures import BBox -from dedoc.data_structures import BBoxAnnotation from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_with_meta import LineWithMeta @@ -44,6 +42,7 @@ def __init__(self, rotated_angle: int = 0, uid: str = None, contour_coord: Optional[BBox] = None) -> None: + import uuid assert x_top_left <= x_bottom_right assert y_top_left <= y_bottom_right @@ -72,6 +71,8 @@ def get_annotations(self) -> List[Annotation]: return LineWithMeta.join(self.lines, delimiter="\n").annotations def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None: + from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + for i_line, _ in enumerate(self.lines): for i_ann, annotation in enumerate(self.lines[i_line].annotations): if annotation.name != "bounding box": diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py index bc6492d4..27d053a5 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/location.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/location.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from functools import total_ordering from typing import Any, Dict @@ -14,6 +13,8 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: self.rotated_angle = rotated_angle def to_dict(self) -> Dict[str, Any]: + from collections import OrderedDict + res = OrderedDict() res["page_number"] = self.page_number res["bbox"] = self.bbox.to_dict() # [x_begin, y_begin, width, height] diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index 4bc057df..ee8316d0 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -1,11 +1,10 @@ -import copy -from collections import OrderedDict from typing 
import Any, List -import numpy as np from dedocutils.data_structures import BBox -from dedoc.data_structures import CellWithMeta, Table, TableMetadata +from dedoc.data_structures.cell_with_meta import CellWithMeta +from dedoc.data_structures.table import Table +from dedoc.data_structures.table_metadata import TableMetadata from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -63,6 +62,9 @@ def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int: @staticmethod def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int): + import copy + import numpy as np + required_columns = [] for j in range(0, len(matrix_cells[0])): if matrix_cells[0][j].is_attribute_required: @@ -94,6 +96,8 @@ def uid(self) -> str: return self.name def to_dict(self) -> dict: + from collections import OrderedDict + data_text = ScanTable.get_cells_text(self.matrix_cells) res = OrderedDict() diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py index a212bc80..9cf865ce 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py @@ -1,14 +1,10 @@ -import logging from collections import namedtuple from typing import List, Optional -import cv2 -import numpy as np from dedocutils.data_structures import BBox +from numpy import ndarray -from dedoc.data_structures import LineWithMeta -from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor -from dedoc.utils.image_utils import crop_image_text +from dedoc.data_structures.line_with_meta import LineWithMeta """-------------------------------Таблица в виде дерева, полученная от OpenCV----------------------------------------""" ContourCell = namedtuple("ContourCell", ["id_con", "image"]) @@ -25,6 +21,8 @@ class TableTree(object): 
minimal_cell_avg_length_line = 10 def __init__(self, *, config: dict) -> None: + import logging + self.left = None self.right = None self.cell_box: Optional[BBox] = None # [x_begin, y_begin, width, height] @@ -36,7 +34,10 @@ def __init__(self, *, config: dict) -> None: self.config = config self.logger = config.get("logger", logging.getLogger()) - def set_text_into_tree(self, tree: "TableTree", src_image: np.ndarray, language: str = "rus", *, config: dict) -> None: + def set_text_into_tree(self, tree: "TableTree", src_image: ndarray, language: str = "rus", *, config: dict) -> None: + import logging + from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor + # get List of TableTree cur_depth = 0 begin_depth = 2 @@ -60,7 +61,9 @@ def set_text_into_tree(self, tree: "TableTree", src_image: np.ndarray, language: for lines, tree in zip(lines_with_meta, trees): tree.lines = lines - def set_crop_text_box(self, page_image: np.ndarray) -> None: + def set_crop_text_box(self, page_image: ndarray) -> None: + from dedoc.utils.image_utils import crop_image_text + cell_image = BBox.crop_image_by_box(page_image, self.cell_box) self.crop_text_box = crop_image_text(cell_image) # make crop_text_box'coordinates relative page_image @@ -69,6 +72,8 @@ def set_crop_text_box(self, page_image: np.ndarray) -> None: @staticmethod def parse_contours_to_tree(contours: List, hierarchy: List, *, config: dict) -> "TableTree": + import cv2 + table_tree = TableTree(config=config) table_tree.id_contours = 0 if len(contours) == 0: @@ -91,6 +96,8 @@ def print_tree(self, depth: int) -> None: ch.print_tree(depth + 1) def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "TableTree": + import cv2 + list_childs = [] for i, h in enumerate(hierarchy[0]): if h[3] == cur.id_contours: diff --git a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py index 8ac0d201..e7537a07 100644 --- 
a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py @@ -1,6 +1,4 @@ -from collections import OrderedDict from typing import List, Optional -from uuid import uuid1 from dedocutils.data_structures import BBox @@ -18,6 +16,8 @@ def __init__(self, uid: Optional[str] = None, label: Optional[str] = None, annotations: List[Annotation] = None) -> None: + from uuid import uuid1 + self.bbox = bbox self.page_num = page_num self.line_num = line_num @@ -37,6 +37,8 @@ def __repr__(self) -> str: return self.__str__() def to_dict(self) -> dict: + from collections import OrderedDict + res = OrderedDict() res["uid"] = self.uid res["_uid"] = self.uid diff --git a/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py index cb9dc6c7..88bb6460 100644 --- a/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/word_with_bbox.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - from dedocutils.data_structures import BBox @@ -16,6 +14,8 @@ def __repr__(self) -> str: return self.__str__() def to_dict(self) -> dict: + from collections import OrderedDict + res = OrderedDict() res["bbox"] = self.bbox.to_dict() res["text"] = self.text diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index 9cfe9dd8..523c96a1 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -1,18 +1,7 @@ -import copy -import os -from itertools import chain from typing import Optional -from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes 
from dedoc.readers.base_reader import BaseReader -from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_detector import TxtLayerDetector -from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer class PdfAutoReader(BaseReader): @@ -31,7 +20,14 @@ class PdfAutoReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_detector import TxtLayerDetector + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) + self.pdf_txtlayer_reader = PdfTxtlayerReader(config=self.config) self.pdf_tabby_reader = PdfTabbyReader(config=self.config) self.pdf_image_reader = PdfImageReader(config=self.config) @@ -46,6 +42,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, It is recommended to use `pdf_with_text_layer=auto_tabby` because it's faster and allows to get better results. You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) in ("auto", "auto_tabby") def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -70,12 +67,17 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return result def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument: + import os + self.logger.info(f"Assume document {os.path.basename(path)} has incorrect textual layer") warnings.append("Assume document has incorrect textual layer") result = self.pdf_image_reader.read(file_path=path, parameters=parameters_copy) return result def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: dict, path: str, warnings: list) -> UnstructuredDocument: + import os + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer + self.logger.info(f"Assume document {os.path.basename(path)} has a correct textual layer") warnings.append("Assume document has a correct textual layer") recognized_first_page = None @@ -99,6 +101,9 @@ def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: d return result def __preparing_first_page_parameters(self, parameters: dict) -> dict: + import copy + from dedoc.utils.parameter_utils import get_param_page_slice + first_page, last_page = get_param_page_slice(parameters) # calculate indexes for the first page parsing first_page_index = 0 if first_page is None else first_page @@ -111,6 +116,8 @@ def __preparing_first_page_parameters(self, parameters: dict) -> dict: return scan_parameters def __preparing_other_pages_parameters(self, parameters: dict) -> dict: + from dedoc.utils.parameter_utils import get_param_page_slice + first_page, last_page = get_param_page_slice(parameters) # parameters for reading pages from the second page 
first_page_index = 1 if first_page is None else first_page @@ -120,6 +127,10 @@ def __preparing_other_pages_parameters(self, parameters: dict) -> dict: return parameters def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocument) -> UnstructuredDocument: + from itertools import chain + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + from dedoc.data_structures.line_with_meta import LineWithMeta + tables = first.tables dropped_tables = set() for table in second.tables: diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py index 766ac250..21716598 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py @@ -7,7 +7,7 @@ from xgboost import XGBClassifier from dedoc.config import get_config -from dedoc.data_structures import LineWithMeta +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.download_models import download_from_hub from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_feature_extractor import TxtlayerFeatureExtractor from dedoc.utils.parameter_utils import get_param_gpu_available diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index cfff918d..0500698f 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -3,7 +3,7 @@ from copy import deepcopy from typing import List -from dedoc.data_structures import LineWithMeta +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py 
b/dedoc/readers/pdf_reader/pdf_base_reader.py index 9d46175a..839b5006 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -1,34 +1,16 @@ -import math -import os from abc import abstractmethod from collections import namedtuple from typing import Iterator, List, Optional, Set, Tuple -import cv2 -import numpy as np -from joblib import Parallel, delayed -from pdf2image import convert_from_path -from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError +from numpy import ndarray -import dedoc.utils.parameter_utils as param_utils -from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions as extensions, recognized_mimes as mimes from dedoc.readers.base_reader import BaseReader from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer -from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis -from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from 
dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import flatten, get_file_mime_by_content -from dedoc.utils.utils import get_file_mime_type, splitext_ ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ "orient_analysis_cells", @@ -55,6 +37,13 @@ class PdfBaseReader(BaseReader): def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Optional[Set[str]] = None, recognized_mimes: Optional[Set[str]] = None) -> None: super().__init__(config=config, recognized_extensions=recognized_extensions, recognized_mimes=recognized_mimes) + + from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer + from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker + from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor + self.config["n_jobs"] = self.config.get("n_jobs", 1) self.table_recognizer = TableRecognizer(config=self.config) self.metadata_extractor = LineMetadataExtractor(config=self.config) @@ -70,6 +59,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" + import dedoc.utils.parameter_utils as param_utils + parameters = {} if parameters is None else parameters first_page, last_page = param_utils.get_param_page_slice(parameters) @@ -101,6 +92,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]: + import math + from joblib import Parallel, delayed + from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis + from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.utils.pdf_utils import get_pdf_page_count + from dedoc.utils.utils import flatten + first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page last_page = math.inf if parameters.last_page is None else parameters.last_page images = self._get_images(path, first_page, last_page) @@ -142,7 +140,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata @abstractmethod - def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \ + def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \ -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: """ function parses image and returns: @@ -153,7 +151,13 @@ def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc """ pass - def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]: + def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[ndarray]: + import os + import cv2 + from dedoc.extensions import recognized_extensions as extensions, 
recognized_mimes as mimes + from dedoc.utils.utils import get_file_mime_by_content + from dedoc.utils.utils import get_file_mime_type, splitext_ + mime = get_file_mime_type(path) mime = get_file_mime_by_content(path) if mime not in self._recognized_mimes else mime if mime in mimes.pdf_like_format: @@ -166,10 +170,17 @@ def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.nd else: raise BadFileFormatError(f"Unsupported input format: {splitext_(path)[1]}") - def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]: + def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ndarray]: if page_from >= page_to: return + import math + import os + import numpy as np + from pdf2image import convert_from_path + from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError + from dedoc.utils.pdf_utils import get_pdf_page_count + try: page_count = get_pdf_page_count(path) page_count = math.inf if page_count is None else page_count @@ -179,7 +190,7 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ while (images is None or len(images) > 0) and left <= min(page_to, page_count): right = left + step # for convert_from_path function first_page should start from 1, last_page is included to the result - images = convert_from_path(path, first_page=left, last_page=right) # noqa + images = convert_from_path(path, first_page=left, last_page=right) # in logging we include both ends of the pages interval, numeration starts with 1 self.logger.info(f"Get page from {left} to {min(right, page_count)} of {page_count} file {os.path.basename(path)}") for image in images: @@ -190,24 +201,30 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ except (PDFPageCountError, PDFSyntaxError) as error: raise BadFileFormatError(f"Bad pdf file:\n file_name = {os.path.basename(path)} \n exception = {error.args}") - def _convert_to_gray(self, image: 
np.ndarray) -> np.ndarray: + def _convert_to_gray(self, image: ndarray) -> ndarray: + import cv2 + import numpy as np + gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY) gray_image = self._binarization(gray_image) return gray_image - def _binarization(self, gray_image: np.ndarray) -> np.ndarray: + def _binarization(self, gray_image: ndarray) -> ndarray: + import numpy as np + if gray_image.mean() < 220: # filter black and white image binary_mask = gray_image >= np.quantile(gray_image, 0.05) gray_image[binary_mask] = 255 return gray_image def eval_tables_by_batch(self, - batch: Iterator[np.ndarray], + batch: Iterator[ndarray], page_number_begin: int, language: str, orient_analysis_cells: bool = False, orient_cell_angle: int = 270, - table_type: str = "") -> Tuple[List[np.ndarray], List[ScanTable]]: + table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]: + from joblib import Parallel, delayed result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)( image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch)) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py index b3b0790b..245234c7 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py @@ -59,5 +59,9 @@ def __init__(self) -> None: def load_dataset(self, csv_path: str, image_path: str, batch_size: int = 4) -> DataLoader: trainset = DatasetImageOrient(csv_file=csv_path, root_dir=image_path, transform=self.transform) trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2) + self.amount = len(trainset) return trainloader + + def __len__(self) -> int: 
+ return self.amount diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index c9ef35a8..f381fc6b 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -6,7 +6,10 @@ import numpy as np from dedocutils.data_structures import BBox -from dedoc.data_structures import BBoxAnnotation, ConfidenceAnnotation, LineMetadata, LineWithMeta +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation +from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells @@ -62,7 +65,8 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], return self.__create_lines_with_meta(tree_nodes, originalbox_to_fastocrbox, page_image) - def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa + def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") \ + -> Tuple[OcrPage, List[BBox]]: # noqa concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes) if self.config.get("debug_mode", False): debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches") @@ -120,7 +124,8 @@ def __nodes2batch(self, tree_nodes: List["TableTree"]) -> Iterator[List["TableTr if 
len(batch) > 0: yield batch - def __create_lines_with_meta(self, tree_nodes: List["TableTree"], original_box_to_fast_ocr_box: dict, original_image: np.ndarray) -> List[List[LineWithMeta]]: # noqa + def __create_lines_with_meta(self, tree_nodes: List["TableTree"], original_box_to_fast_ocr_box: dict, original_image: np.ndarray) \ + -> List[List[LineWithMeta]]: # noqa nodes_lines = [] for node in tree_nodes: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index d685b8e2..0a9a54a3 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -1,21 +1,11 @@ -import os -from datetime import datetime from typing import List, Optional, Tuple -import cv2 -import numpy as np -from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector +from numpy import ndarray -from dedoc.config import get_config -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier -from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor -from dedoc.utils import supported_image_types -from dedoc.utils.parameter_utils import get_path_param class PdfImageReader(PdfBaseReader): @@ -41,6 +31,13 @@ class PdfImageReader(PdfBaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector + from 
dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier + from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor + from dedoc.config import get_config + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.utils import supported_image_types + supported_image_extensions = {ext for ext in supported_image_types if ext.startswith(".")} super().__init__( config=config, @@ -54,10 +51,15 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.ocr = OCRLineExtractor(config=self.config) def _process_one_page(self, - image: np.ndarray, + image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: + import os + from datetime import datetime + import cv2 + from dedoc.utils.parameter_utils import get_path_param + # --- Step 1: correct orientation and detect column count --- rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters) if self.config.get("debug_mode", False): @@ -89,7 +91,7 @@ def _process_one_page(self, lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page) return lines, tables, page.attachments, [angle] - def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, float]: + def _detect_column_count_and_orientation(self, image: ndarray, parameters: ParametersForParseDoc) -> Tuple[ndarray, bool, float]: """ Function : - detects the number of page columns @@ -97,6 +99,11 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa - rotates the page on detected angle Return: rotated_image and indicator if the page is one-column """ + import os + from datetime import datetime + import cv2 + from dedoc.utils.parameter_utils 
import get_path_param + columns, angle = None, None if parameters.is_one_column_document is None or parameters.document_orientation is None: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index 80055a9b..0b14f034 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -2,9 +2,10 @@ from typing import List import numpy as np -from dedocutils.data_structures import BBox +from dedocutils.data_structures.bbox import BBox -from dedoc.data_structures import ConfidenceAnnotation, LineWithMeta +from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 56c7d2ae..4eaed54c 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -1,43 +1,18 @@ -import json -import math -import os -import shutil -import subprocess -import uuid from typing import List, Optional, Tuple -import numpy as np from dedocutils.data_structures import BBox +from numpy import ndarray from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError -from dedoc.data_structures.cell_with_meta import CellWithMeta -from 
dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation -from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation -from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation -from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation -from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation -from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation -from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation -from dedoc.data_structures.concrete_annotations.style_annotation import StyleAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment -from dedoc.readers.pdf_reader.data_classes.tables.location import Location from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth -from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_page_slice, get_param_pdf_with_txt_layer, \ - 
get_param_with_attachments -from dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import calculate_file_hash, get_unique_name class PdfTabbyReader(PdfBaseReader): @@ -51,6 +26,9 @@ class PdfTabbyReader(PdfBaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + import os + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) self.tabby_java_version = "2.0.0" self.jar_name = "ispras_tbl_extr.jar" @@ -67,6 +45,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == "tabby" def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -77,6 +56,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" + from dedoc.utils.parameter_utils import get_param_with_attachments parameters = {} if parameters is None else parameters warnings = [] lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings) @@ -93,6 +73,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure def __extract(self, path: str, parameters: dict, warnings: list)\ -> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]: + import math + from dedoc.utils.pdf_utils import get_pdf_page_count + from dedoc.utils.utils import calculate_file_hash + from dedoc.utils.parameter_utils import get_param_page_slice, get_param_with_attachments + all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], [] with_attachments = get_param_with_attachments(parameters) document_metadata = None @@ -137,6 +122,12 @@ def __extract(self, path: str, parameters: dict, warnings: list)\ return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: + import uuid + from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.data_structures.table_metadata import TableMetadata + tables = [] tables_on_image = [] page_number = page["number"] @@ -178,6 +169,13 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: return tables, tables_on_image def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List[PdfImageAttachment]: + import os + import shutil + import uuid + from dedoc.readers.pdf_reader.data_classes.tables.location import Location + from dedoc.utils.utils import get_unique_name + from dedoc.utils.parameter_utils import 
get_param_attachments_dir, get_param_need_content_analysis + attachments_dir = get_param_attachments_dir(parameters, path) need_content_analysis = get_param_need_content_analysis(parameters) @@ -204,6 +202,17 @@ def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List return image_attachment_list def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]: + from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation + from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation + from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation + from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation + from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation + from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation + from dedoc.data_structures.concrete_annotations.style_annotation import StyleAnnotation + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.readers.pdf_reader.data_classes.tables.location import Location + lines = [] page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"]) prev_line = None @@ -260,6 +269,9 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith return lines def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel: + from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth + if line_type == HierarchyLevel.header: header_level = get_dotted_item_depth(line.line) header_level = header_level if 
header_level != -1 else 1 @@ -271,9 +283,12 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_ return HierarchyLevel(None, None, True, line_type) def __jar_path(self) -> str: + import os return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"]) def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes: + import subprocess + args = ["java"] + ["-jar", self.__jar_path(), "-i", path] if start_page is not None and end_page is not None: args += ["-sp", str(start_page), "-ep", str(end_page)] @@ -288,13 +303,15 @@ def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = Non raise TabbyPdfError(e.stderr.decode(encoding)) def __process_pdf(self, path: str, start_page: int = None, end_page: int = None) -> dict: + import json + output = self.__run(path=path, start_page=start_page, end_page=end_page) response = output.decode("UTF-8") document = json.loads(response) if response else {} return document def _process_one_page(self, - image: np.ndarray, + image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index abf9c38d..d7bb2b6a 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -1,15 +1,12 @@ from typing import List, Optional, Tuple -import numpy as np from dedocutils.data_structures import BBox +from numpy import ndarray -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from 
dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor -from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer class PdfTxtlayerReader(PdfBaseReader): @@ -21,7 +18,11 @@ class PdfTxtlayerReader(PdfBaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) + + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor self.extractor_layer = PdfminerExtractor(config=self.config) def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: @@ -33,10 +34,11 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == "true" def _process_one_page(self, - image: np.ndarray, + image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index 8d17f4d8..1d030885 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -146,7 +146,7 @@ def __extract_image(self, @staticmethod def __get_image(path: str, page_num: int) -> np.ndarray: - image_page = np.array(get_page_image(path=path, page_id=page_num)) # noqa + image_page = np.array(get_page_image(path=path, page_id=page_num)) image_page = np.array(image_page) if len(image_page.shape) == 2: image_page = cv2.cvtColor(image_page, cv2.COLOR_GRAY2BGR) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar index 2d22e7c2..1115c4d6 100644 Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py index 9b70b87c..0e562bdc 100644 --- a/dedoc/readers/pdf_reader/utils/line_object_linker.py +++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py @@ -57,7 +57,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i self.logger.warning(f"Unsupported 
page object type {page_object}") if self.config.get("debug_mode", False): raise Exception(f"Unsupported page object type {page_object}") - best_line.annotations.append(annotation) # noqa + best_line.annotations.append(annotation) return lines def _add_lines(self, all_objects: List[Union[LineWithLocation, ScanTable, PdfImageAttachment]], lines_key: str, objects_with_line_candidate: dict) -> None: diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py index 2dfcb952..129ac3a3 100644 --- a/dedoc/readers/pptx_reader/paragraph.py +++ b/dedoc/readers/pptx_reader/paragraph.py @@ -1,7 +1,16 @@ from bs4 import Tag -from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \ - StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation +from dedoc.data_structures.concrete_annotations.alignment_annotation import AlignmentAnnotation +from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation +from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation +from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation +from dedoc.data_structures.concrete_annotations.strike_annotation import StrikeAnnotation +from dedoc.data_structures.concrete_annotations.subscript_annotation import SubscriptAnnotation +from dedoc.data_structures.concrete_annotations.superscript_annotation import SuperscriptAnnotation +from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor from 
dedoc.utils.annotation_merger import AnnotationMerger diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index 2d68b850..b3f581e6 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -1,21 +1,12 @@ -import zipfile from typing import Dict, List, Optional from bs4 import BeautifulSoup, Tag -from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor -from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor -from dedoc.readers.pptx_reader.shape import PptxShape -from dedoc.readers.pptx_reader.table import PptxTable -from dedoc.utils.office_utils import get_bs_from_zip -from dedoc.utils.parameter_utils import get_param_with_attachments class PptxReader(BaseReader): @@ -25,6 +16,10 @@ class PptxReader(BaseReader): """ def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor + super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format) self.attachments_extractor = PptxAttachmentsExtractor(config=self.config) 
self.numbering_extractor = NumberingExtractor() @@ -34,6 +29,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure The method return document content with all document's lines, tables and attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.readers.pptx_reader.shape import PptxShape + from dedoc.utils.parameter_utils import get_param_with_attachments + with_attachments = get_param_with_attachments(parameters) attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} @@ -71,6 +70,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[]) def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]: + import zipfile + from dedoc.utils.office_utils import get_bs_from_zip + with zipfile.ZipFile(path) as document: xml_names = document.namelist() filtered_names = [file_name for file_name in xml_names if file_name.startswith(xml_prefix) and file_name.endswith(xml_postfix)] @@ -94,6 +96,10 @@ def __get_slide_images_rels(self, path: str) -> Dict[str, str]: return images_rels def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None: + from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation + from dedoc.data_structures.line_metadata import LineMetadata + from dedoc.readers.pptx_reader.table import PptxTable + table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table() if len(lines) == 0: @@ -102,6 +108,8 @@ def 
__add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: i tables.append(table) def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None: + from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation + try: image_name = images_rels[image_rel_id] image_uid = attachment_name2uid[image_name] diff --git a/dedoc/readers/pptx_reader/properties_extractor.py b/dedoc/readers/pptx_reader/properties_extractor.py index 67c0c919..213bafb8 100644 --- a/dedoc/readers/pptx_reader/properties_extractor.py +++ b/dedoc/readers/pptx_reader/properties_extractor.py @@ -1,11 +1,8 @@ -from copy import deepcopy from dataclasses import dataclass from typing import Dict, Optional from bs4 import Tag -from dedoc.utils.office_utils import get_bs_from_zip - @dataclass class Properties: @@ -42,6 +39,8 @@ def get_properties(self, xml: Tag, level: int, properties: Optional[Properties]