Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new version 2.3.1 #509

Merged
merged 7 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.3
2.3.1
2 changes: 1 addition & 1 deletion dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ <h4>Tables handling </h4>

<div class="parameters">
<h4>PDF handling</h4>
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
<br>
<p>
<label>
Expand Down
6 changes: 3 additions & 3 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta
from dedoc.data_structures.annotation import Annotation
Expand All @@ -20,14 +20,14 @@ class CellWithMeta(Serializable):
:vartype rowspan: int
:vartype invisible: bool
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
def __init__(self, lines: Optional[List[LineWithMeta]], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
:param lines: textual lines of the cell
:param colspan: number of columns to span like in HTML format
:param rowspan: number of rows to span like in HTML format
:param invisible: indicator for displaying or hiding cell text
"""
self.lines: List[LineWithMeta] = lines
self.lines: List[LineWithMeta] = [] if lines is None else lines
self.colspan: int = colspan
self.rowspan: int = rowspan
self.invisible: bool = invisible
Expand Down
7 changes: 5 additions & 2 deletions dedoc/readers/article_reader/article_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ def __init__(self, config: Optional[dict] = None) -> None:
else:
self.grobid_url = f"http://{os.environ.get('GROBID_HOST', 'localhost')}:{os.environ.get('GROBID_PORT', '8070')}"
self.url = f"{self.grobid_url}/api/processFulltextDocument"

auth_key = os.environ.get("GROBID_AUTH_KEY", "")
self.request_headers = {"Authorization": auth_key} if auth_key else {}
self.grobid_is_alive = False

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
Expand All @@ -48,7 +51,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
with open(file_path, "rb") as file:
files = {"input": file}
try:
response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"})
response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"}, headers=self.request_headers)
if response.status_code != 200:
warning = f"GROBID returns code {response.status_code}."
self.logger.warning(warning)
Expand Down Expand Up @@ -106,7 +109,7 @@ def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None:
attempt = max_attempts
while attempt > 0:
try:
response = requests.get(f"{grobid_url}/api/isalive")
response = requests.get(f"{grobid_url}/api/isalive", headers=self.request_headers)
if response.status_code == 200:
self.logger.info(f"GROBID up on {grobid_url}.")
self.grobid_is_alive = True
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/docx_reader/numbering_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties:
self.styles_extractor.parse(lvl_info.style_id, paragraph_properties, StyleType.NUMBERING)
if lvl_info.pPr:
change_paragraph_properties(paragraph_properties, lvl_info.pPr)
# run properties are applied only to the numbering text ("lvlText" content)
if lvl_info.rPr:
change_run_properties(run_properties, lvl_info.rPr)
change_run_properties(paragraph_properties, lvl_info.rPr)

run_properties.text = text
paragraph_properties.list_level = self.state.levels_count
Expand Down
25 changes: 9 additions & 16 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_with_meta import LineWithMeta


class Cell:
class Cell(CellWithMeta):

@staticmethod
def copy_from(cell: "Cell",
Expand Down Expand Up @@ -41,35 +42,27 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
if self.con_coord:
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self,
x_top_left: int,
x_bottom_right: int,
y_top_left: int,
y_bottom_right: int,
id_con: int = -1,
lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False,
is_attribute_required: bool = False,
rotated_angle: int = 0,
uid: str = None,
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
contour_coord: Optional[BBox] = None) -> None:

import uuid

assert x_top_left <= x_bottom_right
assert y_top_left <= y_bottom_right

self.lines = [] if lines is None else lines
super().__init__(lines)

self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
self.y_top_left = y_top_left
self.y_bottom_right = y_bottom_right
self.id_con = id_con
self.lines = [] if lines is None else lines
self.is_attribute = is_attribute
self.is_attribute_required = is_attribute_required
self.rotated_angle = rotated_angle
self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
self.colspan = 1
self.rowspan = 1
self.invisible = False
self.con_coord = contour_coord or BBox(0, 0, 0, 0)

def __str__(self) -> str:
Expand Down
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
self.page_number = page_number
self.bbox = bbox
self.name = name
# TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
self.rotated_angle = rotated_angle

def shift(self, shift_x: int, shift_y: int) -> None:
Expand Down
14 changes: 12 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, List, Optional

from dedocutils.data_structures import BBox

Expand All @@ -10,7 +10,8 @@


class ScanTable:
def __init__(self, page_number: int, matrix_cells: List[List[Cell]] = None, bbox: BBox = None, name: str = "", order: int = -1) -> None:
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
name: str = "", order: int = -1) -> None:
self.matrix_cells = matrix_cells
self.page_number = page_number
self.locations = []
Expand All @@ -27,6 +28,15 @@ def extended(self, table: "ScanTable") -> None:
# extend order
self.order = max(self.order, table.order)

def check_on_cell_instance(self) -> bool:
    """
    Check whether the table matrix holds ``Cell`` objects.

    Only the top-left element (``matrix_cells[0][0]``) is inspected, the remaining cells are not checked.

    :return: True if the matrix has a non-empty first row whose first element is a ``Cell`` instance,
        False otherwise (including when ``matrix_cells`` is None, empty, or has an empty first row)
    """
    cells = self.matrix_cells
    # matrix_cells defaults to None in the constructor, so guard against it before indexing
    if not cells or not cells[0]:
        return False
    return isinstance(cells[0][0], Cell)

def to_table(self) -> Table:
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
Expand Down
12 changes: 6 additions & 6 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from collections import namedtuple
from typing import Dict, Iterator, List, Optional, Set, Tuple

import numpy as np
from dedocutils.data_structures.bbox import BBox
from numpy import ndarray

Expand All @@ -13,7 +12,7 @@
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer


ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
Expand Down Expand Up @@ -45,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti

from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker
from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
Expand Down Expand Up @@ -153,24 +153,24 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
metadata["rotated_page_angles"] = page_angles
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]:
def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]]:
from joblib import Parallel, delayed
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader

gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
page_range = range(first_page, first_page + len(gost_analyzed_images))
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
if isinstance(self, PdfTxtlayerReader):
self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()]))
self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
gost_analyzed_images.items()
)
return result, gost_analyzed_images

def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None:
gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
# shift unref_tables
for scan_table in unref_tables:
for location in scan_table.locations:
Expand Down
4 changes: 4 additions & 0 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from numpy import ndarray

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
Expand Down Expand Up @@ -53,6 +54,9 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
self.binarizer = AdaptiveBinarizer()
self.ocr = OCRLineExtractor(config=self.config)

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
    """
    Parse the file by delegating to the ``read`` implementation of the parent class.

    :param file_path: path to the file, passed to the parent unchanged
    :param parameters: optional dictionary of reading parameters, passed to the parent unchanged
    :return: the document produced by the parent implementation
    """
    document = super().read(file_path, parameters)
    return document

def _process_one_page(self,
image: ndarray,
parameters: ParametersForParseDoc,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import logging
from typing import List

Expand Down Expand Up @@ -155,24 +156,26 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
# condition 2. Exclusion of the duplicated header (if any)
attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
t2_update = copy.deepcopy(t2)
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
t2.matrix_cells = t2.matrix_cells[len(attr2):]
t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):]

if len(t2.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
return False

TableAttributeExtractor.clear_attributes(t2.matrix_cells)
TableAttributeExtractor.clear_attributes(t2_update.matrix_cells)

# condition 3. Number of columns should be equal
if len(t1.matrix_cells[-1]) != len(t2.matrix_cells[0]):
if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]):
if self.config.get("debug_mode", False):
self.logger.debug("Different count column")
return False

# condition 4. Comparison of the widths of last and first rows
if not self.__is_equal_width_cells(t1.matrix_cells, t2.matrix_cells):
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells):
if self.config.get("debug_mode", False):
self.logger.debug("Different width columns")
return False

t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes
return True
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,22 @@ def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_si

return True

@staticmethod
def check_have_attributes(matrix_table: List[List[Cell]]) -> bool:
    """
    Tell whether the cells of the table matrix expose the ``is_attribute`` field.

    Only the first cell of the first row is probed.

    :param matrix_table: table represented as a list of rows, each row being a list of cells
    :return: True if the matrix has a non-empty first row whose first cell has the ``is_attribute`` attribute, False otherwise
    """
    # an empty matrix or an empty first row has no cell to probe
    has_first_cell = len(matrix_table) > 0 and len(matrix_table[0]) > 0
    return has_first_cell and hasattr(matrix_table[0][0], "is_attribute")

@staticmethod
def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:

if not TableAttributeExtractor.check_have_attributes(matrix_table):
return matrix_table[:1]

header_rows = len(matrix_table)
for (i, row) in enumerate(matrix_table):
attrs = [cell for cell in row if cell.is_attribute]
Expand All @@ -44,6 +58,9 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:

@staticmethod
def clear_attributes(matrix_table: List[List[Cell]]) -> None:
if not TableAttributeExtractor.check_have_attributes(matrix_table):
return

for row in matrix_table:
for cell in row:
cell.is_attribute = False
Expand Down
Loading
Loading