Skip to content

Commit

Permalink
update master (#288)
Browse files Browse the repository at this point in the history
* Add version checking for release pipeline

* TLDR-260 fix docx bug: change attachment annotation value (#286)

* TLDR-260 fix docx bug: change attachment annotation value from filename to file uid

* TLDR-260 review fixes

* new version 0.9.1 (#287)
  • Loading branch information
NastyBoget authored Jul 5, 2023
1 parent 5237390 commit a2e8feb
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 49 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ jobs:
with:
python-version: '3.9'

# - name: Check version correctness
# run: |
# python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
# --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
# --pre_release ${{ github.event.release.prerelease }}
- name: Check version correctness
run: |
python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
--new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
--pre_release ${{ github.event.release.prerelease }}
- name: Install dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9
0.9.1
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import tempfile
import zipfile
from typing import List, Optional

from bs4 import BeautifulSoup, Tag

from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.utils import splitext_


class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
Expand All @@ -30,17 +31,16 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
the methods' parameters.
"""
result = []
name, ext = splitext_(filename)

if ext.lower() != '.docx':
return []
try:
with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
diagram_attachments = self.__extract_diagrams(zfile)
need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)

with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
diagram_attachments = self.__extract_diagrams(zfile)
need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")

result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
except zipfile.BadZipFile:
raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(filename))
return result

def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:
Expand Down
23 changes: 18 additions & 5 deletions dedoc/readers/docx_reader/data_structures/docx_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from bs4 import BeautifulSoup, Tag

from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
from dedoc.data_structures.line_with_meta import LineWithMeta
Expand All @@ -23,10 +24,11 @@


class DocxDocument:
def __init__(self, path: str, logger: logging.Logger) -> None:
def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.Logger) -> None:
self.logger = logger
self.path = path
self.path_hash = calculate_file_hash(path=path)
self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}

self.document_bs_tree = self.__get_bs_tree('word/document.xml')
if self.document_bs_tree is None:
Expand Down Expand Up @@ -68,7 +70,7 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]:
continue

if paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor
self.__handle_diagrams_xml(paragraph_xml, diagram_refs, uids_set, cnt)
self.__handle_diagram_xml(paragraph_xml, diagram_refs, uids_set, cnt)
continue

if paragraph_xml.name != 'p':
Expand Down Expand Up @@ -179,11 +181,22 @@ def __handle_images_xml(self, xmls: List[Tag], image_refs: dict, uids_set: set,

for image_xml in xmls:
blips = image_xml.find_all("a:blip")
image_uid = images_rels[blips[0]["r:embed"]]
image_name = images_rels[blips[0]["r:embed"]]

if image_name in self.attachment_name2uid:
image_uid = self.attachment_name2uid[image_name]
else:
self.logger.info(f"Attachment with name {image_name} not found")
continue
image_refs[len(self.paragraph_list) - 1].append(image_uid)

def __handle_diagrams_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None:
diagram_uid = hashlib.md5(xml.encode()).hexdigest()
def __handle_diagram_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None:
diagram_name = f"{hashlib.md5(xml.encode()).hexdigest()}.docx"
if diagram_name in self.attachment_name2uid:
diagram_uid = self.attachment_name2uid[diagram_name]
else:
self.logger.info(f"Attachment with name {diagram_name} not found")
return
self.__prepare_paragraph_list(uids_set, cnt)
diagram_refs[len(self.paragraph_list) - 1].append(diagram_uid)

Expand Down
9 changes: 3 additions & 6 deletions dedoc/readers/docx_reader/docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from typing import Optional, List

from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.readers.base_reader import BaseReader
from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument
from dedoc.data_structures.hierarchy_level import HierarchyLevel


class DocxReader(BaseReader):
Expand All @@ -34,8 +34,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
docx_document = self._parse_document(path=path)
attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters)

docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger)
lines = self.__fix_lines(docx_document.lines)
return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[])

Expand All @@ -54,7 +55,3 @@ def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
annotation.end += 1

return lines

def _parse_document(self, path: str) -> DocxDocument:
docx_document = DocxDocument(path=path, logger=self.logger)
return docx_document
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from copy import deepcopy
from typing import Iterator, Optional, Dict, Iterable, Tuple
from typing import List

import numpy as np
from PIL import Image
from PIL import ImageColor
Expand All @@ -18,11 +19,9 @@
from dedoc.common.exceptions.conversion_exception import ConversionException
from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument
from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
from dedoc.readers.docx_reader.docx_reader import DocxReader
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.train_dataset.train_dataset_utils import get_original_document_path

from dedoc.train_dataset.taskers.images_creators.concrete_creators.abstract_images_creator import AbstractImagesCreator
from dedoc.train_dataset.train_dataset_utils import get_original_document_path
from dedoc.utils.image_utils import get_concat_v

PairedPdf = namedtuple("PairedPdf", ["many_color_pdf", "two_color_pdf", "many_colors", "two_colors"])
Expand All @@ -32,7 +31,6 @@ class DocxImagesCreator(AbstractImagesCreator):

def __init__(self, path2docs: str, *, config: dict) -> None:
self.path2docs = path2docs
self.docx_reader = DocxReader(config=config)
self.color_step = 16
self.first_color = 15
self.base_color = 0
Expand All @@ -58,7 +56,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None:
"""
path2doc = get_original_document_path(self.path2docs, page)
# here we get a half-processed docx document (with raw xml)
document = self.docx_reader._parse_document(path2doc)
document = DocxDocument(path=path2doc, attachments=[], logger=self.logger)
with zipfile.ZipFile(path2doc) as d:
with tempfile.TemporaryDirectory() as tmp_dir:
pdfs = self.__create_pair_pdfs(docx_archive=d, document=document, tmp_dir=tmp_dir)
Expand Down
10 changes: 8 additions & 2 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
Changelog
=========

v0.9.1 (2023-07-05)
-------------------
Release note: `v0.9.1 <https://github.com/ispras/dedoc/releases/tag/v0.9.1>`_

* Fixed bug with `AttachAnnotation` in docx: its value is now equal to the attachment uid instead of the file name.


v0.9 (2023-06-26)
-------------------
-----------------
Release note: `v0.9 <https://github.com/ispras/dedoc/releases/tag/v0.9>`_

* Publication of the first version of dedoc library
* Publication of the first version of dedoc library.
33 changes: 18 additions & 15 deletions tests/api_tests/test_api_with_images_refs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,52 +10,55 @@ class TestApiImageRefs(AbstractTestApiDocReader):
def test_docx_with_images(self) -> None:
file_name = "docx_with_images.docx"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
content = result["content"]["structure"]

image_paragraph = content["subparagraphs"][0]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.png')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.png'])

image_paragraph = content["subparagraphs"][2]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg'])

image_paragraph = content["subparagraphs"][5]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image4.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image4.jpeg'])

image_paragraph = content["subparagraphs"][6]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image5.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image6.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image7.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image5.jpeg'])
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image6.jpeg'])
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image7.jpeg'])

def test_odt_with_images(self) -> None:
file_name = "odt_with_images.odt"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
content = result["content"]["structure"]

image_paragraph = content["subparagraphs"][0]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg'])

image_paragraph = content["subparagraphs"][7]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])

image_paragraph = content["subparagraphs"][8]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg'])

def test_docx_with_images_from_mac(self) -> None:
file_name = "doc_with_images.docx"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
content = result["content"]["structure"]

image_paragraph = content["subparagraphs"][2]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg'])

image_paragraph = content["subparagraphs"][3]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])

image_paragraph = content["subparagraphs"][5]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.png')
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.png'])

def __check_image_paragraph(self, image_paragraph: dict, image_name: str) -> None:
def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
text = image_paragraph["text"]
image_annotations = image_paragraph["annotations"]
self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_name}, image_annotations)
self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_uid}, image_annotations)

0 comments on commit a2e8feb

Please sign in to comment.