diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 14597d0d..2594ee25 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -17,11 +17,11 @@ jobs: with: python-version: '3.9' -# - name: Check version correctness -# run: | -# python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \ -# --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \ -# --pre_release ${{ github.event.release.prerelease }} + - name: Check version correctness + run: | + python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \ + --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \ + --pre_release ${{ github.event.release.prerelease }} - name: Install dependencies run: | diff --git a/VERSION b/VERSION index 9a7d84f2..f514a2f0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9 \ No newline at end of file +0.9.1 \ No newline at end of file diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index ab36b73e..93959faf 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -4,12 +4,13 @@ import tempfile import zipfile from typing import List, Optional + from bs4 import BeautifulSoup, Tag from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor +from dedoc.common.exceptions.bad_file_exception import BadFileFormatException from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from 
dedoc.utils.utils import splitext_ class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): @@ -30,17 +31,16 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ the methods' parameters. """ result = [] - name, ext = splitext_(filename) - - if ext.lower() != '.docx': - return [] + try: + with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile: + diagram_attachments = self.__extract_diagrams(zfile) + need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" + result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis) - with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile: - diagram_attachments = self.__extract_diagrams(zfile) - need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" - result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis) + result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word") - result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word") + except zipfile.BadZipFile: + raise BadFileFormatException("Bad docx file:\n file_name = {}. 
Seems docx is broken".format(filename)) return result def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 395166e0..901f4750 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -9,6 +9,7 @@ from bs4 import BeautifulSoup, Tag from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.line_with_meta import LineWithMeta @@ -23,10 +24,11 @@ class DocxDocument: - def __init__(self, path: str, logger: logging.Logger) -> None: + def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.Logger) -> None: self.logger = logger self.path = path self.path_hash = calculate_file_hash(path=path) + self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} self.document_bs_tree = self.__get_bs_tree('word/document.xml') if self.document_bs_tree is None: @@ -68,7 +70,7 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]: continue if paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor - self.__handle_diagrams_xml(paragraph_xml, diagram_refs, uids_set, cnt) + self.__handle_diagram_xml(paragraph_xml, diagram_refs, uids_set, cnt) continue if paragraph_xml.name != 'p': @@ -179,11 +181,22 @@ def __handle_images_xml(self, xmls: List[Tag], image_refs: dict, uids_set: set, for image_xml in xmls: blips = image_xml.find_all("a:blip") - image_uid = images_rels[blips[0]["r:embed"]] + image_name = images_rels[blips[0]["r:embed"]] + + if image_name in 
self.attachment_name2uid: + image_uid = self.attachment_name2uid[image_name] + else: + self.logger.info(f"Attachment with name {image_name} not found") + continue image_refs[len(self.paragraph_list) - 1].append(image_uid) - def __handle_diagrams_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None: - diagram_uid = hashlib.md5(xml.encode()).hexdigest() + def __handle_diagram_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None: + diagram_name = f"{hashlib.md5(xml.encode()).hexdigest()}.docx" + if diagram_name in self.attachment_name2uid: + diagram_uid = self.attachment_name2uid[diagram_name] + else: + self.logger.info(f"Attachment with name {diagram_name} not found") + return self.__prepare_paragraph_list(uids_set, cnt) diagram_refs[len(self.paragraph_list) - 1].append(diagram_uid) diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index 87cad223..e2599914 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -3,12 +3,12 @@ from typing import Optional, List from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument -from dedoc.data_structures.hierarchy_level import HierarchyLevel class DocxReader(BaseReader): @@ -34,8 +34,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - docx_document = self._parse_document(path=path) attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + + docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger) lines = self.__fix_lines(docx_document.lines) return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[]) @@ -54,7 +55,3 @@ def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: annotation.end += 1 return lines - - def _parse_document(self, path: str) -> DocxDocument: - docx_document = DocxDocument(path=path, logger=self.logger) - return docx_document diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py index 868ac5a2..9bdd3390 100644 --- a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py +++ b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py @@ -9,6 +9,7 @@ from copy import deepcopy from typing import Iterator, Optional, Dict, Iterable, Tuple from typing import List + import numpy as np from PIL import Image from PIL import ImageColor @@ -18,11 +19,9 @@ from dedoc.common.exceptions.conversion_exception import ConversionException from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph -from dedoc.readers.docx_reader.docx_reader import DocxReader from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader -from dedoc.train_dataset.train_dataset_utils import get_original_document_path - from dedoc.train_dataset.taskers.images_creators.concrete_creators.abstract_images_creator import AbstractImagesCreator +from 
dedoc.train_dataset.train_dataset_utils import get_original_document_path from dedoc.utils.image_utils import get_concat_v PairedPdf = namedtuple("PairedPdf", ["many_color_pdf", "two_color_pdf", "many_colors", "two_colors"]) @@ -32,7 +31,6 @@ class DocxImagesCreator(AbstractImagesCreator): def __init__(self, path2docs: str, *, config: dict) -> None: self.path2docs = path2docs - self.docx_reader = DocxReader(config=config) self.color_step = 16 self.first_color = 15 self.base_color = 0 @@ -58,7 +56,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: """ path2doc = get_original_document_path(self.path2docs, page) # here we get half processing docx document (with raw xml) - document = self.docx_reader._parse_document(path2doc) + document = DocxDocument(path=path2doc, attachments=[], logger=self.logger) with zipfile.ZipFile(path2doc) as d: with tempfile.TemporaryDirectory() as tmp_dir: pdfs = self.__create_pair_pdfs(docx_archive=d, document=document, tmp_dir=tmp_dir) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 903f4592..cc3d492b 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,9 +1,15 @@ Changelog ========= +v0.9.1 (2023-07-05) +------------------- +Release note: `v0.9.1 `_ + +* Fixed bug with `AttachAnnotation` in docx: its value is now equal to the attachment uid instead of the file name. + v0.9 (2023-06-26) -------------------- +----------------- Release note: `v0.9 `_ -* Publication of the first version of dedoc library +* Publication of the first version of dedoc library. 
diff --git a/tests/api_tests/test_api_with_images_refs.py b/tests/api_tests/test_api_with_images_refs.py index 9da119e7..774b6a10 100644 --- a/tests/api_tests/test_api_with_images_refs.py +++ b/tests/api_tests/test_api_with_images_refs.py @@ -10,52 +10,55 @@ class TestApiImageRefs(AbstractTestApiDocReader): def test_docx_with_images(self) -> None: file_name = "docx_with_images.docx" result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) + attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]} content = result["content"]["structure"] image_paragraph = content["subparagraphs"][0] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.png') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.png']) image_paragraph = content["subparagraphs"][2] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg') - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg']) image_paragraph = content["subparagraphs"][5] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image4.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image4.jpeg']) image_paragraph = content["subparagraphs"][6] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image5.jpeg') - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image6.jpeg') - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image7.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, 
image_uid=attachments_name2uid['image5.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image6.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image7.jpeg']) def test_odt_with_images(self) -> None: file_name = "odt_with_images.odt" result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) + attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]} content = result["content"]["structure"] image_paragraph = content["subparagraphs"][0] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg']) image_paragraph = content["subparagraphs"][7] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg']) image_paragraph = content["subparagraphs"][8] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg']) def test_docx_with_images_from_mac(self) -> None: file_name = "doc_with_images.docx" result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) + attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]} content = result["content"]["structure"] image_paragraph = content["subparagraphs"][2] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg']) image_paragraph = content["subparagraphs"][3] - 
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg']) image_paragraph = content["subparagraphs"][5] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.png') + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.png']) - def __check_image_paragraph(self, image_paragraph: dict, image_name: str) -> None: + def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None: text = image_paragraph["text"] image_annotations = image_paragraph["annotations"] - self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_name}, image_annotations) + self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_uid}, image_annotations)