update master (#288)

* Add version checking for release pipeline
* TLDR-260 fix docx bug: change attachment annotation value (#286)
* TLDR-260 fix docx bug: change attachment annotation value from filename to file uid
* TLDR-260 review fixes
* new version 0.9.1 (#287)
ispras · Jul 5, 2023 · a2e8feb · a2e8feb
1 parent 5237390
commit a2e8feb
Show file tree

Hide file tree

Showing 8 changed files with 66 additions and 49 deletions.
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -17,11 +17,11 @@ jobs:
         with:
           python-version: '3.9'
 
-#      - name: Check version correctness
-#        run: |
-#          python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
-#                    --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
-#                    --pre_release ${{ github.event.release.prerelease }}
+      - name: Check version correctness
+        run: |
+          python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
+                    --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
+                    --pre_release ${{ github.event.release.prerelease }}
 
       - name: Install dependencies
         run: |

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.9
+0.9.1
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
@@ -4,12 +4,13 @@
 import tempfile
 import zipfile
 from typing import List, Optional
+
 from bs4 import BeautifulSoup, Tag
 
 from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
+from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
 from dedoc.data_structures.attached_file import AttachedFile
 from dedoc.extensions import recognized_extensions, recognized_mimes
-from dedoc.utils.utils import splitext_
 
 
 class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
@@ -30,17 +31,16 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
         the methods' parameters.
         """
         result = []
-        name, ext = splitext_(filename)
-
-        if ext.lower() != '.docx':
-            return []
+        try:
+            with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
+                diagram_attachments = self.__extract_diagrams(zfile)
+                need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
+                result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
 
-        with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
-            diagram_attachments = self.__extract_diagrams(zfile)
-            need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
-            result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
+            result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
 
-        result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
+        except zipfile.BadZipFile:
+            raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(filename))
         return result
 
     def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:

diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -9,6 +9,7 @@
 from bs4 import BeautifulSoup, Tag
 
 from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
+from dedoc.data_structures.attached_file import AttachedFile
 from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
 from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -23,10 +24,11 @@
 
 
 class DocxDocument:
-    def __init__(self, path: str, logger: logging.Logger) -> None:
+    def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.Logger) -> None:
         self.logger = logger
         self.path = path
         self.path_hash = calculate_file_hash(path=path)
+        self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
 
         self.document_bs_tree = self.__get_bs_tree('word/document.xml')
         if self.document_bs_tree is None:
@@ -68,7 +70,7 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]:
                 continue
 
             if paragraph_xml.pict:  # diagrams are saved using docx_attachments_extractor
-                self.__handle_diagrams_xml(paragraph_xml, diagram_refs, uids_set, cnt)
+                self.__handle_diagram_xml(paragraph_xml, diagram_refs, uids_set, cnt)
                 continue
 
             if paragraph_xml.name != 'p':
@@ -179,11 +181,22 @@ def __handle_images_xml(self, xmls: List[Tag], image_refs: dict, uids_set: set,
 
         for image_xml in xmls:
             blips = image_xml.find_all("a:blip")
-            image_uid = images_rels[blips[0]["r:embed"]]
+            image_name = images_rels[blips[0]["r:embed"]]
+
+            if image_name in self.attachment_name2uid:
+                image_uid = self.attachment_name2uid[image_name]
+            else:
+                self.logger.info(f"Attachment with name {image_name} not found")
+                continue
             image_refs[len(self.paragraph_list) - 1].append(image_uid)
 
-    def __handle_diagrams_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None:
-        diagram_uid = hashlib.md5(xml.encode()).hexdigest()
+    def __handle_diagram_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None:
+        diagram_name = f"{hashlib.md5(xml.encode()).hexdigest()}.docx"
+        if diagram_name in self.attachment_name2uid:
+            diagram_uid = self.attachment_name2uid[diagram_name]
+        else:
+            self.logger.info(f"Attachment with name {diagram_name} not found")
+            return
         self.__prepare_paragraph_list(uids_set, cnt)
         diagram_refs[len(self.paragraph_list) - 1].append(diagram_uid)
 

diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py
@@ -3,12 +3,12 @@
 from typing import Optional, List
 
 from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
+from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.extensions import recognized_extensions, recognized_mimes
 from dedoc.readers.base_reader import BaseReader
 from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument
-from dedoc.data_structures.hierarchy_level import HierarchyLevel
 
 
 class DocxReader(BaseReader):
@@ -34,8 +34,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
-        docx_document = self._parse_document(path=path)
         attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters)
+
+        docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger)
         lines = self.__fix_lines(docx_document.lines)
         return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[])
 
@@ -54,7 +55,3 @@ def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
                     annotation.end += 1
 
         return lines
-
-    def _parse_document(self, path: str) -> DocxDocument:
-        docx_document = DocxDocument(path=path, logger=self.logger)
-        return docx_document
diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py
@@ -9,6 +9,7 @@
 from copy import deepcopy
 from typing import Iterator, Optional, Dict, Iterable, Tuple
 from typing import List
+
 import numpy as np
 from PIL import Image
 from PIL import ImageColor
@@ -18,11 +19,9 @@
 from dedoc.common.exceptions.conversion_exception import ConversionException
 from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument
 from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
-from dedoc.readers.docx_reader.docx_reader import DocxReader
 from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
-from dedoc.train_dataset.train_dataset_utils import get_original_document_path
-
 from dedoc.train_dataset.taskers.images_creators.concrete_creators.abstract_images_creator import AbstractImagesCreator
+from dedoc.train_dataset.train_dataset_utils import get_original_document_path
 from dedoc.utils.image_utils import get_concat_v
 
 PairedPdf = namedtuple("PairedPdf", ["many_color_pdf", "two_color_pdf", "many_colors", "two_colors"])
@@ -32,7 +31,6 @@ class DocxImagesCreator(AbstractImagesCreator):
 
     def __init__(self, path2docs: str, *, config: dict) -> None:
         self.path2docs = path2docs
-        self.docx_reader = DocxReader(config=config)
         self.color_step = 16
         self.first_color = 15
         self.base_color = 0
@@ -58,7 +56,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None:
         """
         path2doc = get_original_document_path(self.path2docs, page)
         # here we get half processing docx document (with raw xml)
-        document = self.docx_reader._parse_document(path2doc)
+        document = DocxDocument(path=path2doc, attachments=[], logger=self.logger)
         with zipfile.ZipFile(path2doc) as d:
             with tempfile.TemporaryDirectory() as tmp_dir:
                 pdfs = self.__create_pair_pdfs(docx_archive=d, document=document, tmp_dir=tmp_dir)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,9 +1,15 @@
 Changelog
 =========
 
+v0.9.1 (2023-07-05)
+-------------------
+Release note: `v0.9.1 <https://github.com/ispras/dedoc/releases/tag/v0.9.1>`_
+
+* Fixed a bug with `AttachAnnotation` in docx: its value is now equal to the attachment uid instead of the file name.
+
 
 v0.9 (2023-06-26)
--------------------
+-----------------
 Release note: `v0.9 <https://github.com/ispras/dedoc/releases/tag/v0.9>`_
 
-* Publication of the first version of dedoc library
+* Publication of the first version of dedoc library.
diff --git a/tests/api_tests/test_api_with_images_refs.py b/tests/api_tests/test_api_with_images_refs.py
@@ -10,52 +10,55 @@ class TestApiImageRefs(AbstractTestApiDocReader):
     def test_docx_with_images(self) -> None:
         file_name = "docx_with_images.docx"
         result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
+        attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
         content = result["content"]["structure"]
 
         image_paragraph = content["subparagraphs"][0]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.png')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.png'])
 
         image_paragraph = content["subparagraphs"][2]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg'])
 
         image_paragraph = content["subparagraphs"][5]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image4.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image4.jpeg'])
 
         image_paragraph = content["subparagraphs"][6]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image5.jpeg')
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image6.jpeg')
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image7.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image5.jpeg'])
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image6.jpeg'])
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image7.jpeg'])
 
     def test_odt_with_images(self) -> None:
         file_name = "odt_with_images.odt"
         result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
+        attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
         content = result["content"]["structure"]
 
         image_paragraph = content["subparagraphs"][0]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg'])
 
         image_paragraph = content["subparagraphs"][7]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
 
         image_paragraph = content["subparagraphs"][8]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg'])
 
     def test_docx_with_images_from_mac(self) -> None:
         file_name = "doc_with_images.docx"
         result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
+        attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
         content = result["content"]["structure"]
 
         image_paragraph = content["subparagraphs"][2]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg'])
 
         image_paragraph = content["subparagraphs"][3]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
 
         image_paragraph = content["subparagraphs"][5]
-        self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.png')
+        self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.png'])
 
-    def __check_image_paragraph(self, image_paragraph: dict, image_name: str) -> None:
+    def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
         text = image_paragraph["text"]
         image_annotations = image_paragraph["annotations"]
-        self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_name}, image_annotations)
+        self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_uid}, image_annotations)