Skip to content

Commit

Permalink
update master (#285)
Browse files Browse the repository at this point in the history
* hashes of downloaded models added

* fix bugs in diplomas

* change features for diploma classifier

* Remove unused API parameters

* Move all remaining readers from docreader (#273)

* TLDR-340 renamed pdf folder; some refactoring (#275)

* Training scripts transition (#274)

* TLDR-350 pypi pipeline fix (#277)

* TLDR-322 fix ispras_tbl_extr.jar (#279)

* moved benchmarks from docreader (#280)

* TLDR-336 dedoc api documentation (#281)

* TLDR-372 docx bug for documents with comments (#282)

* TLDR-359 push to dockerhub automatically (#283)

* new version 0.9 (#284)
  • Loading branch information
NastyBoget authored Jun 26, 2023
1 parent 3603e75 commit 5237390
Show file tree
Hide file tree
Showing 385 changed files with 34,113 additions and 717 deletions.
58 changes: 58 additions & 0 deletions .github/check_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import argparse
import re
from typing import Pattern


def _version_key(version: str):
    """Convert a version string into a tuple that orders versions numerically.

    Accepts dot-separated integers optionally followed by ``rcN``
    (e.g. ``0.9``, ``0.10.1``, ``0.9.1rc2``).

    :param version: version string to parse
    :return: ``(release_numbers, is_final, rc_number)`` or ``None`` when the string has another format
    """
    parsed = re.match(r"^(\d+(?:\.\d+)*)(?:rc(\d+))?$", version)
    if parsed is None:
        return None

    release = tuple(int(part) for part in parsed.group(1).split("."))
    rc_number = parsed.group(2)
    # a release candidate goes before the final release with the same numbers
    if rc_number is not None:
        return release, 0, int(rc_number)
    return release, 1, 0


def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern) -> bool:
    """Check that the new version is well-formed, matches the release tag and is newer than the old one.

    Versions are compared numerically component by component, so ``0.10`` is newer than ``0.9``
    and ``0.9.1rc10`` is newer than ``0.9.1rc9`` (a plain string comparison gets both cases wrong).

    :param version: new version of the release
    :param tag: git tag of the release, expected to be ``v`` + version
    :param old_version: version of the previous release
    :param regexp: compiled pattern that the new version should match
    :return: True if all checks passed, False otherwise (the reason is printed)
    """
    if regexp.match(version) is None:
        print("New version doesn't match the pattern")  # noqa
        return False

    if not (tag.startswith("v") and tag[1:] == version):
        print("Tag value should be equal to version with `v` in the beginning")  # noqa
        return False

    old_key = _version_key(old_version)
    new_key = _version_key(version)
    if old_key is None or new_key is None:
        # unknown version format: keep the plain string comparison instead of failing
        return old_version < version

    return old_key < new_key


if __name__ == "__main__":
    # Command-line entry point: validates the version of a release against the branch rules
    # and terminates with a non-zero exit code when the version is not acceptable.
    parser = argparse.ArgumentParser()
    parser.add_argument("--branch", help="Git branch to check its version", choices=["develop", "master"])
    parser.add_argument("--tag", help="Tag of the release", type=str)
    parser.add_argument("--pre_release", help="Whether the release is marked as a pre-release", choices=["true", "false"])
    parser.add_argument("--new_version", help="New release version", type=str)
    parser.add_argument("--old_version", help="Previous release version", type=str)
    args = parser.parse_args()

    print(f"Old version: {args.old_version}, new version: {args.new_version}, "
          f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}")  # noqa

    master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
    develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$")

    correct = False
    if args.branch == "develop":
        correct = is_correct_version(args.new_version, args.tag, args.old_version, develop_version_pattern)

        if correct:
            # the develop pattern guarantees exactly one "rc" separator in the new version
            base_version, rc_number = args.new_version.split("rc")

            bumps_release = True
            if master_version_pattern.match(args.old_version):
                # compare numerically: as strings "0.10.0" <= "0.9.1" would wrongly hold
                new_base = tuple(int(part) for part in base_version.split("."))
                old_base = tuple(int(part) for part in args.old_version.split("."))
                bumps_release = new_base > old_base

            if not bumps_release:
                correct = False
                print("New version should add 'rc' to the bigger version than the old one")  # noqa
            elif int(rc_number) == 0:
                correct = False
                print("Numeration for 'rc' should start from 1")  # noqa

        if args.pre_release == "false":
            correct = False
            print("Only pre-releases allowed on develop")  # noqa

    if args.branch == "master":
        correct = is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern)

        if args.pre_release == "true":
            correct = False
            print("Pre-releases are not allowed on master")  # noqa

    if not correct:
        # explicit exit instead of `assert`: assertions are removed when python runs with -O
        raise SystemExit("Version is incorrect")
    print("Version is correct")  # noqa
6 changes: 5 additions & 1 deletion .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ jobs:

- name: Install dependencies
run: |
sudo apt-get install -y libreoffice
python -m pip install --upgrade --no-cache-dir pip setuptools
python -m pip install --exists-action=w --no-cache-dir -r requirements.txt
python -m pip install --upgrade --upgrade-strategy eager --no-cache-dir .[torch,docs]
- name: Build documentation
# Build the documentation, you can use this command locally
run: python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build
run: |
python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build
cd docs/source/_static/code_examples
python dedoc_usage_tutorial.py
35 changes: 24 additions & 11 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,48 @@
name: Publish to PyPI

on:
push:
branches:
- develop
- master
paths:
- VERSION # publish only when version has been changed
release:
types: [published]

jobs:
# Publish the package to PyPI https://pypi.org
pypi-publish:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@v2
uses: actions/checkout@v1

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: '3.9'

# - name: Check version correctness
# run: |
# python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
# --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
# --pre_release ${{ github.event.release.prerelease }}

- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip3 install build twine
- name: Build and publish to PyPI # TODO change to pypi instead of test pypi
- name: Build and publish to PyPI
if: ${{ success() }} # publish only when version passed the checks
env:
TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }} # TODO delete TEST_ in the name of the variable
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} # TODO delete TEST_ in the name of the variable
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
python3 -m build -w
twine check dist/*
twine upload --repository testpypi dist/*
twine upload --repository pypi dist/*
- name: Push to dockerhub
if: ${{ success() }}
run: |
docker build -f docker/Dockerfile -t dedocproject/dedoc:$GITHUB_REF_NAME .
docker login -u ${{ secrets.DOCKERHUB_USERNAME }} -p ${{ secrets.DOCKERHUB_PASSWORD }}
docker tag dedocproject/dedoc:$GITHUB_REF_NAME dedocproject/dedoc:latest
docker push dedocproject/dedoc:$GITHUB_REF_NAME
docker push dedocproject/dedoc:latest
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
include dedoc/api/static/*/*
include dedoc/readers/scanned_reader/pdftxtlayer_reader/tabbypdf/jars/*
include dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/*
include docs/*
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ docker-compose up --build

Start Dedoc with tests:
```bash
tests="true" docker-compose up --build
test="true" docker-compose up --build
```

Now you can go to localhost:1231 and look at the docs and examples.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2023.05.26
0.9
114 changes: 62 additions & 52 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,86 +5,96 @@


class QueryParameters(BaseModel):
language: Optional[str]
document_type: Optional[str]
structure_type: Optional[str]
return_format: Optional[str]

with_attachments: Optional[str]
need_content_analysis: Optional[str]
recursion_deep_attachments: Optional[str]
return_base64: Optional[str]

insert_table: Optional[str]
return_format: Optional[str]
structure_type: Optional[str]
delimiter: Optional[str]
encoding: Optional[str]
document_type: Optional[str]
pdf_with_text_layer: Optional[str]
pages: Optional[str]
need_pdf_table_analysis: Optional[str]
table_type: Optional[str]
orient_analysis_cells: Optional[str]
orient_cell_angle: Optional[str]

pdf_with_text_layer: Optional[str]
language: Optional[str]
pages: Optional[str]
is_one_column_document: Optional[str]
document_orientation: Optional[str]
html_fields: Optional[str]
cloud_bucket: Optional[str]
need_header_footer_analysis: Optional[str]
need_binarization: Optional[str]
need_pdf_table_analysis: Optional[str]

delimiter: Optional[str]
encoding: Optional[str]
html_fields: Optional[str]
handle_invisible_table: Optional[str]
return_base64: Optional[str]
archive_as_single_file: Optional[str]
upload_attachments_into_cloud: Optional[str]
need_content_analysis: Optional[str]
recursion_deep_attachments: Optional[str]
table_type: Optional[str]

def __init__(self,
language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),
# type of document structure parsing
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),

# attachments handling
with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),
delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),
encoding: Optional[str] = Body(description="a document encoding", default=None),
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma", "article", "slide"], default=None),
pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),
pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),
orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),
orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),

# pdf handling
pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),
language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),
pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),
is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),
document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 370) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),
html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),
cloud_bucket: Optional[str] = Body(description="a path (bucket) in the cloud storage mime. Default: ''", default=None),
document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),
need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),
need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),

# other formats handling
delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),
encoding: Optional[str] = Body(description="a document encoding", default=None),
html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),
handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
archive_as_single_file: Optional[str] = Body(description="additional parameters for the archive reader. Default: 'true'", default=None),
upload_attachments_into_cloud: Optional[str] = Body(description="turn on if you need upload attachments into a cloud. Turn on if with_attachments=True and \"cloud_bucket\" not empty. Default: 'false'", default=None),
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),


**data: Any) -> None:

super().__init__(**data)
self.language: str = language or "rus+eng"
self.document_type: str = document_type or ""
self.structure_type: str = structure_type or 'tree'
self.return_format: str = return_format or 'json'

self.with_attachments: str = with_attachments or 'false'
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.return_base64: str = return_base64 or 'false'

self.insert_table: str = insert_table or 'false'
self.return_format: str = return_format or 'json'
self.structure_type: str = structure_type or 'tree'
self.delimiter: str = delimiter
self.encoding: str = encoding
self.document_type: str = document_type or ""
self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby'
self.pages: str = pages or ':'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'
self.table_type: str = table_type or ''
self.orient_analysis_cells: str = orient_analysis_cells or 'false'
self.orient_cell_angle: str = orient_cell_angle or "90"

self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby'
self.language: str = language or "rus+eng"
self.pages: str = pages or ':'
self.is_one_column_document: str = is_one_column_document or 'auto'
self.document_orientation: str = document_orientation or "auto"
self.html_fields: str = html_fields or ''
self.cloud_bucket: str = cloud_bucket or ''
self.need_header_footer_analysis: str = need_header_footer_analysis or 'false'
self.need_binarization: str = need_binarization or 'false'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'

self.delimiter: str = delimiter
self.encoding: str = encoding
self.html_fields: str = html_fields or ''
self.handle_invisible_table: str = handle_invisible_table or 'false'
self.return_base64: str = return_base64 or 'false'
self.archive_as_single_file: str = archive_as_single_file or 'true'
self.upload_attachments_into_cloud: str = upload_attachments_into_cloud or 'false'
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.table_type: str = table_type or ''
14 changes: 12 additions & 2 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
import importlib
import os

import uvicorn
from fastapi import Response, FastAPI, Request, Depends, UploadFile, File
from fastapi.responses import UJSONResponse, ORJSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse
from fastapi import Response, FastAPI, Request, Depends, UploadFile, File

from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree
from dedoc.api.init_api import app, config, static_files_dirs, PORT, static_path
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.common.exceptions.missing_file_exception import MissingFileException
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager

config = get_config()
PORT = config["api_port"]
static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static/")
static_files_dirs = config.get("static_files_dirs")

app = FastAPI()
app.mount('/static', StaticFiles(directory=config.get("static_path", static_path)), name="static")

module_api_args = importlib.import_module(config['import_path_init_api_args'])
logger = config["logger"]
version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "VERSION"))
Expand Down
Loading

0 comments on commit 5237390

Please sign in to comment.