diff --git a/.github/check_version.py b/.github/check_version.py
new file mode 100644
index 00000000..f2734c87
--- /dev/null
+++ b/.github/check_version.py
@@ -0,0 +1,96 @@
+import argparse
+import re
+import sys
+from typing import Optional, Pattern, Tuple
+
+# Versions look like <major>.<minor>[.<patch>][rc<N>], e.g. "0.9", "0.9.1", "0.9.1rc2"
+VERSION_PARTS = re.compile(r"^(\d+)\.(\d+)(?:\.(\d+))?(?:rc(\d+))?$")
+
+
+def version_key(version: str) -> Optional[Tuple[int, int, int, int, int]]:
+    """
+    Convert a version string into a tuple that orders versions numerically.
+
+    Strings compare character by character ("0.10" < "0.9", "0.9.1rc3" > "0.9.1"), so the
+    numeric parts are compared as integers and a release candidate sorts before its final release.
+
+    :param version: version string, e.g. "0.9", "0.9.1" or "0.9.1rc2"
+    :return: (major, minor, patch, is_final, rc_number) or None if the string is not a valid version
+    """
+    match = VERSION_PARTS.match(version)
+    if match is None:
+        return None
+
+    major, minor, patch, rc = match.groups()
+    return int(major), int(minor), int(patch or 0), int(rc is None), int(rc or 0)
+
+
+def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern) -> bool:
+    """
+    Check that the new version is well-formed, matches the release tag and is newer than the old version.
+
+    :param version: new release version
+    :param tag: git tag of the release, expected to be "v" + version
+    :param old_version: previous release version
+    :param regexp: compiled pattern that the new version should match
+    :return: True if all the checks passed, otherwise False (the reason is printed)
+    """
+    if regexp.match(version) is None:
+        print("New version doesn't match the pattern")  # noqa
+        return False
+
+    if tag != f"v{version}":
+        print("Tag value should be equal to version with `v` in the beginning")  # noqa
+        return False
+
+    new_key, old_key = version_key(version), version_key(old_version)
+    if new_key is None or old_key is None:
+        print("Versions should look like <major>.<minor>[.<patch>][rc<N>]")  # noqa
+        return False
+
+    if new_key <= old_key:
+        print("New version should be greater than the old one")  # noqa
+        return False
+
+    return True
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--branch", help="Git branch to check its version", choices=["develop", "master"], required=True)
+    parser.add_argument("--tag", help="Tag of the release", type=str, required=True)
+    parser.add_argument("--pre_release", help="Whether the release is marked as a pre-release", choices=["true", "false"])
+    parser.add_argument("--new_version", help="New release version", type=str, required=True)
+    parser.add_argument("--old_version", help="Previous release version", type=str, required=True)
+    args = parser.parse_args()
+
+    print(f"Old version: {args.old_version}, new version: {args.new_version}, "
+          f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}")  # noqa
+
+    master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
+    develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$")
+
+    correct = False
+    if args.branch == "develop":
+        correct = is_correct_version(args.new_version, args.tag, args.old_version, develop_version_pattern)
+
+        # the develop pattern has matched at this point, so the part after "rc" is a number
+        if correct and int(args.new_version.split("rc")[1]) == 0:
+            correct = False
+            print("Numeration for 'rc' should start from 1")  # noqa
+
+        if args.pre_release == "false":
+            correct = False
+            print("Only pre-releases allowed on develop")  # noqa
+
+    if args.branch == "master":
+        correct = is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern)
+
+        if args.pre_release == "true":
+            correct = False
+            print("Pre-releases are not allowed on master")  # noqa
+
+    if not correct:
+        # exit explicitly instead of `assert correct`: assertions are stripped under `python -O`
+        sys.exit("Version is incorrect")
+    print("Version is correct")  # noqa
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 1e602915..c0534921 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -19,10 +19,14 @@ jobs: - name: Install dependencies run: | + sudo apt-get install -y libreoffice python -m pip install --upgrade --no-cache-dir pip setuptools python -m pip install --exists-action=w --no-cache-dir -r requirements.txt python -m pip install --upgrade --upgrade-strategy eager --no-cache-dir .[torch,docs] - name: Build documentation # Build the documentation, you can use this command locally - run: python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build + run: | + python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build + cd docs/source/_static/code_examples + python dedoc_usage_tutorial.py diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 87c8cd53..14597d0d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,12 +1,8 @@ name: Publish to PyPI on: - push: - branches: - - develop - - master - paths: - - VERSION # publish only when version has been changed + release: + types: [published] jobs: # Publish the package to PyPI https://pypi.org @@ -14,22 +10,39 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repo - uses: actions/checkout@v2 + uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: '3.9' +# - name: Check version correctness +# run: | +# python3 .github/check_version.py 
--branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \ +# --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \ +# --pre_release ${{ github.event.release.prerelease }} + - name: Install dependencies run: | python3 -m pip install --upgrade pip pip3 install build twine - - name: Build and publish to PyPI # TODO change to pypi instead of test pypi + - name: Build and publish to PyPI + if: ${{ success() }} # publish only when version passed the checks env: - TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }} # TODO delete TEST_ in the name of the variable - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} # TODO delete TEST_ in the name of the variable + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python3 -m build -w twine check dist/* - twine upload --repository testpypi dist/* + twine upload --repository pypi dist/* + + - name: Push to dockerhub + if: ${{ success() }} + run: | + docker build -f docker/Dockerfile -t dedocproject/dedoc:$GITHUB_REF_NAME . 
+ docker login -u ${{ secrets.DOCKERHUB_USERNAME }} -p ${{ secrets.DOCKERHUB_PASSWORD }} + docker tag dedocproject/dedoc:$GITHUB_REF_NAME dedocproject/dedoc:latest + docker push dedocproject/dedoc:$GITHUB_REF_NAME + docker push dedocproject/dedoc:latest diff --git a/MANIFEST.in b/MANIFEST.in index 505b1f24..c5efea23 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ include dedoc/api/static/*/* -include dedoc/readers/scanned_reader/pdftxtlayer_reader/tabbypdf/jars/* +include dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/* include docs/* \ No newline at end of file diff --git a/README.md b/README.md index f01cd77a..f821b75d 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ docker-compose up --build Start Dedoc with tests: ```bash - tests="true" docker-compose up --build +test="true" docker-compose up --build ``` Now you can go to the localhost:1231 and look at the docs and examples. diff --git a/VERSION b/VERSION index 7e937f29..9a7d84f2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2023.05.26 \ No newline at end of file +0.9 \ No newline at end of file diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index aaaa28fd..0c383b3b 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -5,86 +5,96 @@ class QueryParameters(BaseModel): - language: Optional[str] + document_type: Optional[str] + structure_type: Optional[str] + return_format: Optional[str] + with_attachments: Optional[str] + need_content_analysis: Optional[str] + recursion_deep_attachments: Optional[str] + return_base64: Optional[str] + insert_table: Optional[str] - return_format: Optional[str] - structure_type: Optional[str] - delimiter: Optional[str] - encoding: Optional[str] - document_type: Optional[str] - pdf_with_text_layer: Optional[str] - pages: Optional[str] + need_pdf_table_analysis: Optional[str] + table_type: Optional[str] orient_analysis_cells: Optional[str] orient_cell_angle: Optional[str] + + pdf_with_text_layer: Optional[str] + language: 
Optional[str] + pages: Optional[str] is_one_column_document: Optional[str] document_orientation: Optional[str] - html_fields: Optional[str] - cloud_bucket: Optional[str] need_header_footer_analysis: Optional[str] need_binarization: Optional[str] - need_pdf_table_analysis: Optional[str] + + delimiter: Optional[str] + encoding: Optional[str] + html_fields: Optional[str] handle_invisible_table: Optional[str] - return_base64: Optional[str] - archive_as_single_file: Optional[str] - upload_attachments_into_cloud: Optional[str] - need_content_analysis: Optional[str] - recursion_deep_attachments: Optional[str] - table_type: Optional[str] def __init__(self, - language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), + # type of document structure parsing + document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), + structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), + return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), + + # attachments handling with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), + need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), + recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), + return_base64: Optional[str] = Body(description="returns images in base64 format. 
Default: 'false'", default=None), + + # tables handling insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), - return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), - structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), - delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), - encoding: Optional[str] = Body(description="a document encoding", default=None), - document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma", "article", "slide"], default=None), - pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), - pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), + need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), + table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. 
\"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), + + # pdf handling + pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), + language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), + pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), - document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 370) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), - html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), - cloud_bucket: Optional[str] = Body(description="a path (bucket) in the cloud storage mime. Default: ''", default=None), + document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. 
Default: 'auto'", enum=["auto", "no_change"], default=None), need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), - need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), + + # other formats handling + delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), + encoding: Optional[str] = Body(description="a document encoding", default=None), + html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), - return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), - archive_as_single_file: Optional[str] = Body(description="additional parameters for the archive reader. Default: 'true'", default=None), - upload_attachments_into_cloud: Optional[str] = Body(description="turn on if you need upload attachments into a cloud. Turn on if with_attachments=True and \"cloud_bucket\" not empty. Default: 'false'", default=None), - need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), - recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. 
Default: '10'", default=None), - table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), + + **data: Any) -> None: super().__init__(**data) - self.language: str = language or "rus+eng" + self.document_type: str = document_type or "" + self.structure_type: str = structure_type or 'tree' + self.return_format: str = return_format or 'json' + self.with_attachments: str = with_attachments or 'false' + self.need_content_analysis: str = need_content_analysis or 'false' + self.recursion_deep_attachments: str = recursion_deep_attachments or '10' + self.return_base64: str = return_base64 or 'false' + self.insert_table: str = insert_table or 'false' - self.return_format: str = return_format or 'json' - self.structure_type: str = structure_type or 'tree' - self.delimiter: str = delimiter - self.encoding: str = encoding - self.document_type: str = document_type or "" - self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby' - self.pages: str = pages or ':' + self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true' + self.table_type: str = table_type or '' self.orient_analysis_cells: str = orient_analysis_cells or 'false' self.orient_cell_angle: str = orient_cell_angle or "90" + + self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby' + self.language: str = language or "rus+eng" + self.pages: str = pages or ':' self.is_one_column_document: str = is_one_column_document or 'auto' self.document_orientation: str = document_orientation or "auto" - self.html_fields: str = html_fields or '' - self.cloud_bucket: str = cloud_bucket or '' self.need_header_footer_analysis: str = need_header_footer_analysis or 'false' self.need_binarization: str = need_binarization or 'false' - self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true' + + self.delimiter: str = delimiter + self.encoding: str = encoding + self.html_fields: str = html_fields or '' self.handle_invisible_table: str = 
handle_invisible_table or 'false' - self.return_base64: str = return_base64 or 'false' - self.archive_as_single_file: str = archive_as_single_file or 'true' - self.upload_attachments_into_cloud: str = upload_attachments_into_cloud or 'false' - self.need_content_analysis: str = need_content_analysis or 'false' - self.recursion_deep_attachments: str = recursion_deep_attachments or '10' - self.table_type: str = table_type or '' diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 0de2d4e6..0a8201b7 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -1,17 +1,27 @@ import importlib import os + import uvicorn +from fastapi import Response, FastAPI, Request, Depends, UploadFile, File from fastapi.responses import UJSONResponse, ORJSONResponse +from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse -from fastapi import Response, FastAPI, Request, Depends, UploadFile, File from dedoc.api.api_args import QueryParameters from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree -from dedoc.api.init_api import app, config, static_files_dirs, PORT, static_path from dedoc.common.exceptions.dedoc_exception import DedocException from dedoc.common.exceptions.missing_file_exception import MissingFileException +from dedoc.config import get_config from dedoc.manager.dedoc_thread_manager import DedocThreadedManager +config = get_config() +PORT = config["api_port"] +static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static/") +static_files_dirs = config.get("static_files_dirs") + +app = FastAPI() +app.mount('/static', StaticFiles(directory=config.get("static_path", static_path)), name="static") + module_api_args = importlib.import_module(config['import_path_init_api_args']) logger = config["logger"] version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "VERSION")) diff --git a/dedoc/api/init_api.py 
b/dedoc/api/init_api.py deleted file mode 100644 index 831107ed..00000000 --- a/dedoc/api/init_api.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -from fastapi import FastAPI -from fastapi.staticfiles import StaticFiles -from dedoc.config import get_config - -config = get_config() -PORT = config["api_port"] - -if "static_files_dirs" in config and config["static_files_dirs"] != {}: - static_path = os.path.abspath(config["static_files_dirs"]["online_docs"]) -else: - static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static/") - - -static_files_dirs = config.get("static_files_dirs") - -app = FastAPI() -app.mount('/static', StaticFiles(directory=config.get("static_path", static_path)), name="static") diff --git a/dedoc/api/static/html_eng/form_input.html b/dedoc/api/static/html_eng/form_input.html index f066c7c8..cdb39042 100644 --- a/dedoc/api/static/html_eng/form_input.html +++ b/dedoc/api/static/html_eng/form_input.html @@ -88,7 +88,6 @@

Structure Document Recognition

document_orientation

-