Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into potter/all-doc-types
Browse files Browse the repository at this point in the history
  • Loading branch information
potter-potter committed Jun 27, 2024
2 parents c6ec797 + 137b149 commit 694bc68
Show file tree
Hide file tree
Showing 38 changed files with 12,935 additions and 454 deletions.
8 changes: 3 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
## 0.14.9-dev7
## 0.14.9-dev9

### Enhancements

* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used
to visualize the result of the OD model and dump the result to a file.
Additionally, the visualization of bounding boxes of each layout source is rendered and saved
for each page.
* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page.
* **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" error.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix.

### Features

Expand Down
4 changes: 1 addition & 3 deletions requirements/deps/constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,7 @@ fsspec==2024.5.0
# python 3.12 support
wrapt>=1.14.0


# NOTE(robinson): for compatibility with voyage embeddings
langsmith==0.1.62
langchain-community>=0.2.5

# NOTE(robinson): chroma was pinned to importlib-metadata>=7.1.0 but 7.1.0 was installed
# instead of 7.2.0. Need to investigate
Expand Down
4 changes: 2 additions & 2 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ comm==0.2.2
# via
# ipykernel
# ipywidgets
debugpy==1.8.1
debugpy==1.8.2
# via ipykernel
decorator==5.1.1
# via ipython
Expand Down Expand Up @@ -186,7 +186,7 @@ jupyter-server==2.14.1
# notebook-shim
jupyter-server-terminals==0.5.3
# via jupyter-server
jupyterlab==4.2.2
jupyterlab==4.2.3
# via notebook
jupyterlab-pygments==0.3.0
# via nbconvert
Expand Down
15 changes: 8 additions & 7 deletions requirements/ingest/embed-aws-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,21 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.3
langchain==0.2.6
# via langchain-community
langchain-community==0.2.4
# via -r ./ingest/embed-aws-bedrock.in
langchain-core==0.2.2
langchain-community==0.2.6
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-aws-bedrock.in
langchain-core==0.2.10
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.1
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.62
langsmith==0.1.82
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
Expand Down
15 changes: 8 additions & 7 deletions requirements/ingest/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,21 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.3
langchain==0.2.6
# via langchain-community
langchain-community==0.2.4
# via -r ./ingest/embed-huggingface.in
langchain-core==0.2.2
langchain-community==0.2.6
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-huggingface.in
langchain-core==0.2.10
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.1
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.62
langsmith==0.1.82
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/embed-octoai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ idna==3.7
# anyio
# httpx
# requests
openai==1.35.3
openai==1.35.5
# via -r ./ingest/embed-octoai.in
pydantic==2.7.4
# via openai
Expand Down
17 changes: 9 additions & 8 deletions requirements/ingest/embed-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,21 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.3
langchain==0.2.6
# via langchain-community
langchain-community==0.2.4
# via -r ./ingest/embed-openai.in
langchain-core==0.2.2
langchain-community==0.2.6
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-openai.in
langchain-core==0.2.10
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.1
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.62
langsmith==0.1.82
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
Expand All @@ -97,7 +98,7 @@ numpy==1.26.4
# -c ./ingest/../base.txt
# langchain
# langchain-community
openai==1.35.3
openai==1.35.5
# via -r ./ingest/embed-openai.in
orjson==3.10.5
# via langsmith
Expand Down
17 changes: 9 additions & 8 deletions requirements/ingest/embed-vertexai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,25 +100,26 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.3
langchain==0.2.6
# via
# -r ./ingest/embed-vertexai.in
# langchain-community
langchain-community==0.2.4
# via -r ./ingest/embed-vertexai.in
langchain-core==0.2.2
langchain-community==0.2.6
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-vertexai.in
langchain-core==0.2.10
# via
# langchain
# langchain-community
# langchain-google-vertexai
# langchain-text-splitters
langchain-google-vertexai==1.0.5
langchain-google-vertexai==1.0.6
# via -r ./ingest/embed-vertexai.in
langchain-text-splitters==0.2.1
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.62
langsmith==0.1.82
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
Expand Down
9 changes: 4 additions & 5 deletions requirements/ingest/embed-voyageai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,19 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.3
langchain==0.2.6
# via -r ./ingest/embed-voyageai.in
langchain-core==0.2.2
langchain-core==0.2.10
# via
# langchain
# langchain-text-splitters
# langchain-voyageai
langchain-text-splitters==0.2.1
langchain-text-splitters==0.2.2
# via langchain
langchain-voyageai==0.1.1
# via -r ./ingest/embed-voyageai.in
langsmith==0.1.62
langsmith==0.1.82
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-core
multidict==6.0.5
Expand Down
39 changes: 35 additions & 4 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]:
class DescribeDocxPartitionerOptions:
"""Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""

# -- .load() ---------------------------------

def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("simple.docx")

opts = DocxPartitionerOptions.load(**opts_args)

assert isinstance(opts, DocxPartitionerOptions)

def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]):
with pytest.raises(ValueError, match="no DOCX document specified, "):
DocxPartitionerOptions.load(**opts_args)

# -- .document -------------------------------

def it_loads_the_docx_document(
Expand Down Expand Up @@ -1024,13 +1037,31 @@ def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
assert isinstance(docx_file, io.BytesIO)
assert docx_file.getvalue() == b"abcdefg"

def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
# -- ._validate() ----------------------------

def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]):
opts_args["file_path"] = "l/m/n.docx"
with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"):
DocxPartitionerOptions.load(**opts_args)

def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive(
self, opts_args: dict[str, Any]
):
opts = DocxPartitionerOptions(**opts_args)
opts_args["file_path"] = example_doc_path("simple.doc")
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
DocxPartitionerOptions.load(**opts_args)

with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
opts._docx_file
def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive(
self, opts_args: dict[str, Any]
):
with open(example_doc_path("simple.doc"), "rb") as f:
opts_args["file"] = f
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
DocxPartitionerOptions.load(**opts_args)

def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]):
with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "):
DocxPartitionerOptions.load(**opts_args)

# -- fixtures --------------------------------------------------------------------------------

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
Loading

0 comments on commit 694bc68

Please sign in to comment.