You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
It seems that the `texts` field of a dataclass in the unstructured_inference.inference.elements module has a numpy.ndarray as its default value, which dataclasses reject as a mutable default. The error suggests that a default_factory should be used instead.
Has anyone encountered this issue or found a solution for handling mutable default errors with dataclasses in the unstructured package?
I'm encountering an error while extracting text from a PDF (hierarchy.pdf) using the unstructured library.
Exception: ValueError
Message: mutable default <class 'numpy.ndarray'> for field texts is not allowed: use default_factory
Code I was running:
import importlib.metadata
import traceback

from langchain_community.document_loaders import UnstructuredPDFLoader

# Distributions whose installed versions are printed before extraction starts.
_REPORTED_PACKAGES = (
    'numpy',
    'unstructured',
    'unstructured-inference',
    'langchain',
    'opencv-python',
    'pandas',
    'scipy',
)


def _installed_versions(packages):
    """Map each distribution name to its installed version string.

    A distribution that is not installed is reported as 'not installed'
    instead of raising, so a missing optional package cannot abort the
    extraction before it has even started.
    """
    versions = {}
    for name in packages:
        try:
            versions[name] = importlib.metadata.version(name)
        except importlib.metadata.PackageNotFoundError:
            versions[name] = 'not installed'
    return versions


def extract_using_unstructured(file_path):
    """
    Extract and process text from a PDF file using Unstructured via LangChain.

    Args:
        file_path (str): The path to the PDF file.

    Returns:
        str: The page contents of all loaded documents joined with newlines.

    Raises:
        Exception: Any error raised while loading the PDF is printed with its
            full traceback and then re-raised unchanged.
    """
    # Print version info for key libraries (diagnostic only, never fatal).
    print(f"Library versions: {_installed_versions(_REPORTED_PACKAGES)}")

    try:
        # Initialize the loader with the file path
        loader = UnstructuredPDFLoader(file_path)
        # Load the document
        docs = loader.load()
        # Extract text from all pages
        return "\n".join(doc.page_content for doc in docs)
    except Exception as e:
        # Get the full exception traceback
        tb = traceback.format_exc()
        # Log the detailed error
        print(f"Error extracting text from PDF {file_path}:\n"
              f"Exception: {type(e).__name__}\n"
              f"Message: {str(e)}\n"
              f"Traceback:\n{tb}")
        # Bare raise re-raises the active exception with its original traceback.
        raise
Traceback:
Traceback (most recent call last): File "C:\Source\Utils\pdfparserstest.py", line 44, in extract_using_unstructured docs = loader.load() File "Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\langchain_core\document_loaders\base.py", line 31, in load return list(self.lazy_load()) File "Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\langchain_community\document_loaders\unstructured.py", line 107, in lazy_load elements = self._get_elements() File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\langchain_community\document_loaders\pdf.py", line 72, in _get_elements from unstructured.partition.pdf import partition_pdf File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\unstructured\partition\pdf.py", line 56, in <module> from unstructured.partition.pdf_image.analysis.layout_dump import ( File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\unstructured\partition\pdf_image\analysis\layout_dump.py", line 7, in <module> from unstructured_inference.inference.elements import ImageTextRegion, TextRegion File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\unstructured_inference\inference\elements.py", line 209, in <module> @dataclass File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 1232, in dataclass return wrap(cls) File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 1222, in wrap return _process_class(cls, init, repr, eq, order, 
unsafe_hash, File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 958, in _process_class cls_fields.append(_get_field(cls, name, type, kw_only)) File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 815, in _get_field raise ValueError(f'mutable default {type(f.default)} for field '
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
-
Issue:
It seems that the `texts` field of a dataclass in the unstructured_inference.inference.elements module has a numpy.ndarray as its default value, which dataclasses reject as a mutable default. The error suggests that a default_factory should be used instead.
Has anyone encountered this issue or found a solution for handling mutable default errors with dataclasses in the unstructured package?
Any guidance would be appreciated!
Library versions:
{'numpy': '1.26.4', 'unstructured': '0.15.13', 'unstructured-inference': '0.7.37', 'langchain': '0.3.1', 'opencv-python': '4.10.0.84', 'pandas': '2.2.3', 'scipy': '1.14.1'}
Detailed Error
I'm encountering an error while extracting text from a PDF (hierarchy.pdf) using the unstructured library.
Code I was running:
`
from langchain_community.document_loaders import UnstructuredPDFLoader
def extract_using_unstructured(file_path):
"""
Extract and process text from a PDF file using Unstructured via LangChain.
Traceback:
Traceback (most recent call last): File "C:\Source\Utils\pdfparserstest.py", line 44, in extract_using_unstructured docs = loader.load() File "Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\langchain_core\document_loaders\base.py", line 31, in load return list(self.lazy_load()) File "Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\langchain_community\document_loaders\unstructured.py", line 107, in lazy_load elements = self._get_elements() File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\langchain_community\document_loaders\pdf.py", line 72, in _get_elements from unstructured.partition.pdf import partition_pdf File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\unstructured\partition\pdf.py", line 56, in <module> from unstructured.partition.pdf_image.analysis.layout_dump import ( File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\unstructured\partition\pdf_image\analysis\layout_dump.py", line 7, in <module> from unstructured_inference.inference.elements import ImageTextRegion, TextRegion File "C:\Users\sudie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\unstructured_inference\inference\elements.py", line 209, in <module> @dataclass File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 1232, in dataclass return wrap(cls) File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 1222, in wrap return _process_class(cls, init, repr, eq, order, 
unsafe_hash, File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 958, in _process_class cls_fields.append(_get_field(cls, name, type, kw_only)) File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\dataclasses.py", line 815, in _get_field raise ValueError(f'mutable default {type(f.default)} for field '
Beta Was this translation helpful? Give feedback.
All reactions