Skip to content

Commit

Permalink
Merge pull request #5 from weni-ai/feature/loaders
Browse files Browse the repository at this point in the history
add: document loaders
  • Loading branch information
AlisoSouza authored Jan 31, 2024
2 parents 41fb300 + 823ecfd commit ec1c24b
Show file tree
Hide file tree
Showing 4 changed files with 985 additions and 30 deletions.
52 changes: 52 additions & 0 deletions app/loaders/loaders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from langchain.document_loaders import (
TextLoader, PyPDFLoader, UnstructuredExcelLoader,
UnstructuredWordDocumentLoader, Docx2txtLoader
)
from typing import Callable, List
from langchain.schema.document import Document


class DataLoader:
    """Pair a loader callable with the file it should read.

    The loader is any callable that takes a file path and returns a list
    of documents exposing a ``page_content`` attribute (for example the
    ``*_loader`` functions of this module).
    """

    def __init__(
        self, loader: Callable[[str], List[Document]], file: str
    ) -> None:
        """Store the loader and the file path; nothing is read yet.

        Args:
            loader: Callable invoked as ``loader(file)`` by :meth:`load`.
            file: Path handed to ``loader``.
        """
        self.loader = loader
        self.file = file

    def load(self) -> List[Document]:
        """Return the result of calling the stored loader on the file."""
        return self.loader(self.file)

    def raw_text(self) -> str:
        """Return the lower-cased text of all loaded pages as one string.

        Pages are concatenated in order with no separator. Pages whose
        ``page_content`` is empty (or otherwise falsy) are skipped.
        """
        # A single join avoids rebuilding the string once per page.
        return "".join(
            page.page_content.lower()
            for page in self.load()
            if page.page_content
        )


def txt_loader(file: str) -> List[Document]:
    """Load a plain-text file with ``TextLoader``.

    Args:
        file: Path of the text file.

    Returns:
        The documents produced by ``TextLoader(file).load()``.
    """
    return TextLoader(file).load()


def pdf_loader(file: str) -> List[Document]:
    """Load a PDF file with ``PyPDFLoader`` and split it into documents.

    Args:
        file: Path of the PDF file.

    Returns:
        The documents produced by ``PyPDFLoader(file).load_and_split()``.
    """
    return PyPDFLoader(file).load_and_split()


def docx_loader(file: str) -> List[Document]:
    """Load a .docx file with ``Docx2txtLoader``.

    Args:
        file: Path of the Word document.

    Returns:
        The documents produced by ``Docx2txtLoader(file).load()``.
    """
    return Docx2txtLoader(file).load()


def u_docx_loader(file: str) -> List[Document]:
    """Same as docx_loader but using Unstructured.

    Args:
        file: Path of the Word document.

    Returns:
        The documents produced by
        ``UnstructuredWordDocumentLoader(file).load()``.
    """
    return UnstructuredWordDocumentLoader(file).load()


def xlsx_loader(file: str) -> List[Document]:
    """Loads .xlsx and .xls files.

    Args:
        file: Path of the spreadsheet.

    Returns:
        The documents produced by ``UnstructuredExcelLoader.load()``.
    """
    # NOTE(review): mode="elements" is passed straight to the Unstructured
    # loader; see the LangChain loader docs for how it shapes the output.
    return UnstructuredExcelLoader(file, mode="elements").load()
110 changes: 110 additions & 0 deletions app/tests/test_document_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import unittest
from app.loaders.loaders import (
DataLoader,
pdf_loader,
txt_loader,
docx_loader,
u_docx_loader,
xlsx_loader
)
import os
import pathlib
import shutil
import xlsxwriter
from docx import Document
from reportlab.pdfgen.canvas import Canvas


class TestDocumentLoader(unittest.TestCase):
    """Round-trip tests for the document loaders.

    ``setUpClass`` writes one small fixture per supported format into a
    ``test_files`` directory next to this module, each test reads its
    fixture back through ``DataLoader``, and ``tearDownClass`` removes
    the directory again.
    """

    test_dir = pathlib.Path(__file__).parent.resolve()
    path = os.path.join(test_dir, 'test_files')
    text_string = 'Hello, World!'
    file_name = 'test_file'

    @staticmethod
    def _create_test_dir():
        """Create the fixture directory if it does not exist yet."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(TestDocumentLoader.path, exist_ok=True)

    @staticmethod
    def _create_pdf(path: str, name: str, text: str):
        """Write ``<path>/<name>.pdf`` containing ``text`` as one string."""
        canvas = Canvas(os.path.join(path, f'{name}.pdf'))
        canvas.drawString(72, 72, text)
        canvas.save()

    @staticmethod
    def _create_txt(path: str, name: str, text: str):
        """Write ``<path>/<name>.txt`` containing ``text``."""
        # No explicit encoding: txt_loader is also called without one, so
        # both sides use the platform default.
        with open(os.path.join(path, f'{name}.txt'), 'w') as f:
            f.write(text)

    @staticmethod
    def _create_docx(path: str, name: str, text: str):
        """Write ``<path>/<name>.docx`` with ``text`` as its only paragraph."""
        document = Document()
        document.add_paragraph(text, style='Intense Quote')
        document.save(os.path.join(path, f'{name}.docx'))

    @staticmethod
    def _create_xlsx(path: str, name: str):
        """Write ``<path>/<name>.xlsx`` with a two-column label/score sheet."""
        workbook = xlsxwriter.Workbook(os.path.join(path, f'{name}.xlsx'))
        worksheet = workbook.add_worksheet("Test sheet")
        scores = (
            ['Lorem', 4576],
            ['Ipsum', 345],
            ['Dolor', 9088],
            ['Sit', 88],
            ['Amet', 15],
        )
        # Column 0 holds the label, column 1 the score; one row per entry.
        for row, (label, score) in enumerate(scores):
            worksheet.write(row, 0, label)
            worksheet.write(row, 1, score)
        workbook.close()

    @classmethod
    def setUpClass(cls) -> None:
        """Create the fixture directory and one file per tested format."""
        cls._create_test_dir()
        cls._create_pdf(cls.path, cls.file_name, cls.text_string)
        cls._create_txt(cls.path, cls.file_name, cls.text_string)
        cls._create_docx(cls.path, cls.file_name, cls.text_string)
        cls._create_xlsx(cls.path, cls.file_name)

    def _raw_text_for(self, loader, extension: str) -> str:
        """Return ``DataLoader.raw_text()`` for the fixture with ``extension``."""
        file_path = os.path.join(self.path, f'{self.file_name}.{extension}')
        return DataLoader(loader, file_path).raw_text()

    def test_load_pdf(self):
        raw_text = self._raw_text_for(pdf_loader, 'pdf')
        self.assertEqual(raw_text, self.text_string.lower())

    def test_load_txt(self):
        raw_text = self._raw_text_for(txt_loader, 'txt')
        self.assertEqual(raw_text, self.text_string.lower())

    def test_load_udocx(self):
        raw_text = self._raw_text_for(u_docx_loader, 'docx')
        self.assertEqual(raw_text, self.text_string.lower())

    def test_load_docx(self):
        raw_text = self._raw_text_for(docx_loader, 'docx')
        self.assertEqual(raw_text, self.text_string.lower())

    def test_load_xlsx(self):
        # Only the type is checked; the exact text depends on the loader.
        raw_text = self._raw_text_for(xlsx_loader, 'xlsx')
        self.assertIsInstance(raw_text, str)

    @classmethod
    def tearDownClass(cls):
        """Remove the fixture directory and everything in it."""
        # ignore_errors=True already covers a missing directory.
        shutil.rmtree(cls.path, ignore_errors=True)
Loading

0 comments on commit ec1c24b

Please sign in to comment.