diff --git a/README.md b/README.md index 320d7592..7d540dd8 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,15 @@ deepset Cloud is powered by Haystack, an open source framework for building end- - [Project website](https://haystack.deepset.ai/) - [GitHub repository](https://github.com/deepset-ai/haystack) + +--- + +# Licenses + +The SDK is licensed under Apache 2.0, you can see the license [here](https://github.com/deepset-ai/deepset-cloud-sdk/blob/main/LICENSE) + +We use several libraries in this SDK that are licensed under the [MPL 2.0 license](https://www.mozilla.org/en-US/MPL/2.0/) + +- [tqdm](https://github.com/tqdm/tqdm) for progress bars +- [pathspec](https://github.com/cpburnz/python-pathspec) for pattern matching file paths +- [certifi](https://github.com/certifi/python-certifi) for validating trustworthiness of SSL certificates diff --git a/deepset_cloud_sdk/__about__.py b/deepset_cloud_sdk/__about__.py index f7fc0f25..436e66d8 100644 --- a/deepset_cloud_sdk/__about__.py +++ b/deepset_cloud_sdk/__about__.py @@ -1,4 +1,4 @@ """This file defines the package version.""" # Will be automatically overridden during the release process # It's okay if this is outdated in the repo. We will use the tag from the release as the version. 
-__version__ = "0.0.35" +__version__ = "1.0.2" diff --git a/deepset_cloud_sdk/_api/files.py b/deepset_cloud_sdk/_api/files.py index cd3c3cb7..13790a40 100644 --- a/deepset_cloud_sdk/_api/files.py +++ b/deepset_cloud_sdk/_api/files.py @@ -18,6 +18,7 @@ from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI from deepset_cloud_sdk._api.upload_sessions import WriteMode +from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES from deepset_cloud_sdk._utils.datetime import from_isoformat logger = structlog.get_logger(__name__) @@ -204,15 +205,15 @@ async def direct_upload_path( file_id: UUID = UUID(response.json()["file_id"]) return file_id - async def direct_upload_text( + async def direct_upload_in_memory( self, workspace_name: str, - text: str, + content: Union[bytes, str], file_name: str, meta: Optional[Dict[str, Any]] = None, write_mode: WriteMode = WriteMode.KEEP, ) -> UUID: - """Directly upload text to deepset Cloud. + """Directly upload files to deepset Cloud. :param workspace_name: Name of the workspace to use. :param text: File text to upload. @@ -225,17 +226,20 @@ async def direct_upload_text( FAIL - fails to upload the file with the same name. :return: ID of the uploaded file. """ - if not file_name.endswith(".txt"): + file_name_suffix = f".{file_name.split('.')[-1]}" + if file_name_suffix not in SUPPORTED_TYPE_SUFFIXES: raise NotMatchingFileTypeException( - f"File name {file_name} is not a textfile. Please use '.txt' for text uploads." + f"File name {file_name} is not a supported file type. Please use one of {', '.join(SUPPORTED_TYPE_SUFFIXES)} for uploads."
) response = await self._deepset_cloud_api.post( workspace_name, "files", - data={"text": text, "meta": json.dumps(meta)}, - params={"write_mode": write_mode.value, "file_name": file_name}, + files={"file": (file_name, content)}, + data={"meta": json.dumps(meta)}, + params={"write_mode": write_mode.value}, ) + if response.status_code != codes.CREATED or response.json().get("file_id") is None: raise FailedToUploadFileException( f"Failed to upload file with status code {response.status_code}. response was: {response.text}" diff --git a/deepset_cloud_sdk/_s3/upload.py b/deepset_cloud_sdk/_s3/upload.py index 0469630e..68e77c3a 100644 --- a/deepset_cloud_sdk/_s3/upload.py +++ b/deepset_cloud_sdk/_s3/upload.py @@ -1,12 +1,11 @@ """Module for upload-related S3 operations.""" import asyncio -import json import os import re from dataclasses import dataclass from http import HTTPStatus from pathlib import Path -from typing import Any, Coroutine, List, Optional, Union +from typing import Any, Coroutine, List, Optional, Sequence, Union import aiofiles import aiohttp @@ -19,7 +18,7 @@ AWSPrefixedRequestConfig, UploadSession, ) -from deepset_cloud_sdk.models import DeepsetCloudFile +from deepset_cloud_sdk.models import DeepsetCloudFileBase logger = structlog.get_logger(__name__) @@ -181,14 +180,14 @@ async def upload_from_file( ) return S3UploadResult(file_name=file_name, success=False, exception=exception) - async def upload_from_string( + async def upload_from_memory( self, file_name: str, upload_session: UploadSession, - content: str, + content: Union[bytes, str], client_session: aiohttp.ClientSession, ) -> S3UploadResult: - """Upload text to the prefixed S3 namespace. + """Upload content to the prefixed S3 namespace. :param file_name: Name of the file. :param upload_session: UploadSession to associate the upload with. 
@@ -267,13 +266,13 @@ async def upload_files_from_paths( result_summary = await self._process_results(tasks, show_progress=show_progress) return result_summary - async def upload_texts( - self, upload_session: UploadSession, files: List[DeepsetCloudFile], show_progress: bool = True + async def upload_in_memory( + self, upload_session: UploadSession, files: Sequence[DeepsetCloudFileBase], show_progress: bool = True ) -> S3UploadSummary: - """Upload a set of texts to the prefixed S3 namespace given a list of paths. + """Upload a set of files to the prefixed S3 namespace given a list of paths. :param upload_session: UploadSession to associate the upload with. - :param files: A list of DeepsetCloudFiles to upload. + :param files: A list of DeepsetCloudFileBase to upload. :param show_progress: Whether to show a progress bar on the upload. :return: S3UploadSummary object. """ @@ -283,13 +282,14 @@ async def upload_texts( for file in files: # raw data file_name = file.name - tasks.append(self.upload_from_string(file_name, upload_session, file.text, client_session)) + tasks.append(self.upload_from_memory(file_name, upload_session, file.content(), client_session)) # meta if file.meta is not None: meta_name = f"{file_name}.meta.json" - metadata = json.dumps(file.meta) - tasks.append(self.upload_from_string(meta_name, upload_session, metadata, client_session)) + tasks.append( + self.upload_from_memory(meta_name, upload_session, file.meta_as_string(), client_session) + ) result_summary = await self._process_results(tasks, show_progress=show_progress) diff --git a/deepset_cloud_sdk/_service/files_service.py b/deepset_cloud_sdk/_service/files_service.py index f05819d4..6efd44a7 100644 --- a/deepset_cloud_sdk/_service/files_service.py +++ b/deepset_cloud_sdk/_service/files_service.py @@ -9,7 +9,17 @@ from collections import defaultdict from contextlib import asynccontextmanager from pathlib import Path -from typing import Any, AsyncGenerator, Dict, List, Optional, Set, Tuple, 
Union +from typing import ( + Any, + AsyncGenerator, + Dict, + List, + Optional, + Sequence, + Set, + Tuple, + Union, +) from uuid import UUID import structlog @@ -31,11 +41,11 @@ WriteMode, ) from deepset_cloud_sdk._s3.upload import S3, S3UploadResult, S3UploadSummary -from deepset_cloud_sdk.models import DeepsetCloudFile +from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES +from deepset_cloud_sdk.models import DeepsetCloudFileBase logger = structlog.get_logger(__name__) -SUPPORTED_TYPE_SUFFIXES = [".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"] META_SUFFIX = ".meta.json" DIRECT_UPLOAD_THRESHOLD = 20 @@ -148,13 +158,18 @@ async def _wrapped_direct_upload_path( logger.error("Failed uploading file.", file_path=file_path, error=error) return S3UploadResult(file_name=file_path.name, success=False, exception=error) - async def _wrapped_direct_upload_text( - self, workspace_name: str, text: str, file_name: str, meta: Dict[str, Any], write_mode: WriteMode + async def _wrapped_direct_upload_in_memory( + self, + workspace_name: str, + content: Union[str, bytes], + file_name: str, + meta: Dict[str, Any], + write_mode: WriteMode, ) -> S3UploadResult: try: - await self._files.direct_upload_text( + await self._files.direct_upload_in_memory( workspace_name=workspace_name, - text=text, + content=content, meta=meta, file_name=file_name, write_mode=write_mode, @@ -543,10 +558,10 @@ async def download( if pbar is not None: pbar.close() - async def upload_texts( + async def upload_in_memory( self, workspace_name: str, - files: List[DeepsetCloudFile], + files: Sequence[DeepsetCloudFileBase], write_mode: WriteMode = WriteMode.KEEP, blocking: bool = True, timeout_s: Optional[int] = None, @@ -578,11 +593,11 @@ async def upload_texts( _coroutines = [] for file in files: _coroutines.append( - self._wrapped_direct_upload_text( + self._wrapped_direct_upload_in_memory( workspace_name=workspace_name, file_name=file.name, meta=file.meta or 
{}, - text=file.text, + content=file.content(), write_mode=write_mode, ) ) @@ -601,7 +616,7 @@ async def upload_texts( # create session to upload files to async with self._create_upload_session(workspace_name=workspace_name, write_mode=write_mode) as upload_session: - upload_summary = await self._s3.upload_texts( + upload_summary = await self._s3.upload_in_memory( upload_session=upload_session, files=files, show_progress=show_progress ) diff --git a/deepset_cloud_sdk/_utils/constants.py b/deepset_cloud_sdk/_utils/constants.py new file mode 100644 index 00000000..37a5c678 --- /dev/null +++ b/deepset_cloud_sdk/_utils/constants.py @@ -0,0 +1 @@ +SUPPORTED_TYPE_SUFFIXES = [".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"] diff --git a/deepset_cloud_sdk/models.py b/deepset_cloud_sdk/models.py index f75372ba..c8184f82 100644 --- a/deepset_cloud_sdk/models.py +++ b/deepset_cloud_sdk/models.py @@ -1,6 +1,8 @@ """General data classes for deepset Cloud SDK.""" +import json +from abc import abstractmethod from dataclasses import dataclass -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union from uuid import UUID @@ -13,10 +15,78 @@ class UserInfo: family_name: str -@dataclass -class DeepsetCloudFile: - """Data class for files in deepset Cloud.""" +class DeepsetCloudFileBase: # pylint: disable=too-few-public-methods + """Base class for deepset Cloud files.""" + + def __init__(self, name: str, meta: Optional[Dict[str, Any]] = None): + """ + Initialize DeepsetCloudFileBase. 
+ + :param name: The file name + :param meta: The file's metadata + """ + self.name = name + self.meta = meta + + @abstractmethod + def content(self) -> Union[str, bytes]: + """Return content.""" + raise NotImplementedError + + def meta_as_string(self) -> str: + """Return metadata as a string.""" + if self.meta: + return json.dumps(self.meta) + + return json.dumps({}) + + +class DeepsetCloudFile(DeepsetCloudFileBase): # pylint: disable=too-few-public-methods + """Data class for text files in deepset Cloud.""" + + def __init__(self, text: str, name: str, meta: Optional[Dict[str, Any]] = None): + """ + Initialize DeepsetCloudFileBase. + + :param name: The file name + :param text: The text content of the file + :param meta: The file's metadata + """ + super().__init__(name, meta) + self.text = text + + def content(self) -> str: + """ + Return the content of the file. + + :return: The text of the file. + """ + return self.text + + +# Didn't want to cause breaking changes in the DeepsetCloudFile class, though it +# is technically the same as the below, the naming of the text field will be confusing +# for users that are uploading anything other than text. + + +class DeepsetCloudFileBytes(DeepsetCloudFileBase): # pylint: disable=too-few-public-methods + """Data class for uploading files of any valid type in deepset Cloud.""" + + def __init__(self, file_bytes: bytes, name: str, meta: Optional[Dict[str, Any]] = None): + """ + Initialize DeepsetCloudFileBase. + + :param name: The file name + :param file_bytes: The content of the file represented in bytes + :param meta: The file's metadata + """ + super().__init__(name, meta) + self.file_bytes = file_bytes + + def content(self) -> bytes: + """ + Return the content of the file in bytes. - text: str - name: str - meta: Optional[Dict[str, Any]] = None + :return: The content of the file in bytes. 
+ """ + return self.file_bytes diff --git a/deepset_cloud_sdk/workflows/async_client/files.py b/deepset_cloud_sdk/workflows/async_client/files.py index 1df4b7ba..e062c786 100644 --- a/deepset_cloud_sdk/workflows/async_client/files.py +++ b/deepset_cloud_sdk/workflows/async_client/files.py @@ -19,7 +19,8 @@ WriteMode, ) from deepset_cloud_sdk._s3.upload import S3UploadSummary -from deepset_cloud_sdk._service.files_service import DeepsetCloudFile, FilesService +from deepset_cloud_sdk._service.files_service import FilesService +from deepset_cloud_sdk.models import DeepsetCloudFile, DeepsetCloudFileBytes def _get_config(api_key: Optional[str] = None, api_url: Optional[str] = None) -> CommonConfig: @@ -217,7 +218,7 @@ async def upload_texts( ) -> S3UploadSummary: """Upload raw texts to deepset Cloud. - :param files: List of DeepsetCloudFiles to upload. + :param files: List of DeepsetCloudFile objects to upload. :param api_key: deepset Cloud API key to use for authentication. :param api_url: API URL to use for authentication. :param workspace_name: Name of the workspace to upload the files to. It uses the workspace from the .ENV file by default. @@ -230,9 +231,71 @@ async def upload_texts( This may take a couple of minutes. :param timeout_s: Timeout in seconds for the `blocking` parameter. :param show_progress: Shows the upload progress. 
+ + Example: + ```python + import asyncio + from deepset_cloud_sdk.workflows.async_client.files import upload_texts, DeepsetCloudFile + + async def my_async_context() -> None: + await upload_texts( + api_key="", + workspace_name="", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used + files=[ + DeepsetCloudFile( + name="example.txt", + text="this is text", + meta={"key": "value"}, # optional + ) + ], + blocking=True, # optional, by default True + timeout_s=300, # optional, by default 300 + ) + + # Run the async function + if __name__ == "__main__": + asyncio.run(my_async_context()) + ``` + """ + async with FilesService.factory(_get_config(api_key=api_key, api_url=api_url)) as file_service: + return await file_service.upload_in_memory( + workspace_name=workspace_name, + files=files, + write_mode=write_mode, + blocking=blocking, + timeout_s=timeout_s, + show_progress=show_progress, + ) + + +async def upload_bytes( + files: List[DeepsetCloudFileBytes], + api_key: Optional[str] = None, + api_url: Optional[str] = None, + workspace_name: str = DEFAULT_WORKSPACE_NAME, + write_mode: WriteMode = WriteMode.KEEP, + blocking: bool = True, + timeout_s: Optional[int] = None, + show_progress: bool = True, +) -> S3UploadSummary: + """Upload files in byte format. + + :param files: List of DeepsetCloudFileBytes objects to upload. + :param api_key: deepset Cloud API key to use for authentication. + :param api_url: API URL to use for authentication. + :param workspace_name: Name of the workspace to upload the files to. It uses the workspace from the .ENV file by default. + :param write_mode: Specifies what to do when a file with the same name already exists in the workspace. + Possible options are: + KEEP - uploads the file with the same name and keeps both files in the workspace. + OVERWRITE - overwrites the file in the workspace with the file you're uploading. 
+ FAIL - fails to upload the file if a file with the same name already exists in the workspace. + :param blocking: Whether to wait for the files to be listed and displayed in deepset Cloud. + This may take a couple of minutes. + :param timeout_s: Timeout in seconds for the `blocking` parameter. + :param show_progress: Shows the upload progress. """ async with FilesService.factory(_get_config(api_key=api_key, api_url=api_url)) as file_service: - return await file_service.upload_texts( + return await file_service.upload_in_memory( workspace_name=workspace_name, files=files, write_mode=write_mode, diff --git a/deepset_cloud_sdk/workflows/sync_client/files.py b/deepset_cloud_sdk/workflows/sync_client/files.py index 71850c89..a52456bb 100644 --- a/deepset_cloud_sdk/workflows/sync_client/files.py +++ b/deepset_cloud_sdk/workflows/sync_client/files.py @@ -15,7 +15,7 @@ WriteMode, ) from deepset_cloud_sdk._s3.upload import S3UploadSummary -from deepset_cloud_sdk._service.files_service import DeepsetCloudFile +from deepset_cloud_sdk.models import DeepsetCloudFile, DeepsetCloudFileBytes from deepset_cloud_sdk.workflows.async_client.files import download as async_download from deepset_cloud_sdk.workflows.async_client.files import ( get_upload_session as async_get_upload_session, @@ -27,6 +27,9 @@ list_upload_sessions as async_list_upload_sessions, ) from deepset_cloud_sdk.workflows.async_client.files import upload as async_upload +from deepset_cloud_sdk.workflows.async_client.files import ( + upload_bytes as async_upload_bytes, +) from deepset_cloud_sdk.workflows.async_client.files import ( upload_texts as async_upload_texts, ) @@ -152,6 +155,25 @@ def upload_texts( :param blocking: Whether to wait for the files to be uploaded and listed in deepset Cloud. :param timeout_s: Timeout in seconds for the `blocking` parameter. :param show_progress: Shows the upload progress. 
+ + Example: + ```python + from deepset_cloud_sdk.workflows.sync_client.files import upload_texts, DeepsetCloudFile + + upload_texts( + api_key="", + workspace_name="", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used + files=[ + DeepsetCloudFile( + name="example.txt", + text="this is text", + meta={"key": "value"}, # optional + ) + ], + blocking=True, # optional, by default True + timeout_s=300, # optional, by default 300 + ) + ``` """ return asyncio.run( async_upload_texts( @@ -167,6 +189,45 @@ def upload_texts( ) +def upload_bytes( + files: List[DeepsetCloudFileBytes], + api_key: Optional[str] = None, + api_url: Optional[str] = None, + workspace_name: str = DEFAULT_WORKSPACE_NAME, + write_mode: WriteMode = WriteMode.KEEP, + blocking: bool = True, + timeout_s: Optional[int] = None, + show_progress: bool = True, +) -> S3UploadSummary: + """Upload any supported file types to deepset Cloud. These include .csv, .docx, .html, .json, .md, .txt, .pdf, .pptx, .xlsx and .xml. + + :param files: List of DeepsetCloudFileBytes to upload. + :param api_key: deepset Cloud API key to use for authentication. + :param api_url: API URL to use for authentication. + :param workspace_name: Name of the workspace to upload the files to. It uses the workspace from the .ENV file by default. + :param write_mode: Specifies what to do when a file with the same name already exists in the workspace. + Possible options are: + KEEP - uploads the file with the same name and keeps both files in the workspace. + OVERWRITE - overwrites the file that is in the workspace. + FAIL - fails to upload the file with the same name. + :param blocking: Whether to wait for the files to be uploaded and listed in deepset Cloud. + :param timeout_s: Timeout in seconds for the `blocking` parameter. + :param show_progress: Shows the upload progress.
+ """ + return asyncio.run( + async_upload_bytes( + files=files, + api_key=api_key, + api_url=api_url, + workspace_name=workspace_name, + write_mode=write_mode, + blocking=blocking, + timeout_s=timeout_s, + show_progress=show_progress, + ) + ) + + def get_upload_session( session_id: UUID, api_key: Optional[str] = None, diff --git a/pyproject.toml b/pyproject.toml index 8c175006..e26f9845 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles==23.2.1", "tabulate==0.9.0", "tqdm==4.66.4", - "yaspin==2.3.0", + "yaspin==3.0.0", "pyrate-limiter==3.6.0", ] diff --git a/tests/data/upload_folder/example.pdf b/tests/data/upload_folder/example.pdf index 3734363a..3d791561 100644 Binary files a/tests/data/upload_folder/example.pdf and b/tests/data/upload_folder/example.pdf differ diff --git a/tests/integration/service/test_integration_files_service.py b/tests/integration/service/test_integration_files_service.py index b9d4f296..42a9b25c 100644 --- a/tests/integration/service/test_integration_files_service.py +++ b/tests/integration/service/test_integration_files_service.py @@ -12,9 +12,9 @@ from deepset_cloud_sdk._service.files_service import ( META_SUFFIX, SUPPORTED_TYPE_SUFFIXES, - DeepsetCloudFile, FilesService, ) +from deepset_cloud_sdk.models import DeepsetCloudFile, DeepsetCloudFileBytes @pytest.mark.asyncio @@ -203,7 +203,10 @@ async def test_async_upload_multiple_file_types( file00_metadata = next((file.meta for file in uploaded_files if file.name == "file00.txt"), None) assert file00_metadata == {"file_name_duplicate_check": "file00.txt", "source": "multiple file types"} - async def test_upload_texts(self, integration_config: CommonConfig, workspace_name: str) -> None: + async def test_upload_in_memory(self, integration_config: CommonConfig, workspace_name: str) -> None: + with open(Path("./tests/test_data/multiple_file_types/file08.pdf"), "rb") as f: + pdf_contents = f.read() + async with FilesService.factory(integration_config) 
as file_service: files = [ DeepsetCloudFile("file1", "file1.txt", {"which": 1}), @@ -211,23 +214,28 @@ async def test_upload_texts(self, integration_config: CommonConfig, workspace_na DeepsetCloudFile("file3", "file3.txt", {"which": 3}), DeepsetCloudFile("file4", "file4.txt", {"which": 4}), DeepsetCloudFile("file5", "file5.txt", {"which": 5}), + DeepsetCloudFileBytes(file_bytes=pdf_contents, name="file6.pdf", meta={"which": 6}), ] - result = await file_service.upload_texts( + result = await file_service.upload_in_memory( workspace_name=workspace_name, files=files, blocking=True, write_mode=WriteMode.KEEP, timeout_s=120, ) - assert result.total_files == 5 - assert result.successful_upload_count == 5 + assert result.total_files == 6 + assert result.successful_upload_count == 6 assert result.failed_upload_count == 0 assert len(result.failed) == 0 - async def test_upload_texts_less_than_session_threshold( + async def test_upload_in_memory_less_than_session_threshold( self, integration_config: CommonConfig, workspace_name: str, monkeypatch: MonkeyPatch ) -> None: monkeypatch.setattr("deepset_cloud_sdk._service.files_service.DIRECT_UPLOAD_THRESHOLD", -1) + + with open(Path("./tests/test_data/multiple_file_types/file08.pdf"), "rb") as f: + pdf_contents = f.read() + async with FilesService.factory(integration_config) as file_service: files = [ DeepsetCloudFile("file1", "file1.txt", {"which": 1}), @@ -235,16 +243,17 @@ async def test_upload_texts_less_than_session_threshold( DeepsetCloudFile("file3", "file3.txt", {"which": 3}), DeepsetCloudFile("file4", "file4.txt", {"which": 4}), DeepsetCloudFile("file5", "file5.txt", {"which": 5}), + DeepsetCloudFileBytes(file_bytes=pdf_contents, name="file6.pdf", meta={"which": 6}), ] - result = await file_service.upload_texts( + result = await file_service.upload_in_memory( workspace_name=workspace_name, files=files, blocking=True, write_mode=WriteMode.KEEP, timeout_s=120, ) - assert result.total_files == 10 - assert 
result.successful_upload_count == 10 + assert result.total_files == 12 # 6 file contents, and 6 metadata + assert result.successful_upload_count == 12 assert result.failed_upload_count == 0 assert len(result.failed) == 0 diff --git a/tests/test_data/multiple_file_types/file08.pdf b/tests/test_data/multiple_file_types/file08.pdf index 3734363a..3d791561 100644 Binary files a/tests/test_data/multiple_file_types/file08.pdf and b/tests/test_data/multiple_file_types/file08.pdf differ diff --git a/tests/unit/api/test_files.py b/tests/unit/api/test_files.py index 5a3b8781..49e38eed 100644 --- a/tests/unit/api/test_files.py +++ b/tests/unit/api/test_files.py @@ -380,10 +380,10 @@ async def test_direct_upload_with_path_as_string(self, files_api: FilesAPI, mock class TestDirectUploadText: async def test_direct_upload_file_for_wrong_file_type_name(self, files_api: FilesAPI) -> None: with pytest.raises(NotMatchingFileTypeException): - await files_api.direct_upload_text( + await files_api.direct_upload_in_memory( workspace_name="test_workspace", - file_name="basic.json", - text="some text", + file_name="basic.xls", + content=b"some text", meta={}, ) @@ -395,10 +395,10 @@ async def test_direct_upload_file_failed( status_code=error_code, ) with pytest.raises(FailedToUploadFileException): - await files_api.direct_upload_text( + await files_api.direct_upload_in_memory( workspace_name="test_workspace", file_name="basic.txt", - text="some text", + content=b"some text", meta={}, ) @@ -407,10 +407,10 @@ async def test_direct_upload_file(self, files_api: FilesAPI, mocked_deepset_clou status_code=httpx.codes.CREATED, json={"file_id": "cd16435f-f6eb-423f-bf6f-994dc8a36a10"}, ) - file_id = await files_api.direct_upload_text( + file_id = await files_api.direct_upload_in_memory( workspace_name="test_workspace", file_name="basic.txt", - text="some text", + content=b"some text", meta={"key": "value"}, write_mode=WriteMode.OVERWRITE, ) @@ -418,9 +418,7 @@ async def test_direct_upload_file(self, 
files_api: FilesAPI, mocked_deepset_clou mocked_deepset_cloud_api.post.assert_called_once_with( "test_workspace", "files", - data={"text": "some text", "meta": json.dumps({"key": "value"})}, - params={ - "write_mode": "OVERWRITE", - "file_name": "basic.txt", - }, + files={"file": ("basic.txt", b"some text")}, + data={"meta": json.dumps({"key": "value"})}, + params={"write_mode": "OVERWRITE"}, ) diff --git a/tests/unit/s3/test_upload.py b/tests/unit/s3/test_upload.py index d02c90be..b88ec029 100644 --- a/tests/unit/s3/test_upload.py +++ b/tests/unit/s3/test_upload.py @@ -35,7 +35,7 @@ def test_make_safe_file_name(self, input_file_name: str, expected_file_name: str @pytest.mark.asyncio class TestS3: @patch.object(tqdm, "gather") - async def test_upload_texts_with_progress( + async def test_upload_in_memory_with_progress( self, tqdm_gather: Mock, post: Mock, upload_session_response: UploadSession ) -> None: s3 = S3() @@ -44,11 +44,11 @@ async def test_upload_texts_with_progress( DeepsetCloudFile("two.txt", "two"), DeepsetCloudFile("three.txt", "three"), ] - await s3.upload_texts(upload_session=upload_session_response, files=files, show_progress=True) + await s3.upload_in_memory(upload_session=upload_session_response, files=files, show_progress=True) assert tqdm_gather.call_count == 1 - async def test_upload_texts_with_progress_check_http_calls( + async def test_upload_in_memory_with_progress_check_http_calls( self, post: Mock, upload_session_response: UploadSession ) -> None: s3 = S3() @@ -57,12 +57,12 @@ async def test_upload_texts_with_progress_check_http_calls( DeepsetCloudFile("two.txt", "two"), DeepsetCloudFile("three.txt", "three"), ] - await s3.upload_texts(upload_session=upload_session_response, files=files, show_progress=True) + await s3.upload_in_memory(upload_session=upload_session_response, files=files, show_progress=True) assert post.call_count == 3 @patch.object(tqdm, "gather") - async def test_upload_texts_without_progress( + async def 
test_upload_in_memory_without_progress( self, tqdm_gather: Mock, post: Mock, upload_session_response: UploadSession ) -> None: s3 = S3() @@ -71,7 +71,7 @@ async def test_upload_texts_without_progress( DeepsetCloudFile("two.txt", "two"), DeepsetCloudFile("three.txt", "three"), ] - await s3.upload_texts(upload_session=upload_session_response, files=files, show_progress=False) + await s3.upload_in_memory(upload_session=upload_session_response, files=files, show_progress=False) assert tqdm_gather.call_count == 0 @@ -101,7 +101,7 @@ async def test_upload_rate(self, post: Mock, upload_session_response: UploadSess number_of_files_to_upload = 9000 files = [DeepsetCloudFile(name=f"{i}.txt", text=f"{i}") for i in range(number_of_files_to_upload)] start = time.monotonic() - await s3.upload_texts(upload_session_response, files) + await s3.upload_in_memory(upload_session_response, files) time_taken = time.monotonic() - start expected_time_taken = number_of_files_to_upload / rate.limit assert time_taken == pytest.approx(expected_time_taken, 1) @@ -155,7 +155,7 @@ async def test_upload_files_from_path_with_client_disconnect_error( assert [f.file_name for f in results.failed] == ["16675.txt", "16675.txt.meta.json"] assert all(isinstance(f.exception, RetryableHttpError) for f in results.failed) - async def test_upload_texts_http_error(self, upload_session_response: UploadSession) -> None: + async def test_upload_in_memory_http_error(self, upload_session_response: UploadSession) -> None: exception = aiohttp.ClientResponseError(request_info=Mock(), history=Mock(), status=503) with patch.object(aiohttp.ClientSession, "post", side_effect=exception): s3 = S3() @@ -166,7 +166,7 @@ async def test_upload_texts_http_error(self, upload_session_response: UploadSess DeepsetCloudFile(name="three.txt", text="3"), ] - results = await s3.upload_texts(upload_session_response, files) + results = await s3.upload_in_memory(upload_session_response, files) assert results.total_files == 3 assert 
results.successful_upload_count == 0 assert results.failed_upload_count == 3 @@ -179,7 +179,7 @@ async def test_upload_texts_http_error(self, upload_session_response: UploadSess ] assert all(isinstance(f.exception, RetryableHttpError) for f in results.failed) - async def test_upload_texts_with_metadata_http_error(self, upload_session_response: UploadSession) -> None: + async def test_upload_in_memory_with_metadata_http_error(self, upload_session_response: UploadSession) -> None: exception = aiohttp.ClientResponseError(request_info=Mock(), history=Mock(), status=503) with patch.object(aiohttp.ClientSession, "post", side_effect=exception): s3 = S3() @@ -190,7 +190,7 @@ async def test_upload_texts_with_metadata_http_error(self, upload_session_respon DeepsetCloudFile(name="three.txt", text="3", meta={"something": 3}), ] - results = await s3.upload_texts(upload_session_response, files) + results = await s3.upload_in_memory(upload_session_response, files) assert results.total_files == 6 assert results.successful_upload_count == 0 assert results.failed_upload_count == 6 diff --git a/tests/unit/service/test_files_service.py b/tests/unit/service/test_files_service.py index cfe4a23f..e884b8dd 100644 --- a/tests/unit/service/test_files_service.py +++ b/tests/unit/service/test_files_service.py @@ -30,10 +30,9 @@ from deepset_cloud_sdk._s3.upload import S3UploadResult, S3UploadSummary from deepset_cloud_sdk._service.files_service import ( SUPPORTED_TYPE_SUFFIXES, - DeepsetCloudFile, FilesService, ) -from deepset_cloud_sdk.models import UserInfo +from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo @pytest.fixture @@ -336,7 +335,7 @@ async def test_upload_paths_to_file( @pytest.mark.asyncio class TestUploadTexts: - async def test_upload_texts_via_sessions( + async def test_upload_in_memory_via_sessions( self, file_service: FilesService, mocked_upload_sessions_api: Mock, @@ -346,7 +345,7 @@ async def test_upload_texts_via_sessions( ) -> None: 
monkeypatch.setattr("deepset_cloud_sdk._service.files_service.DIRECT_UPLOAD_THRESHOLD", -1) upload_summary = S3UploadSummary(total_files=1, successful_upload_count=1, failed_upload_count=0, failed=[]) - mocked_s3.upload_texts.return_value = upload_summary + mocked_s3.upload_in_memory.return_value = upload_summary files = [ DeepsetCloudFile( name="test_file.txt", @@ -364,7 +363,7 @@ async def test_upload_texts_via_sessions( finished_files=1, ), ) - result = await file_service.upload_texts( + result = await file_service.upload_in_memory( workspace_name="test_workspace", files=files, write_mode=WriteMode.OVERWRITE, @@ -378,7 +377,7 @@ async def test_upload_texts_via_sessions( workspace_name="test_workspace", write_mode=WriteMode.OVERWRITE ) - mocked_s3.upload_texts.assert_called_once_with( + mocked_s3.upload_in_memory.assert_called_once_with( upload_session=upload_session_response, files=files, show_progress=False ) @@ -389,7 +388,7 @@ async def test_upload_texts_via_sessions( workspace_name="test_workspace", session_id=upload_session_response.session_id ) - async def test_upload_texts_via_sync_upload( + async def test_upload_in_memory_via_sync_upload( self, file_service: FilesService, mocked_upload_sessions_api: Mock, @@ -398,7 +397,7 @@ async def test_upload_texts_via_sync_upload( mocked_files_api: Mock, ) -> None: upload_summary = S3UploadSummary(total_files=1, successful_upload_count=1, failed_upload_count=0, failed=[]) - mocked_s3.upload_texts.return_value = upload_summary + mocked_s3.upload_in_memory.return_value = upload_summary files = [ DeepsetCloudFile( name="test_file.txt", @@ -416,7 +415,7 @@ async def test_upload_texts_via_sync_upload( finished_files=1, ), ) - result = await file_service.upload_texts( + result = await file_service.upload_in_memory( workspace_name="test_workspace", files=files, write_mode=WriteMode.OVERWRITE, @@ -428,9 +427,9 @@ async def test_upload_texts_via_sync_upload( assert not mocked_upload_sessions_api.create.called, "We should not 
have created a session for a single file" - mocked_files_api.direct_upload_text.assert_called_once_with( + mocked_files_api.direct_upload_in_memory.assert_called_once_with( workspace_name="test_workspace", - text="test content", + content="test content", meta={"test": "test"}, file_name="test_file.txt", write_mode=WriteMode.OVERWRITE, diff --git a/tests/unit/workflows/async_client/test_async_workflow_files.py b/tests/unit/workflows/async_client/test_async_workflow_files.py index b0e9f307..9a3641be 100644 --- a/tests/unit/workflows/async_client/test_async_workflow_files.py +++ b/tests/unit/workflows/async_client/test_async_workflow_files.py @@ -18,8 +18,8 @@ UploadSessionWriteModeEnum, WriteMode, ) -from deepset_cloud_sdk._service.files_service import DeepsetCloudFile, FilesService -from deepset_cloud_sdk.models import UserInfo +from deepset_cloud_sdk._service.files_service import FilesService +from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo from deepset_cloud_sdk.workflows.async_client.files import ( download, get_upload_session, @@ -90,7 +90,7 @@ async def test_upload_with_timeout(self, monkeypatch: MonkeyPatch) -> None: async def test_upload_texts(self, monkeypatch: MonkeyPatch) -> None: mocked_upload_texts = AsyncMock(return_value=None) - monkeypatch.setattr(FilesService, "upload_texts", mocked_upload_texts) + monkeypatch.setattr(FilesService, "upload_in_memory", mocked_upload_texts) files = [ DeepsetCloudFile( name="test_file.txt", diff --git a/tests/unit/workflows/sync_client/test_sync_workflow_files.py b/tests/unit/workflows/sync_client/test_sync_workflow_files.py index e2909c65..130c0002 100644 --- a/tests/unit/workflows/sync_client/test_sync_workflow_files.py +++ b/tests/unit/workflows/sync_client/test_sync_workflow_files.py @@ -14,8 +14,7 @@ UploadSessionWriteModeEnum, WriteMode, ) -from deepset_cloud_sdk._service.files_service import DeepsetCloudFile -from deepset_cloud_sdk.models import UserInfo +from deepset_cloud_sdk.models import 
DeepsetCloudFile, UserInfo from deepset_cloud_sdk.workflows.sync_client.files import ( download, get_upload_session,