Skip to content

Commit

Permalink
feat: idsse-912: add optimization args to aws_cp() (#75)
Browse files Browse the repository at this point in the history
* add new --concurrency and --chunk_size args to aws_cp()
* defer to s5cmd cp defaults, instead of freezing defaults in this library
  • Loading branch information
mackenzie-grimes-noaa authored Sep 10, 2024
1 parent 7b7937a commit d750a8e
Showing 1 changed file with 25 additions and 4 deletions.
29 changes: 25 additions & 4 deletions python/idsse_common/idsse/common/aws_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,19 +65,40 @@ def aws_ls(self, path: str, prepend_path: bool = True) -> Sequence[str]:
return [os.path.join(path, filename.split(' ')[-1]) for filename in commands_result]
return [filename.split(' ')[-1] for filename in commands_result]

def aws_cp(self, path: str, dest: str) -> bool:
"""Execute an 'cp' on the AWS s3 bucket specified by path, dest
def aws_cp(self,
path: str,
dest: str,
concurrency: int | None = None,
chunk_size: int | None = None) -> bool:
"""Execute an 'cp' on the AWS s3 bucket specified by path, dest. Attempts to use
[s5cmd](https://github.com/peak/s5cmd) to copy the file from S3 with parallelization,
but falls back to (slower) aws-cli if s5cmd is not installed or throws an error.
Args:
path (str): Relative or Absolute path to the object to be copied
dest (str): The destination location
concurrency (optional, int): Number of parallel threads for s5cmd to use to copy
the file down from AWS (may be helpful to tweak for large files).
Default is None (s5cmd default).
chunk_size (optional, int): Size of the chunks (in MB) that s5cmd splits the source AWS
S3 file into, so that it can be downloaded more quickly across multiple threads.
Default is None (s5cmd default).
Returns:
bool: Returns True if copy is successful
"""
try:
logger.debug('First attempt with s5cmd')
commands = ['s5cmd', '--no-sign-request', 'cp', path, dest]
logger.debug('First attempt with s5cmd, concurrency: %d, chunk_size: %s',
concurrency, chunk_size)
commands = ['s5cmd', '--no-sign-request', 'cp']

# if concurrency and/or chunk_size options were provided, append to s5cmd before paths
if concurrency:
commands += ['--concurrency', concurrency]
if chunk_size:
commands += ['--part_size', chunk_size]
commands += [path, dest] # finish the command list with the src and destination

exec_cmd(commands)
return True
except FileNotFoundError:
Expand Down

0 comments on commit d750a8e

Please sign in to comment.