Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issues 8 and 10 #40

Merged
merged 15 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
TEMPO_*.nc
tests/
pytest.ini
.git
.gitignore
.travis.yml
.swagger-codegen-ignore
tox.ini

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
venv/
.python-version

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

#Ipython Notebook
.ipynb_checkpoints

# Intellij project settings
.idea

# VSCode project settings
.vscode

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# ruff
.ruff_cache/
# pytest
.pytest_cache/

# Pyre type checker
.pyre/

# Github integration settings
.github

# .dockerignore and builder script themselves are not needed by
# docker build.
.dockerignore
build-service
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,3 @@ repos:
- id: yamllint
args: ["-d {extends: relaxed, rules: {line-length: {max: 120}}}"]
stages: [commit, push]

- repo: https://github.com/pryorda/dockerfilelint-precommit-hooks
rev: v0.1.0
hooks:
- id: dockerfilelint
stages: [commit, push]
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- [PR #1](https://github.com/danielfromearth/stitchee/pull/1): An initial GitHub Actions workflow
- [Issue #8](https://github.com/danielfromearth/stitchee/issues/8): Create working Docker image
- [Issue #10](https://github.com/danielfromearth/stitchee/issues/10): Add code necessary to communicate with Harmony
### Changed
- [PR #12](https://github.com/danielfromearth/stitchee/pull/12): Changed name to "stitchee"
- [PR #15](https://github.com/danielfromearth/stitchee/pull/15): Use ruff+black chain for pre-commit lint & format
Expand Down
47 changes: 47 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@

FROM python:3.10-slim

RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
gcc \
libnetcdf-dev \
#libhdf5-dev \
#hdf5-helpers \
&& pip3 install --upgrade pip \
&& pip3 install cython \
&& pip3 install poetry \
&& apt-get clean && rm -rf /var/lib/apt/lists/*


# Create a new user
RUN adduser --quiet --disabled-password --shell /bin/sh --home /home/dockeruser --gecos "" --uid 1000 dockeruser
USER dockeruser
ENV HOME /home/dockeruser
ENV PYTHONPATH "${PYTHONPATH}:/home/dockeruser/.local/bin"
ENV PATH="/home/dockeruser/.local/bin:${PATH}"

# The 'SOURCE' argument is what will be used in 'pip install'.
ARG SOURCE

# Set this argument if running the pip install on a local directory, so
# the local dist files are copied into the container.
ARG DIST_PATH

USER root
RUN mkdir -p /worker && chown dockeruser /worker
COPY pyproject.toml /worker

WORKDIR /worker
# ENV PYTHONPATH=${PYTHONPATH}:${PWD}
COPY --chown=dockeruser $DIST_PATH $DIST_PATH
#RUN pip3 install --no-cache-dir --force --user --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ $SOURCE \
# && rm -rf $DIST_PATH

#install poetry as root
RUN poetry config virtualenvs.create false
RUN poetry install --only main

USER dockeruser
COPY --chown=dockeruser ./docker-entrypoint.sh docker-entrypoint.sh
# Run the service
ENTRYPOINT ["./docker-entrypoint.sh"]
3 changes: 3 additions & 0 deletions build-service
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

docker build -t "asdc-trade/stitchee:${VERSION-latest}" .
Empty file.
33 changes: 33 additions & 0 deletions concatenator/harmony/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""A Harmony CLI wrapper around the concatenate-batcher"""
from argparse import ArgumentParser

import harmony

from concatenator.harmony.service_adapter import StitcheeAdapter as HarmonyAdapter


def main(config: harmony.util.Config = None) -> None:
"""Parse command line arguments and invoke the service to respond to them.

Parameters
----------
config : harmony.util.Config
harmony.util.Config is injectable for tests

Returns
-------
None
"""
parser = ArgumentParser(
prog="Stitchee", description="Run the STITCH by Extending a dimEnsion service"
)
harmony.setup_cli(parser)
args = parser.parse_args()
if harmony.is_harmony_cli(args):
harmony.run_cli(parser, args, HarmonyAdapter, cfg=config)
else:
parser.error("Only --harmony CLIs are supported")


if __name__ == "__main__":
main()
119 changes: 119 additions & 0 deletions concatenator/harmony/download_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""A utility for downloading multiple granules simultaneously"""

import queue
import re
from copy import deepcopy
from multiprocessing import Manager, Process
from os import cpu_count
from pathlib import Path
from urllib.parse import urlparse

from harmony.logging import build_logger
from harmony.util import download


def multi_core_download(
urls: list, destination_dir: str, access_token: str, cfg: dict, process_count: int | None = None
) -> list[Path]:
"""
A method which automagically scales downloads to the number of CPU
cores. For further explaination, see documentation on "multi-track
drifting"

Parameters
----------
urls : list
list of urls to download
destination_dir : str
output path for downloaded files
access_token : str
access token as provided in Harmony input
cfg : dict
Harmony configuration information
process_count : int, optional
Number of worker processes to run (expected >= 1)

Returns
-------
list
list of downloaded files as pathlib.Path objects
"""

if process_count is None:
process_count = cpu_count()
if process_count is None:
raise RuntimeError("cannot determine number of cpus")

with Manager() as manager:
url_queue = manager.Queue(len(urls))
path_list = manager.list()

for url in urls:
url_queue.put(url)

# Spawn worker processes
processes = []
for _ in range(process_count):
download_process = Process(
target=_download_worker,
args=(url_queue, path_list, destination_dir, access_token, cfg),
)
processes.append(download_process)
download_process.start()

# Ensure worker processes exit successfully
for process in processes:
process.join()
if process.exitcode != 0:
raise RuntimeError(f"Download failed - exit code: {process.exitcode}")

process.close()

path_list = deepcopy(path_list) # ensure GC can cleanup multiprocessing

return [Path(path) for path in path_list]


def _download_worker(
url_queue: queue.Queue, path_list: list, destination_dir: str, access_token: str, cfg: dict
) -> None:
"""
A method to be executed in a separate process which processes the url_queue
and places paths to completed downloads into the path_list. Downloads are
handled by harmony.util.download

Parameters
----------
url_queue : queue.Queue
URLs to process - should be filled from start and only decreases
path_list : list
paths to completed file downloads
destination_dir : str
output path for downloaded files
access_token : str
access token as provided in Harmony input
cfg : dict
Harmony configuration information
"""

logger = build_logger(cfg)

while not url_queue.empty():
try:
url = url_queue.get_nowait()
except queue.Empty:
break

path = Path(
download(url, destination_dir, logger=logger, access_token=access_token, cfg=cfg)
)
filename_match = re.match(r".*\/(.+\..+)", urlparse(url).path)

if filename_match is not None:
filename = filename_match.group(1)
dest_path = path.parent.joinpath(filename)
path = path.rename(dest_path)
else:
logger.warning("Origin filename could not be assertained - %s", url)

path_list.append(str(path))
Loading