From 8ee0eb56c653ddd803daca5bb051d59eeeb9f0ed Mon Sep 17 00:00:00 2001 From: Joe Sapp Date: Fri, 12 Nov 2021 16:08:44 +0000 Subject: [PATCH 01/22] Don't consume all arguments after --extensions This behavior is now more like other utilities where specifying the flag multiple times extends the value of the argument. For example, -e '.nc .h5 .zip' becomes -e '.nc' -e '.h5' -e '.zip' This is less fragile for the user and possibly less confusing how the argument should be formatted on the command line. --- subscriber/podaac_data_subscriber.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index baf16ae..5d0cc6e 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -207,7 +207,7 @@ def create_parser(): parser.add_argument("--offset", dest="offset", help = "Flag used to shift timestamp. Units are in hours, e.g. 10 or -10.") # noqa E501 parser.add_argument("-m", "--minutes", dest="minutes", help = "How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your cron runs (default: 60 minutes).", type=int, default=60) # noqa E501 - parser.add_argument("-e", "--extensions", dest="extensions", help = "The extensions of products to download. Default is [.nc, .h5, .zip]", default=[".nc", ".h5", ".zip"], nargs='*') # noqa E501 + parser.add_argument("-e", "--extensions", dest="extensions", help = "The extensions of products to download. Default is [.nc, .h5, .zip]", default=None, action='append') # noqa E501 parser.add_argument("--version", dest="version", action="store_true",help="Display script version information and exit.") # noqa E501 parser.add_argument("--verbose", dest="verbose", action="store_true",help="Verbose mode.") # noqa E501 @@ -386,6 +386,8 @@ def run(): #filter list based on extension + if not extensions: + extensions = [".nc", ".h5", ".zip"] filtered_downloads = [] for f in downloads: for extension in extensions: From fcacde01ba44f7a5e5dbe53df561509114da3b71 Mon Sep 17 00:00:00 2001 From: Joe Sapp Date: Fri, 12 Nov 2021 16:13:55 +0000 Subject: [PATCH 02/22] Add ability to execute arbitrary commands on each downloaded file I did it this way so each file could be compressed without hard-coding the compression algorithm. But I could see this being used to run a pre-processing script on each downloaded file. --- subscriber/podaac_data_subscriber.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index baf16ae..985675a 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -25,6 +25,7 @@ import os from os import makedirs from os.path import isdir, basename, join, splitext +import subprocess from urllib.parse import urlencode from urllib.request import urlopen, urlretrieve from datetime import datetime, timedelta @@ -208,6 +209,7 @@ def create_parser(): parser.add_argument("-m", "--minutes", dest="minutes", help = "How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your cron runs (default: 60 minutes).", type=int, default=60) # noqa E501 parser.add_argument("-e", "--extensions", dest="extensions", help = "The extensions of products to download. 
Default is [.nc, .h5, .zip]", default=[".nc", ".h5", ".zip"], nargs='*') # noqa E501 + parser.add_argument("--process", dest="process_cmd", help = "Processing command to run on each downloaded file (e.g., compression). Can be specified multiple times.", action='append') parser.add_argument("--version", dest="version", action="store_true",help="Display script version information and exit.") # noqa E501 parser.add_argument("--verbose", dest="verbose", action="store_true",help="Verbose mode.") # noqa E501 @@ -244,6 +246,7 @@ def run(): short_name = args.collection extensions = args.extensions + process_cmd = args.process_cmd data_path = args.outputDirectory # You should change `data_path` to a suitable download path on your file system. @@ -480,6 +483,16 @@ def prepare_cycles_output(data_cycles, prefix, file): write_path = join(prefix, cycle_dir, basename(file)) return write_path + def process_file(output_path): + if not process_cmd: + return + else: + for cmd in process_cmd: + if args.verbose: + print(f'Running: {cmd} {output_path}') + subprocess.run(cmd.split() + [output_path], + check=True) + for f in downloads: try: for extension in extensions: @@ -495,6 +508,7 @@ def prepare_cycles_output(data_cycles, prefix, file): output_path = prepare_cycles_output( cycles, data_path, f) urlretrieve(f, output_path) + process_file(output_path) print(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 except Exception as e: From 90e49ad160dd769c45eb42effe88258c385739c6 Mon Sep 17 00:00:00 2001 From: mgangl Date: Mon, 15 Nov 2021 09:39:34 -0800 Subject: [PATCH 03/22] updated README and tests for additive -e examples --- README.md | 44 +++++++++++++++++++++------------------- tests/test_subscriber.py | 2 +- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 4119ed4..d039736 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,8 @@ you should now have access to the subscriber CLI: ``` $> podaac-data-subscriber -h -usage: podaac-data-subscriber [-h] -c COLLECTION -d OUTPUTDIRECTORY [-m MINUTES] [-b BBOX] [-e [EXTENSIONS [EXTENSIONS ...]]] [-ds DATASINCE] [--version] [--verbose] +usage: podaac_data_subscriber.py [-h] -c COLLECTION -d OUTPUTDIRECTORY [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] + [-m MINUTES] [-e EXTENSIONS] [--version] [--verbose] [-p PROVIDER] optional arguments: -h, --help show this help message and exit @@ -41,28 +42,28 @@ optional arguments: The collection shortname for which you want to retrieve data. -d OUTPUTDIRECTORY, --data-dir OUTPUTDIRECTORY The directory where data products will be downloaded. + -sd STARTDATE, --start-date STARTDATE + The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z + -ed ENDDATE, --end-date ENDDATE + The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z + -b BBOX, --bounds BBOX + The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with + parsing arguments, to use this command, please use the -b="-180,-90,180,90" syntax when calling from the command line. Default: + "-180,-90,180,90". -dc Flag to use cycle number for directory where data products will be downloaded. -dydoy Flag to use start time (Year/DOY) of downloaded data for directory where data products will be downloaded. 
-dymd Flag to use start time (Year/Month/Day) of downloaded data for directory where data products will be downloaded. -dy Flag to use start time (Year) of downloaded data for directory where data products will be downloaded. - + --offset OFFSET Flag used to shift timestamp. Units are in hours, e.g. 10 or -10. -m MINUTES, --minutes MINUTES - How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your - cron runs (default: 60 minutes). - -b BBOX, --bounds BBOX - The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing arguments, to use - this command, please use the -b="-180,-90,180,90" syntax when calling from the command line. Default: "-180,-90,180,90\. - -e [EXTENSIONS [EXTENSIONS ...]], --extensions [EXTENSIONS [EXTENSIONS ...]] - The extensions of products to download. Default is [.nc, .h5] - -sd STARTDATE, --start-date STARTDATE - The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z - -ed ENDDATE, --end-date ENDDATE - The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z + How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or + greater than how often your cron runs (default: 60 minutes). + -e EXTENSIONS, --extensions EXTENSIONS + The extensions of products to download. Default is [.nc, .h5, .zip] --version Display script version information and exit. --verbose Verbose mode. -p PROVIDER, --provider PROVIDER Specify a provider for collection search. Default is POCLOUD. - ``` One can also call the python package directly: @@ -95,7 +96,8 @@ For setting up your authentication, see the notes on the `netrc` file below. Usage: ``` -usage: podaac-data-subscriber [-h] -c COLLECTION -d OUTPUTDIRECTORY [-m MINUTES] [-b BBOX] [-e [EXTENSIONS [EXTENSIONS ...]]] [-ds DATASINCE] [--version] +usage: podaac_data_subscriber.py [-h] -c COLLECTION -d OUTPUTDIRECTORY [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] + [-m MINUTES] [-e EXTENSIONS] [--version] [--verbose] [-p PROVIDER] ``` To run the script, the following parameters are required: @@ -206,7 +208,7 @@ The subscriber allows the placement of downloaded files into one of several dire To automatically run and update a local file system with data files from a collection, one can use a syntax like the following: ``` -10 * * * * podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d /path/to/data/VIIRS_N20-OSPO-L2P-v2.61 -e .nc .h5 -m 60 -b="-180,-90,180,90" --verbose >> ~/.subscriber.log +10 * * * * podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d /path/to/data/VIIRS_N20-OSPO-L2P-v2.61 -e .nc -e .h5 -m 60 -b="-180,-90,180,90" --verbose >> ~/.subscriber.log ``` @@ -232,16 +234,16 @@ podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -b="-180,-90,180,90 ### Setting extensions -Some collections have many files. To download a specific set of files, you can set the extensions on which downloads are filtered. By default, ".nc" and ".h5" files are downloaded by default. +Some collections have many files. To download a specific set of files, you can set the extensions on which downloads are filtered. By default, ".nc", ".h5", and ".zip" files are downloaded by default. 
``` --e [EXTENSIONS [EXTENSIONS ...]], --extensions [EXTENSIONS [EXTENSIONS ...]] - The extensions of products to download. Default is [.nc, .h5] +-e EXTENSIONS, --extensions EXTENSIONS + The extensions of products to download. Default is [.nc, .h5, .zip] ``` -An example of the -e usage: +An example of the -e usage- note the -e option is additive: ``` -podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e .nc .h5 +podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e .nc -e .h5 ``` diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 2ab1c5e..d07aace 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -26,7 +26,7 @@ def test_validate(): a = validate(["-c", "viirs", "-d", "/data", "-b=-180,-90,180,90", "-m", "100"]) assert a.minutes == 100, "should equal 100" - a = validate(["-c", "viirs", "-d", "/data", "-b=-180,-90,180,90", "-e", ".txt", ".nc"]) + a = validate(["-c", "viirs", "-d", "/data", "-b=-180,-90,180,90", "-e", ".txt", "-e", ".nc"]) assert ".txt" in a.extensions assert ".nc" in a.extensions From 80f8368064d23541afe23c7aeacb87da5cdfa3c0 Mon Sep 17 00:00:00 2001 From: mgangl Date: Mon, 15 Nov 2021 09:42:56 -0800 Subject: [PATCH 04/22] force 'action' --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d039736..ee52530 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Python Build](https://github.com/podaac/data-subscriber/actions/workflows/python-app.yml/badge.svg?branch=main)](https://github.com/podaac/data-subscriber/actions/workflows/python-app.yml) # Scripted Access to PODAAC data - + ---- ![N|Solid](https://podaac.jpl.nasa.gov/sites/default/files/image/custom_thumbs/podaac_logo.png) From a9daf76d5616974de5915356d82a56022d8ed08d Mon Sep 17 00:00:00 2001 From: mgangl Date: Mon, 15 Nov 2021 10:06:31 -0800 Subject: [PATCH 05/22] merged code for extensions, process call, and updated documentation --- CHANGELOG.md | 12 ++++++++++++ README.md | 20 ++++++++++++-------- setup.py | 2 +- subscriber/podaac_data_subscriber.py | 2 +- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58e1080..f1f31c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +## [1.7.0] +### Added +- Added ability to call a process on downlaoded files. [Thank to Joe Sapp](https://github.com/sappjw) + +### Changed +- Turned -e option into 'additive' mode (multiple -e options allowed.) 
[Thanks to Joe Sapp](https://github.com/sappjw) + +### Deprecated +### Removed +### Fixed +### Security + ## [1.6.1] ### Added - added warning for more than 2k granules diff --git a/README.md b/README.md index ee52530..2dd66a5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Python Build](https://github.com/podaac/data-subscriber/actions/workflows/python-app.yml/badge.svg?branch=main)](https://github.com/podaac/data-subscriber/actions/workflows/python-app.yml) # Scripted Access to PODAAC data - + ---- ![N|Solid](https://podaac.jpl.nasa.gov/sites/default/files/image/custom_thumbs/podaac_logo.png) @@ -33,8 +33,8 @@ you should now have access to the subscriber CLI: ``` $> podaac-data-subscriber -h -usage: podaac_data_subscriber.py [-h] -c COLLECTION -d OUTPUTDIRECTORY [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] - [-m MINUTES] [-e EXTENSIONS] [--version] [--verbose] [-p PROVIDER] +usage: podaac_data_subscriber.py [-h] -c COLLECTION -d OUTPUTDIRECTORY [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-m MINUTES] + [-e EXTENSIONS] [--process PROCESS_CMD] [--version] [--verbose] [-p PROVIDER] optional arguments: -h, --help show this help message and exit @@ -47,19 +47,20 @@ optional arguments: -ed ENDDATE, --end-date ENDDATE The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z -b BBOX, --bounds BBOX - The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with - parsing arguments, to use this command, please use the -b="-180,-90,180,90" syntax when calling from the command line. Default: - "-180,-90,180,90". + The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing + arguments, to use this command, please use the -b="-180,-90,180,90" syntax when calling from the command line. Default: "-180,-90,180,90". -dc Flag to use cycle number for directory where data products will be downloaded. -dydoy Flag to use start time (Year/DOY) of downloaded data for directory where data products will be downloaded. -dymd Flag to use start time (Year/Month/Day) of downloaded data for directory where data products will be downloaded. -dy Flag to use start time (Year) of downloaded data for directory where data products will be downloaded. --offset OFFSET Flag used to shift timestamp. Units are in hours, e.g. 10 or -10. -m MINUTES, --minutes MINUTES - How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or - greater than how often your cron runs (default: 60 minutes). + How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how + often your cron runs (default: 60 minutes). -e EXTENSIONS, --extensions EXTENSIONS The extensions of products to download. Default is [.nc, .h5, .zip] + --process PROCESS_CMD + Processing command to run on each downloaded file (e.g., compression). Can be specified multiple times. --version Display script version information and exit. --verbose Verbose mode. 
-p PROVIDER, --provider PROVIDER @@ -245,6 +246,9 @@ An example of the -e usage- note the -e option is additive: ``` podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e .nc -e .h5 ``` +### run a post download process + +Using the `--process` option, you can run a simple command agaisnt the "just" downloaded file. This will take the format of " ". This means you can run a command like `--process gzip` to gzip all downloaded files. We do not support more advanced processes at this time (piping, running a process on a directory, etc). ### Changing how far back the script looks for data diff --git a/setup.py b/setup.py index b4e0a1c..874c403 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ long_description = fh.read() setup(name='podaac-data-subscriber', - version='1.6.1', + version='1.7.0', description='PO.DAAC Data Susbcriber Command Line Tool', url='https://github.com/podaac/data-subscriber', long_description=long_description, diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index a91b09f..3d200fe 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -30,7 +30,7 @@ from urllib.request import urlopen, urlretrieve from datetime import datetime, timedelta -__version__ = "1.6.1" +__version__ = "1.7.0" LOGLEVEL = os.environ.get('SUBSCRIBER_LOGLEVEL', 'WARNING').upper() logging.basicConfig(level=LOGLEVEL) From 481c3dac57e8b03171fcf19a1bebbe36f8f2c44b Mon Sep 17 00:00:00 2001 From: mgangl Date: Mon, 15 Nov 2021 10:16:06 -0800 Subject: [PATCH 06/22] fix for https://github.com/podaac/data-subscriber/issues/28 --- subscriber/podaac_data_subscriber.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index baf16ae..204a6bb 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -357,6 +357,7 @@ def run(): 'Specify an output directory or ' 'choose another output directory flag other than -dc.') # noqa E501 + timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") # Neatly print the first granule record (if one was returned): @@ -429,8 +430,18 @@ def prepare_time_output(times, prefix, file): write_path string path to where granules will be written """ + time_match = [dt for dt in - times if dt[0] == splitext(basename(file))[0]][0][1] + times if dt[0] == splitext(basename(file))[0]] + + # Found on 11/11/21 + # https://github.com/podaac/data-subscriber/issues/28 + # if we don't find the time match array, try again using the + # filename AND its suffix (above removes it...) + if len(time_match) == 0: + time_match = [dt for dt in + times if dt[0] == basename(file)] + time_match = time_match[0][1] # offset timestamp for output paths if args.offset: From 07c77872c2c5bfb92a36746b9f0174e614448250 Mon Sep 17 00:00:00 2001 From: mgangl Date: Mon, 15 Nov 2021 10:22:14 -0800 Subject: [PATCH 07/22] updated CHANGELOG --- CHANGELOG.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1f31c3..2f3fa36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,14 +5,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [1.7.0] ### Added -- Added ability to call a process on downlaoded files. [Thank to Joe Sapp](https://github.com/sappjw) - +- Added ability to call a process on downlaoded files. [Thank to Joe Sapp](https://github.com/sappjw). 
### Changed - Turned -e option into 'additive' mode (multiple -e options allowed.) [Thanks to Joe Sapp](https://github.com/sappjw) - ### Deprecated ### Removed ### Fixed +- issue not being able to find granuleUR [#28](https://github.com/podaac/data-subscriber/issues/28) ### Security ## [1.6.1] From 13c386494ea008facf4c5831a55cb11f28034a4a Mon Sep 17 00:00:00 2001 From: Frank Greguska Date: Wed, 23 Mar 2022 16:17:55 -0700 Subject: [PATCH 08/22] Change print statements to log statements --- CHANGELOG.md | 3 +- dev-requirements.txt | 1 + setup.py | 4 +- subscriber/podaac_access.py | 60 +++++++----- subscriber/podaac_data_downloader.py | 141 +++++++++++++++++---------- subscriber/podaac_data_subscriber.py | 139 ++++++++++++++++---------- 6 files changed, 221 insertions(+), 127 deletions(-) create mode 100644 dev-requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index f90b329..fdeceb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) -## Unreleased +## [Unreleased] ### Added ### Changed +- Converted print statements to log statements ### Deprecated ### Removed ### Fixed diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..49435c9 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1 @@ +pytest==7.1.1 \ No newline at end of file diff --git a/setup.py b/setup.py index a96b7f7..25adf1b 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ packages=['subscriber'], entry_points=''' [console_scripts] - podaac-data-subscriber=subscriber.podaac_data_subscriber:run - podaac-data-downloader=subscriber.podaac_data_downloader:run + podaac-data-subscriber=subscriber.podaac_data_subscriber:main + podaac-data-downloader=subscriber.podaac_data_downloader:main ''', zip_safe=False) diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 0bdf10b..a082cd1 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -1,14 +1,16 @@ -from urllib import request -from http.cookiejar import CookieJar -import netrc -import requests import json +import logging +import netrc +import subprocess +from datetime import datetime +from http.cookiejar import CookieJar from os import makedirs from os.path import isdir, basename, join, splitext -import subprocess +from urllib import request from urllib.parse import urlencode from urllib.request import urlopen -from datetime import datetime + +import requests __version__ = "1.8.0" extensions = [".nc", ".h5", ".zip", ".tar.gz"] @@ -18,6 +20,7 @@ IPAddr = "127.0.0.1" # socket.gethostbyname(hostname) + # ## Authentication setup # # The function below will allow Python scripts to log into any Earthdata Login @@ -60,7 +63,7 @@ def setup_earthdata_login_auth(endpoint): # FileNotFound = There's no .netrc file # TypeError = The endpoint isn't in the netrc file, # causing the above to try unpacking None - print("There's no .netrc file or the The endpoint isn't in the netrc file") # noqa E501 + logging.warning("There's no .netrc file or the The endpoint isn't in the netrc file") manager = request.HTTPPasswordMgrWithDefaultRealm() manager.add_password(None, endpoint, username, password) @@ -82,15 +85,15 @@ def get_token(url: str, client_id: str, endpoint: str) -> str: username, _, password = netrc.netrc().authenticators(endpoint) xml: str = """ {}{}{} - {}""".format(username, password, client_id, IPAddr) # noqa E501 - headers: Dict = {'Content-Type': 
'application/xml', 'Accept': 'application/json'} # noqa E501 + {}""".format(username, password, client_id, IPAddr) # noqa E501 + headers: Dict = {'Content-Type': 'application/xml', 'Accept': 'application/json'} # noqa E501 resp = requests.post(url, headers=headers, data=xml) response_content: Dict = json.loads(resp.content) token = response_content['token']['id'] # What error is thrown here? Value Error? Request Errors? except: # noqa E722 - print("Error getting the token - check user name and password") + logging.warning("Error getting the token - check user name and password") return token @@ -99,45 +102,50 @@ def get_token(url: str, client_id: str, endpoint: str) -> str: ############################################################################### def delete_token(url: str, token: str) -> None: try: - headers: Dict = {'Content-Type': 'application/xml','Accept': 'application/json'} # noqa E501 + headers: Dict = {'Content-Type': 'application/xml', 'Accept': 'application/json'} # noqa E501 url = '{}/{}'.format(url, token) resp = requests.request('DELETE', url, headers=headers) if resp.status_code == 204: - print("CMR token successfully deleted") + logging.info("CMR token successfully deleted") else: - print("CMR token deleting failed.") + logging.info("CMR token deleting failed.") except: # noqa E722 - print("Error deleting the token") + logging.warning("Error deleting the token") def validate(args): bounds = args.bbox.split(',') if len(bounds) != 4: - raise ValueError("Error parsing '--bounds': " + args.bbox + ". Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces ") # noqa E501 + raise ValueError( + "Error parsing '--bounds': " + args.bbox + ". Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces ") # noqa E501 for b in bounds: try: float(b) except ValueError: - raise ValueError("Error parsing '--bounds': " + args.bbox + ". Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces ") # noqa E501 + raise ValueError( + "Error parsing '--bounds': " + args.bbox + ". Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces ") # noqa E501 if args.startDate: try: datetime.strptime(args.startDate, '%Y-%m-%dT%H:%M:%SZ') except ValueError: - raise ValueError("Error parsing '--start-date' date: " + args.startDate + ". Format must be like 2021-01-14T00:00:00Z") # noqa E501 + raise ValueError( + "Error parsing '--start-date' date: " + args.startDate + ". Format must be like 2021-01-14T00:00:00Z") # noqa E501 if args.endDate: try: datetime.strptime(args.endDate, '%Y-%m-%dT%H:%M:%SZ') except ValueError: - raise ValueError("Error parsing '--end-date' date: " + args.endDate + ". Format must be like 2021-01-14T00:00:00Z") # noqa E501 + raise ValueError( + "Error parsing '--end-date' date: " + args.endDate + ". Format must be like 2021-01-14T00:00:00Z") # noqa E501 if 'minutes' in args: if args.minutes: try: int(args.minutes) except ValueError: - raise ValueError("Error parsing '--minutes': " + args.minutes + ". Number must be an integer.") # noqa E501 + raise ValueError( + "Error parsing '--minutes': " + args.minutes + ". 
Number must be an integer.") # noqa E501 # Error catching for output directory specifications # Must specify -d output path or one time-based output directory flag @@ -243,9 +251,9 @@ def process_file(process_cmd, output_path, args): else: for cmd in process_cmd: if args.verbose: - print(f'Running: {cmd} {output_path}') + logging.info(f'Running: {cmd} {output_path}') subprocess.run(cmd.split() + [output_path], - check=True) + check=True, stdout=PIPE, stderr=PIPE) def get_temporal_range(start, end, now): @@ -267,7 +275,7 @@ def get_search_results(args, params): query = urlencode(params) url = "https://" + cmr + "/search/granules.umm_json?" + query if args.verbose: - print(url) + logging.info(url) # Get a new timestamp that represents the UTC time of the search. # Then download the records in `umm_json` format for granules @@ -279,7 +287,9 @@ def get_search_results(args, params): def parse_start_times(results): try: - file_start_times = [(r['meta']['native-id'], datetime.strptime((r['umm']['TemporalExtent']['RangeDateTime']['BeginningDateTime']), "%Y-%m-%dT%H:%M:%S.%fZ")) for r in results['items']] # noqa E501 + file_start_times = [(r['meta']['native-id'], + datetime.strptime((r['umm']['TemporalExtent']['RangeDateTime']['BeginningDateTime']), + "%Y-%m-%dT%H:%M:%S.%fZ")) for r in results['items']] # noqa E501 except KeyError: raise ValueError('Could not locate start time for data.') return file_start_times @@ -287,7 +297,9 @@ def parse_start_times(results): def parse_cycles(results): try: - cycles = [(splitext(r['meta']['native-id'])[0],str(r['umm']['SpatialExtent']['HorizontalSpatialDomain']['Track']['Cycle'])) for r in results['items']] # noqa E501 + cycles = [(splitext(r['meta']['native-id'])[0], + str(r['umm']['SpatialExtent']['HorizontalSpatialDomain']['Track']['Cycle'])) for r in + results['items']] # noqa E501 except KeyError: raise ValueError('No cycles found within collection granules. ' 'Specify an output directory or ' diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 442ac5b..83e9225 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -2,26 +2,23 @@ import argparse import logging import os +import sys +from datetime import datetime, timedelta from os import makedirs from os.path import isdir, basename, join from urllib.request import urlretrieve -from datetime import datetime, timedelta from subscriber import podaac_access as pa __version__ = pa.__version__ -LOGLEVEL = os.environ.get('PODAAC_LOGLEVEL', 'WARNING').upper() - -logging.basicConfig(level=LOGLEVEL) -logging.debug("Log level set to " + LOGLEVEL) - page_size = 2000 edl = pa.edl cmr = pa.cmr token_url = pa.token_url + # The lines below are to get the IP address. 
You can make this static and # assign a fixed value to the IPAddr variable @@ -37,13 +34,17 @@ def parse_cycles(cycle_input): def validate(args): if args.search_cycles is None and args.startDate is None and args.endDate is None: - raise ValueError("Error parsing command line arguments: one of [--start-date and --end-date] or [--cycles] are required") # noqa E501 + raise ValueError( + "Error parsing command line arguments: one of [--start-date and --end-date] or [--cycles] are required") # noqa E501 if args.search_cycles is not None and args.startDate is not None: - raise ValueError("Error parsing command line arguments: only one of -sd/--start-date and --cycles are allowed") # noqa E501 + raise ValueError( + "Error parsing command line arguments: only one of -sd/--start-date and --cycles are allowed") # noqa E501 if args.search_cycles is not None and args.endDate is not None: - raise ValueError("Error parsing command line arguments: only one of -ed/--end-date and --cycles are allowed") # noqa E50 + raise ValueError( + "Error parsing command line arguments: only one of -ed/--end-date and --cycles are allowed") # noqa E50 if None in [args.endDate, args.startDate] and args.search_cycles is None: - raise ValueError("Error parsing command line arguments: Both --start-date and --end-date must be specified") # noqa E50 + raise ValueError( + "Error parsing command line arguments: Both --start-date and --end-date must be specified") # noqa E50 def create_parser(): @@ -51,35 +52,55 @@ def create_parser(): parser = argparse.ArgumentParser(prog='PO.DAAC bulk-data downloader') # Adding Required arguments - parser.add_argument("-c", "--collection-shortname", dest="collection",required=True, help = "The collection shortname for which you want to retrieve data.") # noqa E501 - parser.add_argument("-d", "--data-dir", dest="outputDirectory", required=True, help = "The directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-c", "--collection-shortname", dest="collection", required=True, + help="The collection shortname for which you want to retrieve data.") # noqa E501 + parser.add_argument("-d", "--data-dir", dest="outputDirectory", required=True, + help="The directory where data products will be downloaded.") # noqa E501 # Required through validation - parser.add_argument("--cycle", required=False, dest="search_cycles", help="Cycle number for determining downloads. can be repeated for multiple cycles", action='append', type=int) - parser.add_argument("-sd", "--start-date", required=False, dest="startDate", help="The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z") # noqa E501 - parser.add_argument("-ed", "--end-date", required=False, dest="endDate", help="The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z") # noqa E501 + parser.add_argument("--cycle", required=False, dest="search_cycles", + help="Cycle number for determining downloads. can be repeated for multiple cycles", + action='append', type=int) + parser.add_argument("-sd", "--start-date", required=False, dest="startDate", + help="The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z") # noqa E501 + parser.add_argument("-ed", "--end-date", required=False, dest="endDate", + help="The ISO date time after which data should be retrieved. 
For Example, --end-date 2021-01-14T00:00:00Z") # noqa E501 # Adding optional arguments # spatiotemporal arguments - parser.add_argument("-b", "--bounds", dest="bbox", help = "The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing arguments, to use this command, please use the -b=\"-180,-90,180,90\" syntax when calling from the command line. Default: \"-180,-90,180,90\".", default="-180,-90,180,90") # noqa E501 + parser.add_argument("-b", "--bounds", dest="bbox", + help="The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing arguments, to use this command, please use the -b=\"-180,-90,180,90\" syntax when calling from the command line. Default: \"-180,-90,180,90\".", + default="-180,-90,180,90") # noqa E501 # Arguments for how data are stored locally - much processing is based on # the underlying directory structure (e.g. year/Day-of-year) - parser.add_argument("-dc", dest="cycle", action="store_true", help = "Flag to use cycle number for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("-dydoy", dest="dydoy", action="store_true", help = "Flag to use start time (Year/DOY) of downloaded data for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("-dymd", dest="dymd", action="store_true", help = "Flag to use start time (Year/Month/Day) of downloaded data for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("-dy", dest="dy", action="store_true", help = "Flag to use start time (Year) of downloaded data for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("--offset", dest="offset", help = "Flag used to shift timestamp. Units are in hours, e.g. 10 or -10.") # noqa E501 - - parser.add_argument("-e", "--extensions", dest="extensions", help="The extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz]", default=None, action='append') # noqa E501 - parser.add_argument("--process", dest="process_cmd", help="Processing command to run on each downloaded file (e.g., compression). Can be specified multiple times.", action='append') - - - parser.add_argument("--version", action="version", version='%(prog)s ' + __version__, help="Display script version information and exit.") # noqa E501 - parser.add_argument("--verbose", dest="verbose", action="store_true",help="Verbose mode.") # noqa E501 - parser.add_argument("-p", "--provider", dest="provider", default='POCLOUD', help="Specify a provider for collection search. Default is POCLOUD.") # noqa E501 - - parser.add_argument("--limit", dest="limit", default='2000', type=int, help="Integer limit for number of granules to download. Useful in testing. 
Defaults to " + str(page_size)) # noqa E501 + parser.add_argument("-dc", dest="cycle", action="store_true", + help="Flag to use cycle number for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-dydoy", dest="dydoy", action="store_true", + help="Flag to use start time (Year/DOY) of downloaded data for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-dymd", dest="dymd", action="store_true", + help="Flag to use start time (Year/Month/Day) of downloaded data for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-dy", dest="dy", action="store_true", + help="Flag to use start time (Year) of downloaded data for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("--offset", dest="offset", + help="Flag used to shift timestamp. Units are in hours, e.g. 10 or -10.") # noqa E501 + + parser.add_argument("-e", "--extensions", dest="extensions", + help="The extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz]", + default=None, action='append') # noqa E501 + parser.add_argument("--process", dest="process_cmd", + help="Processing command to run on each downloaded file (e.g., compression). Can be specified multiple times.", + action='append') + + parser.add_argument("--version", action="version", version='%(prog)s ' + __version__, + help="Display script version information and exit.") # noqa E501 + parser.add_argument("--verbose", dest="verbose", action="store_true", help="Verbose mode.") # noqa E501 + parser.add_argument("-p", "--provider", dest="provider", default='POCLOUD', + help="Specify a provider for collection search. Default is POCLOUD.") # noqa E501 + + parser.add_argument("--limit", dest="limit", default='2000', type=int, + help="Integer limit for number of granules to download. Useful in testing. Defaults to " + str( + page_size)) # noqa E501 return parser @@ -98,8 +119,8 @@ def run(): validate(args) except ValueError as v: - print(v) - exit() + logging.error(str(v)) + exit(1) pa.setup_earthdata_login_auth(edl) token = pa.get_token(token_url, 'podaac-subscriber', edl) @@ -130,7 +151,7 @@ def run(): # This cell will replace the timestamp above with the one read from the `.update` file in the data directory, if it exists. if not isdir(data_path): - print("NOTE: Making new data directory at " + data_path + "(This is the first run.)") + logging.info("NOTE: Making new data directory at " + data_path + "(This is the first run.)") makedirs(data_path, exist_ok=True) # Change this to whatever extent you need. 
Format is W Longitude,S Latitude,E Longitude,N Latitude @@ -150,10 +171,11 @@ def run(): for v in cmr_cycles: params.append(("cycle[]", v)) if args.verbose: - print("cycles: " + str(cmr_cycles)) + logging.info("cycles: " + str(cmr_cycles)) else: - temporal_range = pa.get_temporal_range(start_date_time, end_date_time, datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) # noqa E501 + temporal_range = pa.get_temporal_range(start_date_time, end_date_time, + datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) # noqa E501 params = { 'scroll': "true", 'page_size': page_size, @@ -165,15 +187,15 @@ def run(): 'bounding_box': bounding_extent, } if args.verbose: - print("Temporal Range: " + temporal_range) + logging.info("Temporal Range: " + temporal_range) if args.verbose: - print("Provider: " + provider) + logging.info("Provider: " + provider) results = pa.get_search_results(args, params) if args.verbose: - print(str(results['hits'])+" granules found for "+short_name) # noqa E501 + logging.info(str(results['hits']) + " granules found for " + short_name) # noqa E501 if any([args.dy, args.dydoy, args.dymd]): file_start_times = pa.parse_start_times(results) @@ -181,8 +203,11 @@ def run(): cycles = pa.parse_cycles(results) downloads_all = [] - downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in results['items']] - downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in results['items']] + downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if + u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in + results['items']] + downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in + results['items']] for f in downloads_data: downloads_all.append(f) @@ -192,7 +217,8 @@ def run(): downloads = [item for sublist in downloads_all for item in sublist] if len(downloads) >= page_size: - print("Warning: only the most recent " + str(page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.") + logging.warning("Only the most recent " + str( + page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.") # filter list based on extension if not extensions: @@ -208,9 +234,9 @@ def run(): # https://github.com/podaac/data-subscriber/issues/33 # Make this a non-verbose message # if args.verbose: - print("Found " + str(len(downloads)) + " total files to download") + logging.info("Found " + str(len(downloads)) + " total files to download") if args.verbose: - print("Downloading files with extensions: " + str(extensions)) + logging.info("Downloading files with extensions: " + str(extensions)) # NEED TO REFACTOR THIS, A LOT OF STUFF in here # Finish by downloading the files to the data directory in a loop. 
@@ -230,19 +256,32 @@ def run(): cycles, data_path, f) urlretrieve(f, output_path) pa.process_file(process_cmd, output_path, args) - print(str(datetime.now()) + " SUCCESS: " + f) + logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 except Exception as e: - print(str(datetime.now()) + " FAILURE: " + f) + logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True) failure_cnt = failure_cnt + 1 - print(e) - print("Downloaded: " + str(success_cnt) + " files\n") - print("Files Failed to download:" + str(failure_cnt) + "\n") + logging.info("Downloaded: " + str(success_cnt) + " files\n") + logging.info("Files Failed to download:" + str(failure_cnt) + "\n") pa.delete_token(token_url, token) - print("END \n\n") + logging.info("END \n\n") exit(0) +def main(): + log_level = os.environ.get('PODAAC_LOGLEVEL', 'INFO').upper() + logging.basicConfig(stream=sys.stdout, + format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', + level=log_level) + logging.debug("Log level set to " + log_level) + + try: + run() + except Exception as e: + logging.exception("Uncaught exception occurred during execution.") + exit(hash(e)) + + if __name__ == '__main__': - run() + main() diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index d749453..0dea683 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -15,19 +15,16 @@ import argparse import logging import os +import sys +from datetime import datetime, timedelta from os import makedirs from os.path import isdir, basename, join, isfile from urllib.request import urlretrieve -from datetime import datetime, timedelta from subscriber import podaac_access as pa __version__ = pa.__version__ -LOGLEVEL = os.environ.get('PODAAC_LOGLEVEL', 'WARNING').upper() -logging.basicConfig(level=LOGLEVEL) -logging.debug("Log level set to " + LOGLEVEL) - page_size = 2000 edl = pa.edl @@ -39,7 +36,9 @@ def get_update_file(data_dir, collection_name): if isfile(data_dir + "/.update__" + collection_name): return data_dir + "/.update__" + collection_name elif isfile(data_dir + "/.update"): - print("WARNING: found a deprecated use of '.update' file at {0}. After this run it will be renamed to {1}".format(data_dir + "/.update", data_dir + "/.update__" + collection_name)) + logging.warning( + "found a deprecated use of '.update' file at {0}. 
After this run it will be renamed to {1}".format( + data_dir + "/.update", data_dir + "/.update__" + collection_name)) return data_dir + "/.update" return None @@ -47,7 +46,8 @@ def get_update_file(data_dir, collection_name): def validate(args): if args.minutes is None and args.startDate is False and args.endDate is False: - raise ValueError("Error parsing command line arguments: one of --start-date, --end-date or --minutes are required") + raise ValueError( + "Error parsing command line arguments: one of --start-date, --end-date or --minutes are required") def create_parser(): @@ -55,32 +55,53 @@ def create_parser(): parser = argparse.ArgumentParser(prog='PO.DAAC data subscriber') # Adding Required arguments - parser.add_argument("-c", "--collection-shortname", dest="collection",required=True, help = "The collection shortname for which you want to retrieve data.") # noqa E501 - parser.add_argument("-d", "--data-dir", dest="outputDirectory", required=True, help = "The directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-c", "--collection-shortname", dest="collection", required=True, + help="The collection shortname for which you want to retrieve data.") # noqa E501 + parser.add_argument("-d", "--data-dir", dest="outputDirectory", required=True, + help="The directory where data products will be downloaded.") # noqa E501 # Adding optional arguments # spatiotemporal arguments - parser.add_argument("-sd", "--start-date", dest="startDate", help = "The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z", default=False) # noqa E501 - parser.add_argument("-ed", "--end-date", dest="endDate", help = "The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z", default=False) # noqa E501 - parser.add_argument("-b", "--bounds", dest="bbox", help = "The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing arguments, to use this command, please use the -b=\"-180,-90,180,90\" syntax when calling from the command line. Default: \"-180,-90,180,90\".", default="-180,-90,180,90") # noqa E501 + parser.add_argument("-sd", "--start-date", dest="startDate", + help="The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z", + default=False) # noqa E501 + parser.add_argument("-ed", "--end-date", dest="endDate", + help="The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z", + default=False) # noqa E501 + parser.add_argument("-b", "--bounds", dest="bbox", + help="The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing arguments, to use this command, please use the -b=\"-180,-90,180,90\" syntax when calling from the command line. Default: \"-180,-90,180,90\".", + default="-180,-90,180,90") # noqa E501 # Arguments for how data are stored locally - much processing is based on # the underlying directory structure (e.g. 
year/Day-of-year) - parser.add_argument("-dc", dest="cycle", action="store_true", help = "Flag to use cycle number for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("-dydoy", dest="dydoy", action="store_true", help = "Flag to use start time (Year/DOY) of downloaded data for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("-dymd", dest="dymd", action="store_true", help = "Flag to use start time (Year/Month/Day) of downloaded data for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("-dy", dest="dy", action="store_true", help = "Flag to use start time (Year) of downloaded data for directory where data products will be downloaded.") # noqa E501 - parser.add_argument("--offset", dest="offset", help = "Flag used to shift timestamp. Units are in hours, e.g. 10 or -10.") # noqa E501 - - parser.add_argument("-m", "--minutes", dest="minutes", help = "How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your cron runs.", type=int, default=None) # noqa E501 - parser.add_argument("-e", "--extensions", dest="extensions", help = "The extensions of products to download. Default is [.nc, .h5, .zip]", default=None, action='append') # noqa E501 - parser.add_argument("--process", dest="process_cmd", help="Processing command to run on each downloaded file (e.g., compression). Can be specified multiple times.", action='append') - - parser.add_argument("--version", action="version", version='%(prog)s ' + __version__, help="Display script version information and exit.") # noqa E501 - parser.add_argument("--verbose", dest="verbose", action="store_true", help="Verbose mode.") # noqa E501 - - parser.add_argument("-p", "--provider", dest="provider", default='POCLOUD', help="Specify a provider for collection search. Default is POCLOUD.") # noqa E501 + parser.add_argument("-dc", dest="cycle", action="store_true", + help="Flag to use cycle number for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-dydoy", dest="dydoy", action="store_true", + help="Flag to use start time (Year/DOY) of downloaded data for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-dymd", dest="dymd", action="store_true", + help="Flag to use start time (Year/Month/Day) of downloaded data for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("-dy", dest="dy", action="store_true", + help="Flag to use start time (Year) of downloaded data for directory where data products will be downloaded.") # noqa E501 + parser.add_argument("--offset", dest="offset", + help="Flag used to shift timestamp. Units are in hours, e.g. 10 or -10.") # noqa E501 + + parser.add_argument("-m", "--minutes", dest="minutes", + help="How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your cron runs.", + type=int, default=None) # noqa E501 + parser.add_argument("-e", "--extensions", dest="extensions", + help="The extensions of products to download. Default is [.nc, .h5, .zip]", default=None, + action='append') # noqa E501 + parser.add_argument("--process", dest="process_cmd", + help="Processing command to run on each downloaded file (e.g., compression). 
Can be specified multiple times.", + action='append') + + parser.add_argument("--version", action="version", version='%(prog)s ' + __version__, + help="Display script version information and exit.") # noqa E501 + parser.add_argument("--verbose", dest="verbose", action="store_true", help="Verbose mode.") # noqa E501 + + parser.add_argument("-p", "--provider", dest="provider", default='POCLOUD', + help="Specify a provider for collection search. Default is POCLOUD.") # noqa E501 return parser @@ -92,8 +113,8 @@ def run(): pa.validate(args) validate(args) except ValueError as v: - print(v) - exit() + logging.error(str(v)) + exit(1) pa.setup_earthdata_login_auth(edl) token = pa.get_token(token_url, 'podaac-subscriber', edl) @@ -136,7 +157,7 @@ def run(): # This cell will replace the timestamp above with the one read from the `.update` file in the data directory, if it exists. if not isdir(data_path): - print("NOTE: Making new data directory at " + data_path + "(This is the first run.)") + logging.info("NOTE: Making new data directory at " + data_path + "(This is the first run.)") makedirs(data_path, exist_ok=True) else: @@ -145,11 +166,12 @@ def run(): try: with open(update_file, "r") as f: data_within_last_timestamp = f.read().strip() - print("NOTE: Update found in the data directory. (The last run was at " + data_within_last_timestamp + ".)") + logging.info( + "NOTE: Update found in the data directory. (The last run was at " + data_within_last_timestamp + ".)") except FileNotFoundError: - print("WARN: No .update in the data directory. (Is this the first run?)") + logging.warning("No .update in the data directory. (Is this the first run?)") else: - print("WARN: No .update__" + short_name + " in the data directory. (Is this the first run?)") + logging.warning("No .update__" + short_name + " in the data directory. (Is this the first run?)") # Change this to whatever extent you need. 
Format is W Longitude,S Latitude,E Longitude,N Latitude bounding_extent = args.bbox @@ -163,7 +185,8 @@ def run(): if defined_time_range: # if(data_since): - temporal_range = pa.get_temporal_range(start_date_time, end_date_time, datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) # noqa E501 + temporal_range = pa.get_temporal_range(start_date_time, end_date_time, + datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) # noqa E501 params = { 'scroll': "true", @@ -190,16 +213,17 @@ def run(): } if args.verbose: - print("Temporal Range: " + temporal_range) + logging.info("Temporal Range: " + temporal_range) if args.verbose: - print("Provider: " + provider) - print("Updated Since: " + data_within_last_timestamp) + logging.info("Provider: " + provider) + logging.info("Updated Since: " + data_within_last_timestamp) results = pa.get_search_results(args, params) if args.verbose: - print(str(results['hits'])+" new granules found for "+short_name+" since "+data_within_last_timestamp) # noqa E501 + logging.info(str(results[ + 'hits']) + " new granules found for " + short_name + " since " + data_within_last_timestamp) # noqa E501 if any([args.dy, args.dydoy, args.dymd]): file_start_times = pa.parse_start_times(results) @@ -209,8 +233,11 @@ def run(): timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") downloads_all = [] - downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in results['items']] - downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in results['items']] + downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if + u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in + results['items']] + downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in + results['items']] for f in downloads_data: downloads_all.append(f) @@ -220,7 +247,8 @@ def run(): downloads = [item for sublist in downloads_all for item in sublist] if len(downloads) >= page_size: - print("Warning: only the most recent " + str(page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.") + logging.warning("Only the most recent " + str( + page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.") # filter list based on extension if not extensions: @@ -236,9 +264,9 @@ def run(): # https://github.com/podaac/data-subscriber/issues/33 # Make this a non-verbose message # if args.verbose: - print("Found " + str(len(downloads)) + " total files to download") + logging.info("Found " + str(len(downloads)) + " total files to download") if args.verbose: - print("Downloading files with extensions: " + str(extensions)) + logging.info("Downloading files with extensions: " + str(extensions)) # NEED TO REFACTOR THIS, A LOT OF STUFF in here # Finish by downloading the files to the data directory in a loop. 
@@ -258,12 +286,11 @@ def run(): cycles, data_path, f) urlretrieve(f, output_path) pa.process_file(process_cmd, output_path, args) - print(str(datetime.now()) + " SUCCESS: " + f) + logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 except Exception as e: - print(str(datetime.now()) + " FAILURE: " + f) + logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True) failure_cnt = failure_cnt + 1 - print(e) # If there were updates to the local time series during this run and no # exceptions were raised during the download loop, then overwrite the @@ -274,12 +301,26 @@ def run(): with open(data_path + "/.update__" + short_name, "w") as f: f.write(timestamp) - print("Downloaded: " + str(success_cnt) + " files\n") - print("Files Failed to download:" + str(failure_cnt) + "\n") + logging.info("Downloaded: " + str(success_cnt) + " files\n") + logging.info("Files Failed to download:" + str(failure_cnt) + "\n") pa.delete_token(token_url, token) - print("END \n\n") + logging.info("END \n\n") exit(0) +def main(): + log_level = os.environ.get('PODAAC_LOGLEVEL', 'INFO').upper() + logging.basicConfig(stream=sys.stdout, + format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', + level=log_level) + logging.debug("Log level set to " + log_level) + + try: + run() + except Exception as e: + logging.exception("Uncaught exception occurred during execution.") + exit(hash(e)) + + if __name__ == '__main__': - run() + main() From f1e750af94299fda208dab421fef84dbd8fd0693 Mon Sep 17 00:00:00 2001 From: Frank Greguska Date: Wed, 23 Mar 2022 16:39:58 -0700 Subject: [PATCH 09/22] Fix flake errors --- subscriber/podaac_access.py | 2 +- subscriber/podaac_data_downloader.py | 2 +- subscriber/podaac_data_subscriber.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index a082cd1..a4a1edf 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -253,7 +253,7 @@ def process_file(process_cmd, output_path, args): if args.verbose: logging.info(f'Running: {cmd} {output_path}') subprocess.run(cmd.split() + [output_path], - check=True, stdout=PIPE, stderr=PIPE) + check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) def get_temporal_range(start, end, now): diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 83e9225..0bb1666 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -258,7 +258,7 @@ def run(): pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 - except Exception as e: + except Exception: logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True) failure_cnt = failure_cnt + 1 diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 0dea683..4861c03 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -288,7 +288,7 @@ def run(): pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 - except Exception as e: + except Exception: logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True) failure_cnt = failure_cnt + 1 From 5d9d39b752881cf0dd0fa6d69809f7de91039c7b Mon Sep 17 00:00:00 2001 From: Frank Greguska Date: Wed, 23 Mar 2022 17:52:45 -0700 Subject: [PATCH 10/22] Add retry logic for 500 and 401 errors from CMR --- 
CHANGELOG.md | 2 ++ requirements.txt | 1 + subscriber/podaac_access.py | 21 +++++++++++++++++++++ subscriber/podaac_data_downloader.py | 12 +++++++++++- subscriber/podaac_data_subscriber.py | 12 +++++++++++- 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdeceb8..e7e6b8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Unreleased] ### Added ### Changed +- Retry CMR queries on server error using random exponential backoff max 60 seconds and 10 retries +- Refresh token if CMR returns 401 error - Converted print statements to log statements ### Deprecated ### Removed diff --git a/requirements.txt b/requirements.txt index b63590e..9319567 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ chardet==4.0.0 idna==2.10 requests==2.25.1 urllib3>=1.26.5 +tenacity>=8.0.1 \ No newline at end of file diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index a4a1edf..cb652da 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -7,11 +7,19 @@ from os import makedirs from os.path import isdir, basename, join, splitext from urllib import request +from typing import Dict +from urllib import request +from urllib.error import HTTPError +import subprocess from urllib.parse import urlencode from urllib.request import urlopen import requests +import requests +import tenacity +from datetime import datetime + __version__ = "1.8.0" extensions = [".nc", ".h5", ".zip", ".tar.gz"] edl = "urs.earthdata.nasa.gov" @@ -113,6 +121,12 @@ def delete_token(url: str, token: str) -> None: logging.warning("Error deleting the token") +def refresh_token(old_token: str, client_id: str): + setup_earthdata_login_auth(edl) + delete_token(token_url, old_token) + return get_token(token_url, client_id, edl) + + def validate(args): bounds = args.bbox.split(',') if len(bounds) != 4: @@ -270,6 +284,13 @@ def get_temporal_range(start, end, now): raise ValueError("One of start-date or end-date must be specified.") +# Retry using random exponential backoff if a 500 error is raised. Maximum 10 attempts. 
+@tenacity.retry(wait=tenacity.wait_random_exponential(multiplier=1, max=60),
+                stop=tenacity.stop_after_attempt(10),
+                reraise=True,
+                retry=(tenacity.retry_if_exception_type(HTTPError) & tenacity.retry_if_exception(
+                    lambda exc: exc.code == 500))
+                )
 def get_search_results(args, params):
     # Get the query parameters as a string and then the complete search url:
     query = urlencode(params)
diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py
index 0bb1666..6b99f20 100644
--- a/subscriber/podaac_data_downloader.py
+++ b/subscriber/podaac_data_downloader.py
@@ -6,6 +6,7 @@
 from datetime import datetime, timedelta
 from os import makedirs
 from os.path import isdir, basename, join
+from urllib.error import HTTPError
 from urllib.request import urlretrieve
 
 from subscriber import podaac_access as pa
@@ -192,7 +193,16 @@ def run():
     if args.verbose:
         logging.info("Provider: " + provider)
 
-    results = pa.get_search_results(args, params)
+    # If 401 is raised, refresh token and try one more time
+    try:
+        results = pa.get_search_results(args, params)
+    except HTTPError as e:
+        if e.code == 401:
+            token = pa.refresh_token(token, 'podaac-subscriber')
+            params['token'] = token
+            results = pa.get_search_results(args, params)
+        else:
+            raise e
 
     if args.verbose:
         logging.info(str(results['hits']) + " granules found for " + short_name)  # noqa E501
diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py
index 4861c03..7ec7769 100755
--- a/subscriber/podaac_data_subscriber.py
+++ b/subscriber/podaac_data_subscriber.py
@@ -19,6 +19,7 @@
 from datetime import datetime, timedelta
 from os import makedirs
 from os.path import isdir, basename, join, isfile
+from urllib.error import HTTPError
 from urllib.request import urlretrieve
 
 from subscriber import podaac_access as pa
@@ -219,7 +220,16 @@ def run():
         logging.info("Provider: " + provider)
         logging.info("Updated Since: " + data_within_last_timestamp)
 
-    results = pa.get_search_results(args, params)
+    # If 401 is raised, refresh token and try one more time
+    try:
+        results = pa.get_search_results(args, params)
+    except HTTPError as e:
+        if e.code == 401:
+            token = pa.refresh_token(token, 'podaac-subscriber')
+            params['token'] = token
+            results = pa.get_search_results(args, params)
+        else:
+            raise e
 
     if args.verbose:
         logging.info(str(results[
From 3d8ce727f96ec0bd0da099bc7a1ae3363067e884 Mon Sep 17 00:00:00 2001
From: Wilbert Veit
Date: Mon, 25 Apr 2022 02:37:52 -0700
Subject: [PATCH 11/22] Subscriber check if file exists before downloading

Prevents re-downloading files (e.g. in case a previous run failed because of
other file failures).

If the subscriber sees that a file already exists, it will also calculate the
file checksum and check whether it matches the checksum in CMR. If the
checksum doesn't match, it will re-download.

There is now a --force/-f option that will cause the subscriber to
re-download even if the file exists and is up to date.
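For reference, a condensed sketch of the skip decision described above. `make_checksum` mirrors the helper this patch adds; `should_skip_download` is an illustrative wrapper name only (in the patch itself the check is done inline using `exists`, `args.force`, and `checksum_does_match`).

```python
import hashlib
from os.path import basename, exists


def make_checksum(file_path, algorithm):
    # Hash the file in 4 KiB chunks using the algorithm CMR reported (e.g. MD5, SHA512).
    h = getattr(hashlib, algorithm.lower())()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            h.update(chunk)
    return h.hexdigest()


def should_skip_download(output_path, checksums, force=False):
    # Skip only when --force is not set, the file is already on disk, and its
    # checksum matches the value CMR reported for that filename.
    if force or not exists(output_path):
        return False
    cmr_checksum = checksums.get(basename(output_path))
    if not cmr_checksum:
        return False
    return make_checksum(output_path, cmr_checksum["Algorithm"]) == cmr_checksum["Value"]
```

Files with no checksum entry in CMR never match, so they are always re-downloaded, which is the same conservative behavior as the patch.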
Issue #17 --- CHANGELOG.md | 1 + Subscriber.md | 21 +++- subscriber/podaac_data_subscriber.py | 101 +++++++++++++++- tests/test_subscriber_extracting_checksums.py | 113 ++++++++++++++++++ tests/test_subscriber_matching_checksums.py | 72 +++++++++++ 5 files changed, 299 insertions(+), 9 deletions(-) create mode 100644 tests/test_subscriber_extracting_checksums.py create mode 100644 tests/test_subscriber_matching_checksums.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f90b329..21544b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## Unreleased ### Added +- check if file exists before downloading a file. [17](https://github.com/podaac/data-subscriber/issues/17) ### Changed ### Deprecated ### Removed diff --git a/Subscriber.md b/Subscriber.md index eab2c7a..175f0c6 100644 --- a/Subscriber.md +++ b/Subscriber.md @@ -6,7 +6,7 @@ For installation and dependency information, please see the [top-level README](R ``` $> podaac-data-subscriber -h -usage: PO.DAAC data subscriber [-h] -c COLLECTION -d OUTPUTDIRECTORY [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-m MINUTES] [-e EXTENSIONS] [--process PROCESS_CMD] [--version] [--verbose] [-p PROVIDER] +usage: PO.DAAC data subscriber [-h] -c COLLECTION -d OUTPUTDIRECTORY [-f] [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-m MINUTES] [-e EXTENSIONS] [--process PROCESS_CMD] [--version] [--verbose] [-p PROVIDER] optional arguments: -h, --help show this help message and exit @@ -14,6 +14,7 @@ optional arguments: The collection shortname for which you want to retrieve data. -d OUTPUTDIRECTORY, --data-dir OUTPUTDIRECTORY The directory where data products will be downloaded. + -f, --force Flag to force downloading files that are listed in CMR query, even if the file exists and checksum matches -sd STARTDATE, --start-date STARTDATE The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z -ed ENDDATE, --end-date ENDDATE @@ -37,12 +38,11 @@ optional arguments: Specify a provider for collection search. Default is POCLOUD. ``` -##Run the Script +## Run the Script Usage: ``` -usage: podaac_data_subscriber.py [-h] -c COLLECTION -d OUTPUTDIRECTORY [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] - [-m MINUTES] [-e EXTENSIONS] [--version] [--verbose] [-p PROVIDER] +usage: podaac_data_subscriber.py [-h] -c COLLECTION -d OUTPUTDIRECTORY [-f] [-sd STARTDATE] [-ed ENDDATE] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-m MINUTES] [-e EXTENSIONS] [--version] [--verbose] [-p PROVIDER] ``` To run the script, the following parameters are required: @@ -112,6 +112,19 @@ machine urs.earthdata.nasa.gov **If the script cannot find the netrc file, you will be prompted to enter the username and password and the script wont be able to generate the CMR token** +## How the subscriber handles download failures + +If any downloads fail while the subscriber is running (e.g. a network failure), the subscriber will not record the current run as complete. However many of the data-files may have successfully downloaded. It could take a lot of extra time to re-download files the next time subscriber is run. + +Therefore, to prevent unnecessary re-downloading of files, the subscriber does a check before downloading each file. 
It checks if the file already exists in the output directory, and if the file is up to date (using the checksum). If the file is already there (and up to date), the default behavior is for the subscriber to skip downloading that file. + +You can override this default behavior - forcing the subscriber to always download files that show up in the search step, by using --force/-f. + +``` +podaac-data-subscriber -c SENTINEL-1A_SLC -d myData -f +``` + + ## Advanced Usage ### Request data from another DAAC... diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index d749453..055a65d 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -16,9 +16,10 @@ import logging import os from os import makedirs -from os.path import isdir, basename, join, isfile +from os.path import isdir, basename, join, isfile, exists from urllib.request import urlretrieve from datetime import datetime, timedelta +import hashlib from subscriber import podaac_access as pa @@ -59,6 +60,7 @@ def create_parser(): parser.add_argument("-d", "--data-dir", dest="outputDirectory", required=True, help = "The directory where data products will be downloaded.") # noqa E501 # Adding optional arguments + parser.add_argument("-f", "--force", dest="force", action="store_true", help = "Flag to force downloading files that are listed in CMR query, even if the file exists and checksum matches") # noqa E501 # spatiotemporal arguments parser.add_argument("-sd", "--start-date", dest="startDate", help = "The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z", default=False) # noqa E501 @@ -84,6 +86,86 @@ def create_parser(): return parser +def extract_checksums(granule_results): + """ + Create a dictionary containing checksum information from files. + + Parameters + ---------- + granule_results : dict + The cmr granule search results (umm_json format) + + Returns + ------- + A dictionary where the keys are filenames and the values are + checksum information (checksum value and checksum algorithm). + + For Example: + { + "some-granule-name.nc": { + "Value": "d96387295ea979fb8f7b9aa5f231c4ab", + "Algorithm": "MD5" + }, + "some-granule-name.nc.md5": { + "Value": '320876f087da0876edc0876ab0876b7a", + "Algorithm": "MD5" + }, + ... 
+ } + """ + checksums = {} + for granule in granule_results["items"]: + try: + items = granule["umm"]["DataGranule"]["ArchiveAndDistributionInformation"] + for item in items: + try: + checksums[item["Name"]] = item["Checksum"] + except: + pass + except: + pass + return checksums + + +def checksum_does_match(file_path, checksums): + """ + Checks if a file's checksum matches a checksum in the checksums dict + + Parameters + ---------- + file_path : string + The relative or absolute path to an existing file + + checksums: dict + A dictionary where keys are filenames (not including the path) + and values are checksum information (checksum value and checksum algorithm) + + Returns + ------- + True - if the file's checksum matches a checksum in the checksum dict + False - if the file doesn't have a checksum, or if the checksum doesn't match + """ + filename = basename(file_path) + checksum = checksums.get(filename) + if not checksum: + return False + return make_checksum(file_path, checksum["Algorithm"]) == checksum["Value"] + + +def make_checksum(file_path, algorithm): + """ + Create checksum of file using the specified algorithm + """ + # Based on https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file#answer-3431838 + # with modification to handle multiple algorithms + hash = getattr(hashlib, algorithm.lower())() + + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + hash.update(chunk) + return hash.hexdigest() + + def run(): parser = create_parser() args = parser.parse_args() @@ -211,6 +293,7 @@ def run(): downloads_all = [] downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in results['items']] downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in results['items']] + checksums = extract_checksums(results) for f in downloads_data: downloads_all.append(f) @@ -243,7 +326,7 @@ def run(): # NEED TO REFACTOR THIS, A LOT OF STUFF in here # Finish by downloading the files to the data directory in a loop. # Overwrite `.update` with a new timestamp on success. - success_cnt = failure_cnt = 0 + success_cnt = failure_cnt = skip_cnt = 0 for f in downloads: try: # -d flag, args.outputDirectory @@ -256,6 +339,13 @@ def run(): if args.cycle: output_path = pa.prepare_cycles_output( cycles, data_path, f) + + # decide if we should actually download this file (e.g. 
we may already have the latest version) + if(exists(output_path) and not args.force and checksum_does_match(output_path, checksums)): + print(str(datetime.now()) + " SKIPPED: " + f) + skip_cnt += 1 + continue + urlretrieve(f, output_path) pa.process_file(process_cmd, output_path, args) print(str(datetime.now()) + " SUCCESS: " + f) @@ -274,10 +364,11 @@ def run(): with open(data_path + "/.update__" + short_name, "w") as f: f.write(timestamp) - print("Downloaded: " + str(success_cnt) + " files\n") - print("Files Failed to download:" + str(failure_cnt) + "\n") + print("\nDownloaded Files: " + str(success_cnt)) + print("Failed Files: " + str(failure_cnt)) + print("Skipped Files: " + str(skip_cnt) + "\n") pa.delete_token(token_url, token) - print("END \n\n") + print("\nEND\n\n") exit(0) diff --git a/tests/test_subscriber_extracting_checksums.py b/tests/test_subscriber_extracting_checksums.py new file mode 100644 index 0000000..2aa6a44 --- /dev/null +++ b/tests/test_subscriber_extracting_checksums.py @@ -0,0 +1,113 @@ +from subscriber.podaac_data_subscriber import extract_checksums +import json + +minimal_granule_search_results = """{ + "hits": 13, + "took": 51, + "items": [ + { + "umm": { + "DataGranule": { + "ArchiveAndDistributionInformation": [ + { + "SizeUnit": "MB", + "Size": 4.312029838562012, + "Checksum": { + "Value": "d96387295ea979fb8f7b9aa5f231c4ab", + "Algorithm": "MD5" + }, + "SizeInBytes": 4521491, + "Name": "20211231000000-REMSS-L3U_GHRSST-SSTsubskin-AMSR2-f34_20211231v8-v02.0-fv01.0.nc" + }, + { + "SizeUnit": "MB", + "Size": 1.068115234375e-4, + "Checksum": { + "Value": "8704789dd2cad4554481f6e438acb376", + "Algorithm": "MD5" + }, + "SizeInBytes": 112, + "Name": "20211231000000-REMSS-L3U_GHRSST-SSTsubskin-AMSR2-f34_20211231v8-v02.0-fv01.0.nc.md5" + } + ] + } + } + }, + { + "umm": { + "DataGranule": { + "ArchiveAndDistributionInformation": [ + { + "SizeUnit": "MB", + "Size": 4.267633438110352, + "SizeInBytes": 4474938, + "Name": "this-shouldnt-be-counted-because-theres-no-checksum-info.nc" + } + ] + } + } + }, + { + "umm": { + "DataGranule": { + "ArchiveAndDistributionInformation": [ + { + "SizeUnit": "MB", + "Size": 4.267633438110352, + "SizeInBytes": 4474938, + "Name": "this-also-shouldnt-be-counted-because-no-checksum-info.nc" + }, + { + "SizeUnit": "MB", + "Size": 4.267633438110352, + "Checksum": { + "Value": "98d330cad6d1233c258178bcc07102d6", + "Algorithm": "MD5" + }, + "SizeInBytes": 4474938, + "Name": "this-should-be-counted.nc" + } + ] + } + } + }, + { + "umm": { + "DataGranule": { + "ArchiveAndDistributionInformation": [ + { + "SizeUnit": "MB", + "Size": 4.267633438110352, + "Checksum": { + "Value": "98d330cad6d1233c258178bcc07102d6", + "Algorithm": "MD5" + }, + "SizeInBytes": 4474938, + "Name": "20220101000000-REMSS-L3U_GHRSST-SSTsubskin-AMSR2-f34_20220101v8-v02.0-fv01.0.nc" + }, + { + "SizeUnit": "MB", + "Size": 1.068115234375e-4, + "Checksum": { + "Value": "667a931589ec574acbf8791b73aeff1a", + "Algorithm": "MD5" + }, + "SizeInBytes": 112, + "Name": "20220101000000-REMSS-L3U_GHRSST-SSTsubskin-AMSR2-f34_20220101v8-v02.0-fv01.0.nc.md5" + } + ] + } + } + } + ] +} +""" + +def test_extract_checksums(): + checksums = extract_checksums(json.loads(minimal_granule_search_results)) + assert checksums["20211231000000-REMSS-L3U_GHRSST-SSTsubskin-AMSR2-f34_20211231v8-v02.0-fv01.0.nc"] == { + "Value": "d96387295ea979fb8f7b9aa5f231c4ab", + "Algorithm": "MD5" + } + assert len(checksums) == 5 + diff --git a/tests/test_subscriber_matching_checksums.py 
b/tests/test_subscriber_matching_checksums.py new file mode 100644 index 0000000..cc19f78 --- /dev/null +++ b/tests/test_subscriber_matching_checksums.py @@ -0,0 +1,72 @@ +from subscriber.podaac_data_subscriber import checksum_does_match + +def test_checksum_does_match__positive_match_md5(tmpdir): + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "f83f9ad1718d9b95220ddd6b18dbcecf", + "Algorithm": "MD5" + } + } + + with open(output_path, 'w') as f: + f.write("This is a temporary test file\n") + + assert checksum_does_match(output_path, checksums) + + +def test_checksum_does_match__negative_match_md5(tmpdir): + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "f83f9ad1718d9b95220ddd6b18dbcecf", + "Algorithm": "MD5" + } + } + + with open(output_path, 'w') as f: + f.write("This is a different temporary test file\n") + + assert not checksum_does_match(output_path, checksums) + + +def test_checksum_does_match__positive_match_sha512(tmpdir): + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "3f5bda96115a5d8fcbcbd71bc28ade2de24bba5f48ce485012f933c877d279d78be3ad028f69af620325a010ce34bd19be78c8b6bf083b0d523165ede8669483", + "Algorithm": "SHA512" + } + } + + with open(output_path, 'w') as f: + f.write("This is a temporary test file\n") + + assert checksum_does_match(output_path, checksums) + + +def test_checksum_does_match__negative_match_sha512(tmpdir): + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "3f5bda96115a5d8fcbcbd71bc28ade2de24bba5f48ce485012f933c877d279d78be3ad028f69af620325a010ce34bd19be78c8b6bf083b0d523165ede8669483", + "Algorithm": "SHA512" + } + } + + with open(output_path, 'w') as f: + f.write("This is a different temporary test file\n") + + assert not checksum_does_match(output_path, checksums) + + +def test_checksum_does_match__with_no_checksum(tmpdir): + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": None + } + + with open(output_path, 'w') as f: + f.write("This is a temporary test file\n") + + assert not checksum_does_match(output_path, checksums) \ No newline at end of file From d1161f7f592551079a2f9ed1baadea4dcc4b88e3 Mon Sep 17 00:00:00 2001 From: mike-gangl <59702631+mike-gangl@users.noreply.github.com> Date: Tue, 26 Apr 2022 08:30:48 -0700 Subject: [PATCH 12/22] Issues/15 (#65) * updated get_search to include verbose option, not entire 'args' option * added search after functionality to podaac access; removed scroll from initial parameters * updated changelog * closes #15 --- CHANGELOG.md | 1 + subscriber/podaac_access.py | 32 +++++++++++++++++++++++----- subscriber/podaac_data_downloader.py | 6 ++---- subscriber/podaac_data_subscriber.py | 16 ++++++-------- tests/test_subscriber.py | 16 ++++++++++++++ 5 files changed, 53 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f62f1db..2a8366c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Added - check if file exists before downloading a file. 
[17](https://github.com/podaac/data-subscriber/issues/17) ### Changed +- Implemented Search After CMR interface to allow granule listings > 2000 [15](https://github.com/podaac/data-subscriber/issues/15) - Retry CMR queries on server error using random exponential backoff max 60 seconds and 10 retries - Refresh token if CMR returns 401 error - Converted print statements to log statements diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index cb652da..f5d9399 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -12,7 +12,7 @@ from urllib.error import HTTPError import subprocess from urllib.parse import urlencode -from urllib.request import urlopen +from urllib.request import Request, urlopen import requests @@ -291,18 +291,40 @@ def get_temporal_range(start, end, now): retry=(tenacity.retry_if_exception_type(HTTPError) & tenacity.retry_if_exception( lambda exc: exc.code == 500)) ) -def get_search_results(args, params): +def get_search_results(params, verbose=False): # Get the query parameters as a string and then the complete search url: query = urlencode(params) url = "https://" + cmr + "/search/granules.umm_json?" + query - if args.verbose: + if verbose: logging.info(url) # Get a new timestamp that represents the UTC time of the search. # Then download the records in `umm_json` format for granules # that match our search parameters: - with urlopen(url) as f: - results = json.loads(f.read().decode()) + results = None + search_after_header = None + while True: + # Build the request, add the search after header to it if it's not None (e.g. after the first iteration) + req = Request(url) + if search_after_header is not None: + req.add_header('CMR-Search-After', search_after_header) + response = urlopen(req) + + # Build the results object, load entire result if it's the first time. + if results is None: + results = json.loads(response.read().decode()) + # if not the first time, add the new items to the existing array + else: + results['items'].extend(json.loads(response.read().decode())['items']) + + # get the new Search After header, if it's not set, we have all the results and we're done. + search_after_header = None + search_after_header = response.info()['CMR-Search-After'] + if search_after_header is not None: + logging.debug("Search After response header defined, paging CMR for more data.") + else: + break + # return all of the paged CMR results. 
return results diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 6b99f20..f2c1420 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -161,7 +161,6 @@ def run(): if search_cycles is not None: cmr_cycles = search_cycles params = [ - ('scroll', "true"), ('page_size', page_size), ('sort_key', "-start_date"), ('provider', provider), @@ -178,7 +177,6 @@ def run(): temporal_range = pa.get_temporal_range(start_date_time, end_date_time, datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) # noqa E501 params = { - 'scroll': "true", 'page_size': page_size, 'sort_key': "-start_date", 'provider': provider, @@ -195,12 +193,12 @@ def run(): # If 401 is raised, refresh token and try one more time try: - results = pa.get_search_results(args, params) + results = pa.get_search_results(params, args.verbose) except HTTPError as e: if e.code == 401: token = pa.refresh_token(token, 'podaac-subscriber') params['token'] = token - results = pa.get_search_results(args, params) + results = pa.get_search_results(params, args.verbose) else: raise e diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 4b10be3..ad3549e 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -124,9 +124,9 @@ def extract_checksums(granule_results): For Example: { - "some-granule-name.nc": { - "Value": "d96387295ea979fb8f7b9aa5f231c4ab", - "Algorithm": "MD5" + "some-granule-name.nc": { + "Value": "d96387295ea979fb8f7b9aa5f231c4ab", + "Algorithm": "MD5" }, "some-granule-name.nc.md5": { "Value": '320876f087da0876edc0876ab0876b7a", @@ -157,7 +157,7 @@ def checksum_does_match(file_path, checksums): ---------- file_path : string The relative or absolute path to an existing file - + checksums: dict A dictionary where keys are filenames (not including the path) and values are checksum information (checksum value and checksum algorithm) @@ -272,7 +272,6 @@ def run(): datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) # noqa E501 params = { - 'scroll': "true", 'page_size': page_size, 'sort_key': "-start_date", 'provider': provider, @@ -284,7 +283,6 @@ def run(): if defined_time_range: params = { - 'scroll': "true", 'page_size': page_size, 'sort_key': "-start_date", 'provider': provider, @@ -304,12 +302,12 @@ def run(): # If 401 is raised, refresh token and try one more time try: - results = pa.get_search_results(args, params) + results = pa.get_search_results(params, args.verbose) except HTTPError as e: if e.code == 401: token = pa.refresh_token(token, 'podaac-subscriber') params['token'] = token - results = pa.get_search_results(args, params) + results = pa.get_search_results(params, args.verbose) else: raise e @@ -325,7 +323,7 @@ def run(): timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") downloads_all = [] - + downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in results['items']] diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 675c2b4..2e9d4cb 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -24,6 +24,22 @@ def cleanup_update_test(): shutil.rmtree(data_dir_with_updates) +def test_search_after(): + # cmr query: 
https://cmr.earthdata.nasa.gov/search/granules.umm_json?page_size=2000&sort_key=-start_date&provider=POCLOUD&ShortName=JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F&temporal=2000-01-01T10%3A00%3A00Z%2C2022-04-15T00%3A00%3A00Z&bounding_box=-180%2C-90%2C180%2C90 + # requires page-After + # ends up with 3748 granules + params = { + 'page_size': 2000, + 'sort_key': "-start_date", + 'provider': "POCLOUD", + 'ShortName': "JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F", + 'temporal': "2000-01-01T10:00:00Z,2022-04-15T00:00:00Z", + 'bounding_box': "-180,-90,180,90", + } + results = pa.get_search_results(params, True) + assert results['hits'] == 3748 + assert len(results['items']) == 3748 + def test_update_format_change(cleanup_update_test): print("Running Test") data_dir_with_updates = "./test_update_format_change" From 3ce5bfeb585c979a3faab01e356b98981ae2c8be Mon Sep 17 00:00:00 2001 From: mike-gangl <59702631+mike-gangl@users.noreply.github.com> Date: Tue, 26 Apr 2022 14:08:52 -0700 Subject: [PATCH 13/22] Update python-app.yml added netrc creation for future use of regression tests. --- .github/workflows/python-app.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 8b214f6..c8a5f11 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -34,3 +34,9 @@ jobs: - name: Test with pytest run: | pytest + - name: netrc-gen + uses: extractions/netrc@v1 + with: + machine: urs.earthdata.nasa.gov + username: ${{ secrets.EDL_OPS_USERNAME }} + password: ${{ secrets.EDL_OPS_PASSWORD }} From 60f2bceb66d6ecfaf243d61715c1888e59be60cd Mon Sep 17 00:00:00 2001 From: Wilbert Veit Date: Wed, 27 Apr 2022 16:59:00 -0700 Subject: [PATCH 14/22] Add checks for pre-existing files to downloader (#67) * Check if file exists before download - downloader * Update documentation Co-authored-by: Wilbert Veit --- Downloader.md | 24 +++++- Subscriber.md | 28 +++--- subscriber/podaac_access.py | 82 ++++++++++++++++++ subscriber/podaac_data_downloader.py | 21 +++-- subscriber/podaac_data_subscriber.py | 85 +------------------ tests/test_subscriber_extracting_checksums.py | 2 +- tests/test_subscriber_matching_checksums.py | 2 +- 7 files changed, 138 insertions(+), 106 deletions(-) diff --git a/Downloader.md b/Downloader.md index cd46f4f..78b3af6 100644 --- a/Downloader.md +++ b/Downloader.md @@ -6,9 +6,7 @@ For installation and dependency information, please see the [top-level README](R ``` $> podaac-data-downloader -h -usage: PO.DAAC bulk-data downloader [-h] -c COLLECTION -d OUTPUTDIRECTORY [--cycle SEARCH_CYCLES] [-sd STARTDATE] [-ed ENDDATE] - [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-e EXTENSIONS] [--process PROCESS_CMD] - [--version] [--verbose] [-p PROVIDER] [--limit LIMIT] +usage: PO.DAAC bulk-data downloader [-h] -c COLLECTION -d OUTPUTDIRECTORY [--cycle SEARCH_CYCLES] [-sd STARTDATE] [-ed ENDDATE] [-f] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-e EXTENSIONS] [--process PROCESS_CMD] [--version] [--verbose] [-p PROVIDER] [--limit LIMIT] optional arguments: -h, --help show this help message and exit @@ -22,6 +20,8 @@ optional arguments: The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z -ed ENDDATE, --end-date ENDDATE The ISO date time after which data should be retrieved. 
For Example, --end-date 2021-01-14T00:00:00Z + -f, --force + Flag to force downloading files that are listed in CMR query, even if the file exists and checksum matches -b BBOX, --bounds BBOX The bounding rectangle to filter result in. Format is W Longitude,S Latitude,E Longitude,N Latitude without spaces. Due to an issue with parsing arguments, to use this command, please use the -b="-180,-90,180,90" syntax @@ -50,7 +50,7 @@ optional arguments: Usage: ``` -usage: PO.DAAC bulk-data downloader [-h] -c COLLECTION -d OUTPUTDIRECTORY [--cycle SEARCH_CYCLES] [-sd STARTDATE] [-ed ENDDATE] +usage: PO.DAAC bulk-data downloader [-h] -c COLLECTION -d OUTPUTDIRECTORY [--cycle SEARCH_CYCLES] [-sd STARTDATE] [-ed ENDDATE] [-f] [-b BBOX] [-dc] [-dydoy] [-dymd] [-dy] [--offset OFFSET] [-e EXTENSIONS] [--process PROCESS_CMD] [--version] [--verbose] [-p PROVIDER] [--limit LIMIT] ``` @@ -163,6 +163,22 @@ The subscriber allows the placement of downloaded files into one of several dire * -dymd - optional, relative paths use the start time of a granule to layout data in a YEAR/MONTH/DAY path +### Downloader behavior when a file already exists + +By default, when the downloader is about to download a file, it first: +- Checks if the file already exists in the target location +- Creates a checksum for the file and sees if it matches the checksum for that file in CMR + +If the file already exists AND the checksum matches, the downloader will skip downloading that file. + +This can drastically reduce the time for the downloader to complete. Also, since the checksum is verified, files will still be re-downloaded if for some reason the file has changed (or the file already on disk is corrupted). + +You can override this default behavior - forcing the downloader to always download matching files, by using --force/-f. + +``` +podaac-data-downloader -c SENTINEL-1A_SLC -d myData -f +``` + ### Setting a bounding rectangle for filtering results If you're interested in a specific region, you can set the bounds parameter on your request to filter data that passes through a certain area. This is useful in particular for non-global datasets (such as swath datasets) with non-global coverage per file. diff --git a/Subscriber.md b/Subscriber.md index 175f0c6..0cd5e3f 100644 --- a/Subscriber.md +++ b/Subscriber.md @@ -112,18 +112,6 @@ machine urs.earthdata.nasa.gov **If the script cannot find the netrc file, you will be prompted to enter the username and password and the script wont be able to generate the CMR token** -## How the subscriber handles download failures - -If any downloads fail while the subscriber is running (e.g. a network failure), the subscriber will not record the current run as complete. However many of the data-files may have successfully downloaded. It could take a lot of extra time to re-download files the next time subscriber is run. - -Therefore, to prevent unnecessary re-downloading of files, the subscriber does a check before downloading each file. It checks if the file already exists in the output directory, and if the file is up to date (using the checksum). If the file is already there (and up to date), the default behavior is for the subscriber to skip downloading that file. - -You can override this default behavior - forcing the subscriber to always download files that show up in the search step, by using --force/-f. 
- -``` -podaac-data-subscriber -c SENTINEL-1A_SLC -d myData -f -``` - ## Advanced Usage @@ -154,6 +142,22 @@ The subscriber allows the placement of downloaded files into one of several dire * -dydoy - optional, relative paths use the start time of a granule to layout data in a YEAR/DAY-OF-YEAR path * -dymd - optional, relative paths use the start time of a granule to layout data in a YEAR/MONTH/DAY path +### Subscriber behavior when a file already exists + +By default, when the subscriber is about to download a file, it first: +- Checks if the file already exists in the target location +- Creates a checksum for the file and sees if it matches the checksum for that file in CMR + +If the file already exists AND the checksum matches, the subscriber will skip downloading that file. + +This can drastically reduce the time for the subscriber to complete. Also, since the checksum is verified, files will still be re-downloaded if for some reason the file has changed (or the file already on disk is corrupted). + +You can override this default behavior - forcing the subscriber to always download matching files, by using --force/-f. + +``` +podaac-data-subscriber -c SENTINEL-1A_SLC -d myData -f +``` + ### Running as a Cron job To automatically run and update a local file system with data files from a collection, one can use a syntax like the following: diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index f5d9399..3121b89 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -13,6 +13,7 @@ import subprocess from urllib.parse import urlencode from urllib.request import Request, urlopen +import hashlib import requests @@ -348,3 +349,84 @@ def parse_cycles(results): 'Specify an output directory or ' 'choose another output directory flag other than -dc.') # noqa E501 return cycles + + + +def extract_checksums(granule_results): + """ + Create a dictionary containing checksum information from files. + + Parameters + ---------- + granule_results : dict + The cmr granule search results (umm_json format) + + Returns + ------- + A dictionary where the keys are filenames and the values are + checksum information (checksum value and checksum algorithm). + + For Example: + { + "some-granule-name.nc": { + "Value": "d96387295ea979fb8f7b9aa5f231c4ab", + "Algorithm": "MD5" + }, + "some-granule-name.nc.md5": { + "Value": '320876f087da0876edc0876ab0876b7a", + "Algorithm": "MD5" + }, + ... 
+ } + """ + checksums = {} + for granule in granule_results["items"]: + try: + items = granule["umm"]["DataGranule"]["ArchiveAndDistributionInformation"] + for item in items: + try: + checksums[item["Name"]] = item["Checksum"] + except: + pass + except: + pass + return checksums + + +def checksum_does_match(file_path, checksums): + """ + Checks if a file's checksum matches a checksum in the checksums dict + + Parameters + ---------- + file_path : string + The relative or absolute path to an existing file + + checksums: dict + A dictionary where keys are filenames (not including the path) + and values are checksum information (checksum value and checksum algorithm) + + Returns + ------- + True - if the file's checksum matches a checksum in the checksum dict + False - if the file doesn't have a checksum, or if the checksum doesn't match + """ + filename = basename(file_path) + checksum = checksums.get(filename) + if not checksum: + return False + return make_checksum(file_path, checksum["Algorithm"]) == checksum["Value"] + + +def make_checksum(file_path, algorithm): + """ + Create checksum of file using the specified algorithm + """ + # Based on https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file#answer-3431838 + # with modification to handle multiple algorithms + hash = getattr(hashlib, algorithm.lower())() + + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + hash.update(chunk) + return hash.hexdigest() diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index f2c1420..739a302 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -5,7 +5,7 @@ import sys from datetime import datetime, timedelta from os import makedirs -from os.path import isdir, basename, join +from os.path import isdir, basename, join, exists from urllib.error import HTTPError from urllib.request import urlretrieve @@ -66,7 +66,9 @@ def create_parser(): help="The ISO date time before which data should be retrieved. For Example, --start-date 2021-01-14T00:00:00Z") # noqa E501 parser.add_argument("-ed", "--end-date", required=False, dest="endDate", help="The ISO date time after which data should be retrieved. For Example, --end-date 2021-01-14T00:00:00Z") # noqa E501 + # Adding optional arguments + parser.add_argument("-f", "--force", dest="force", action="store_true", help = "Flag to force downloading files that are listed in CMR query, even if the file exists and checksum matches") # noqa E501 # spatiotemporal arguments parser.add_argument("-b", "--bounds", dest="bbox", @@ -216,6 +218,7 @@ def run(): results['items']] downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in results['items']] + checksums = pa.extract_checksums(results) for f in downloads_data: downloads_all.append(f) @@ -249,7 +252,7 @@ def run(): # NEED TO REFACTOR THIS, A LOT OF STUFF in here # Finish by downloading the files to the data directory in a loop. # Overwrite `.update` with a new timestamp on success. - success_cnt = failure_cnt = 0 + success_cnt = failure_cnt = skip_cnt = 0 for f in downloads: try: # -d flag, args.outputDirectory @@ -262,6 +265,13 @@ def run(): if args.cycle: output_path = pa.prepare_cycles_output( cycles, data_path, f) + + # decide if we should actually download this file (e.g. 
we may already have the latest version) + if(exists(output_path) and not args.force and pa.checksum_does_match(output_path, checksums)): + logging.info(str(datetime.now()) + " SKIPPED: " + f) + skip_cnt += 1 + continue + urlretrieve(f, output_path) pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) @@ -270,10 +280,11 @@ def run(): logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True) failure_cnt = failure_cnt + 1 - logging.info("Downloaded: " + str(success_cnt) + " files\n") - logging.info("Files Failed to download:" + str(failure_cnt) + "\n") + logging.info("Downloaded Files: " + str(success_cnt)) + logging.info("Failed Files: " + str(failure_cnt)) + logging.info("Skipped Files: " + str(skip_cnt)) pa.delete_token(token_url, token) - logging.info("END \n\n") + logging.info("END\n\n") exit(0) diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index ad3549e..7e44816 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -21,7 +21,6 @@ from os.path import isdir, basename, join, isfile, exists from urllib.error import HTTPError from urllib.request import urlretrieve -import hashlib from subscriber import podaac_access as pa @@ -108,85 +107,6 @@ def create_parser(): return parser -def extract_checksums(granule_results): - """ - Create a dictionary containing checksum information from files. - - Parameters - ---------- - granule_results : dict - The cmr granule search results (umm_json format) - - Returns - ------- - A dictionary where the keys are filenames and the values are - checksum information (checksum value and checksum algorithm). - - For Example: - { - "some-granule-name.nc": { - "Value": "d96387295ea979fb8f7b9aa5f231c4ab", - "Algorithm": "MD5" - }, - "some-granule-name.nc.md5": { - "Value": '320876f087da0876edc0876ab0876b7a", - "Algorithm": "MD5" - }, - ... 
- } - """ - checksums = {} - for granule in granule_results["items"]: - try: - items = granule["umm"]["DataGranule"]["ArchiveAndDistributionInformation"] - for item in items: - try: - checksums[item["Name"]] = item["Checksum"] - except: - pass - except: - pass - return checksums - - -def checksum_does_match(file_path, checksums): - """ - Checks if a file's checksum matches a checksum in the checksums dict - - Parameters - ---------- - file_path : string - The relative or absolute path to an existing file - - checksums: dict - A dictionary where keys are filenames (not including the path) - and values are checksum information (checksum value and checksum algorithm) - - Returns - ------- - True - if the file's checksum matches a checksum in the checksum dict - False - if the file doesn't have a checksum, or if the checksum doesn't match - """ - filename = basename(file_path) - checksum = checksums.get(filename) - if not checksum: - return False - return make_checksum(file_path, checksum["Algorithm"]) == checksum["Value"] - - -def make_checksum(file_path, algorithm): - """ - Create checksum of file using the specified algorithm - """ - # Based on https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file#answer-3431838 - # with modification to handle multiple algorithms - hash = getattr(hashlib, algorithm.lower())() - - with open(file_path, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - hash.update(chunk) - return hash.hexdigest() - def run(): parser = create_parser() @@ -323,13 +243,12 @@ def run(): timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") downloads_all = [] - downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in results['items']] downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in results['items']] - checksums = extract_checksums(results) + checksums = pa.extract_checksums(results) for f in downloads_data: downloads_all.append(f) @@ -378,7 +297,7 @@ def run(): cycles, data_path, f) # decide if we should actually download this file (e.g. 
we may already have the latest version) - if(exists(output_path) and not args.force and checksum_does_match(output_path, checksums)): + if(exists(output_path) and not args.force and pa.checksum_does_match(output_path, checksums)): logging.info(str(datetime.now()) + " SKIPPED: " + f) skip_cnt += 1 continue diff --git a/tests/test_subscriber_extracting_checksums.py b/tests/test_subscriber_extracting_checksums.py index 2aa6a44..87a5f00 100644 --- a/tests/test_subscriber_extracting_checksums.py +++ b/tests/test_subscriber_extracting_checksums.py @@ -1,5 +1,5 @@ -from subscriber.podaac_data_subscriber import extract_checksums import json +from subscriber.podaac_access import extract_checksums minimal_granule_search_results = """{ "hits": 13, diff --git a/tests/test_subscriber_matching_checksums.py b/tests/test_subscriber_matching_checksums.py index cc19f78..cd67a80 100644 --- a/tests/test_subscriber_matching_checksums.py +++ b/tests/test_subscriber_matching_checksums.py @@ -1,4 +1,4 @@ -from subscriber.podaac_data_subscriber import checksum_does_match +from subscriber.podaac_access import checksum_does_match def test_checksum_does_match__positive_match_md5(tmpdir): output_path = str(tmpdir) + '/tmp.nc' From e35c01284cfe9a986e5596f65fc90261e8370c31 Mon Sep 17 00:00:00 2001 From: mike-gangl <59702631+mike-gangl@users.noreply.github.com> Date: Wed, 27 Apr 2022 16:59:56 -0700 Subject: [PATCH 15/22] Programmatic Regression Testing (#66) * added programmatice regression testing. currently relies on a valid .netrc file, refactoring might be needed to manually add a user/password to the CMR/TEA downloads * Update python-app.yml --- .github/workflows/python-app.yml | 2 +- pyproject.toml | 4 ++ subscriber/podaac_data_downloader.py | 10 ++-- subscriber/podaac_data_subscriber.py | 10 ++-- tests/MANUAL.md | 8 ++-- tests/test_downloader_regression.py | 35 ++++++++++++++ tests/test_subscriber_regression.py | 71 ++++++++++++++++++++++++++++ 7 files changed, 127 insertions(+), 13 deletions(-) create mode 100644 tests/test_downloader_regression.py create mode 100644 tests/test_subscriber_regression.py diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index c8a5f11..d30e4c3 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -33,7 +33,7 @@ jobs: flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - pytest + pytest -m "not regression" - name: netrc-gen uses: extractions/netrc@v1 with: diff --git a/pyproject.toml b/pyproject.toml index 374b58c..aed08bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,7 @@ requires = [ "wheel" ] build-backend = "setuptools.build_meta" +[tool.pytest.ini_options] +markers = [ + "regression: marks a test as a regression, requires netrc file (deselect with '-m \"not regresion\"')" +] diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 739a302..abe69fa 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -108,9 +108,10 @@ def create_parser(): return parser -def run(): - parser = create_parser() - args = parser.parse_args() +def run(args=None): + if args is None: + parser = create_parser() + args = parser.parse_args() try: pa.validate(args) @@ -285,7 +286,8 @@ def run(): logging.info("Skipped Files: " + str(skip_cnt)) pa.delete_token(token_url, token) logging.info("END\n\n") - exit(0) + + def main(): diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 7e44816..66d79cc 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -108,9 +108,11 @@ def create_parser(): -def run(): - parser = create_parser() - args = parser.parse_args() +def run(args=None): + if args is None: + parser = create_parser() + args = parser.parse_args() + try: pa.validate(args) @@ -324,7 +326,7 @@ def run(): logging.info("Skipped Files: " + str(skip_cnt)) pa.delete_token(token_url, token) logging.info("END\n\n") - exit(0) + #exit(0) def main(): diff --git a/tests/MANUAL.md b/tests/MANUAL.md index 52a1e1e..6fe9608 100644 --- a/tests/MANUAL.md +++ b/tests/MANUAL.md @@ -3,7 +3,7 @@ ## Subscriber -### Test 1 +### Test 1 - added to test_regression.py use to test: * download to `this` directory. * download using only 'enddate' @@ -29,7 +29,7 @@ ls -rth .update__ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4 .update__ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4 ``` -### Test 2 +### Test 2 - added to regression test use to test: * cycle based directory layouts * Bounding box limiting search results @@ -54,7 +54,7 @@ JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/ ``` -### Test 3 +### Test 3 -- added to regression, but not the .update file log message portion use to test: * offset Usage * start/end date is working @@ -137,7 +137,7 @@ MUR25-JPL-L4-GLOB-v04.2/ 4 directories, 2 files ``` - +### Test 1 Download by cycle ``` rm -r JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py new file mode 100644 index 0000000..a97941a --- /dev/null +++ b/tests/test_downloader_regression.py @@ -0,0 +1,35 @@ +import pytest +import os +from os.path import exists +from subscriber import podaac_data_downloader as pdd +import shutil + +# REGRESSION TEST CURRENTLY REQUIRES A .NETRC file for CMR/Data Download + +def create_downloader_args(args): + parser = pdd.create_parser() + args2 = parser.parse_args(args) + return args2 + +#Test the downlaoder on MUR25 data for start/stop/, yyyy/mmm/dd dir structure, +# and offset. Running it a second time to ensure it downlaods the files again- +# the downloader doesn't care about updates. 
+@pytest.mark.regression +def test_downloader_MUR(): + shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split()) + pdd.run(args2) + assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + t2 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + + # this part of the test should not re-download the files unless the --force + # option is used. Currently that's not implemented in Downloader, so we'll + # have to update this when that is implemented (that is, the t1/t2 should + # be equal to the gettime of the file) + pdd.run(args2) + assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + + shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') diff --git a/tests/test_subscriber_regression.py b/tests/test_subscriber_regression.py new file mode 100644 index 0000000..07dc2f8 --- /dev/null +++ b/tests/test_subscriber_regression.py @@ -0,0 +1,71 @@ +import pytest +import os +from os.path import exists +from subscriber import podaac_data_subscriber as pds +from subscriber import podaac_data_downloader as pdd +import shutil + +# REGRESSION TEST CURRENTLY REQUIRES A .NETRC file for CMR/Data Download +# +def create_args(args): + parser = pds.create_parser() + args2 = parser.parse_args(args) + return args2 + +# Test to download ECCO data by start/stop date and put it in the year/doy dir +# structure. 
+@pytest.mark.regression +def test_subscriber_ecco_only_enddate(): + args2 = create_args('-c ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4 -ed 1992-01-03T00:00:00Z -d ./ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4 -dydoy'.split()) + pds.run(args2) + assert exists('./ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4/1992/001/ATM_SURFACE_TEMP_HUM_WIND_PRES_day_mean_1992-01-01_ECCO_V4r4_latlon_0p50deg.nc') + assert exists('./ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4/1992/002/ATM_SURFACE_TEMP_HUM_WIND_PRES_day_mean_1992-01-02_ECCO_V4r4_latlon_0p50deg.nc') + assert exists('./ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4/1992/003/ATM_SURFACE_TEMP_HUM_WIND_PRES_day_mean_1992-01-03_ECCO_V4r4_latlon_0p50deg.nc') + shutil.rmtree('./ECCO_L4_ATM_STATE_05DEG_DAILY_V4R4') + +# test to download S6 data by start/stop time, and bbox, and put it in the +# cycle based directory structure +@pytest.mark.regression +def test_subscriber_cycle_bbox(): + args2 = create_args('-c JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F -d ./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F -dc -sd 2022-01-01T00:00:00Z -ed 2022-01-02T00:00:00Z -b=-20,-20,20,20'.split()) + pds.run(args2) + assert exists('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/c0042/S6A_P4_2__LR_STD__NR_042_071_20211231T232728_20220101T012144_F04.nc') + assert exists('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/c0042/S6A_P4_2__LR_STD__NR_042_082_20220101T090557_20220101T104242_F04.nc') + assert exists('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/c0042/S6A_P4_2__LR_STD__NR_042_083_20220101T104242_20220101T123506_F04.nc') + assert exists('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/c0042/S6A_P4_2__LR_STD__NR_042_095_20220101T215702_20220101T234905_F04.nc') + assert exists('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/c0042/S6A_P4_2__LR_STD__NR_042_097_20220101T234905_20220102T014431_F04.nc') + assert exists('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F/.update__JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F') + shutil.rmtree('./JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F') + +# Test to download MUR25 data by start/stop, put it in yyyy/mm/dd dir structure, +# using the offset so it aligns with the right day in the filename. 
+# +# Test will run it again, to ensure that the files are not re-downlaoded, that +# is, they have the same modified time before/after the second run +@pytest.mark.regression +def test_subscriber_MUR_update_file_no_redownload(): + try: + os.remove('MUR25-JPL-L4-GLOB-v04.2/.update') + except OSError as e: + print("Expecting this...") + try: + os.remove('MUR25-JPL-L4-GLOB-v04.2/..update__MUR25-JPL-L4-GLOB-v04.2') + except OSError as e: + print("Expecting this...") + + args2 = create_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split()) + pds.run(args2) + assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert exists('./MUR25-JPL-L4-GLOB-v04.2/.update__MUR25-JPL-L4-GLOB-v04.2') + t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + t2 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + + # Compare another run to existing times to ensure it didn't redownload the file + pds.run(args2) + assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert exists('./MUR25-JPL-L4-GLOB-v04.2/.update__MUR25-JPL-L4-GLOB-v04.2') + assert t1 == os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert t2 == os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') From 5c90c8a4f66bfcc84cbde1864aa9f8f9135f435e Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 28 Apr 2022 09:29:40 -0700 Subject: [PATCH 16/22] updated regression tests, readied 1.9.0 version --- CHANGELOG.md | 4 +++- setup.py | 2 +- subscriber/podaac_access.py | 2 +- tests/test_downloader_regression.py | 14 ++++++++++---- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a8366c..294b444 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) -## [Unreleased] + +## [1.9.0] ### Added - check if file exists before downloading a file. 
[17](https://github.com/podaac/data-subscriber/issues/17) +- added automated regression testing ### Changed - Implemented Search After CMR interface to allow granule listings > 2000 [15](https://github.com/podaac/data-subscriber/issues/15) - Retry CMR queries on server error using random exponential backoff max 60 seconds and 10 retries diff --git a/setup.py b/setup.py index 25adf1b..40ae947 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ long_description = fh.read() setup(name='podaac-data-subscriber', - version='1.8.0', + version='1.9.0', description='PO.DAAC Data Susbcriber Command Line Tool', url='https://github.com/podaac/data-subscriber', long_description=long_description, diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 3121b89..b4711df 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -21,7 +21,7 @@ import tenacity from datetime import datetime -__version__ = "1.8.0" +__version__ = "1.9.0" extensions = [".nc", ".h5", ".zip", ".tar.gz"] edl = "urs.earthdata.nasa.gov" cmr = "cmr.earthdata.nasa.gov" diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py index a97941a..17dad33 100644 --- a/tests/test_downloader_regression.py +++ b/tests/test_downloader_regression.py @@ -3,6 +3,7 @@ from os.path import exists from subscriber import podaac_data_downloader as pdd import shutil +from pathlib import Path # REGRESSION TEST CURRENTLY REQUIRES A .NETRC file for CMR/Data Download @@ -25,11 +26,16 @@ def test_downloader_MUR(): t2 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') # this part of the test should not re-download the files unless the --force - # option is used. Currently that's not implemented in Downloader, so we'll - # have to update this when that is implemented (that is, the t1/t2 should - # be equal to the gettime of the file) + # option is used. 
+ pdd.run(args2) + assert t1 == os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert t2 == os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + + # Update a file to change the checksum, then re-download + os.remove('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + Path('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc').touch() pdd.run(args2) assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') - assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert t2 == os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') From 2edda7bfe629f0a87ea9f73ed2b64a244ec81794 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 28 Apr 2022 09:32:24 -0700 Subject: [PATCH 17/22] added -f option test to downloader regression --- tests/test_downloader_regression.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py index 17dad33..852cbf3 100644 --- a/tests/test_downloader_regression.py +++ b/tests/test_downloader_regression.py @@ -38,4 +38,12 @@ def test_downloader_MUR(): assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert t2 == os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + + # Set the args to --force to re-download those data + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f'.split()) + pdd.run(args2) + assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') + shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') From 54b39629307896d54a28a6b0d2bd07ff76d1b688 Mon Sep 17 00:00:00 2001 From: mike-gangl <59702631+mike-gangl@users.noreply.github.com> Date: Thu, 28 Apr 2022 09:34:25 -0700 Subject: [PATCH 18/22] Update python-app.yml --- .github/workflows/python-app.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index d30e4c3..c02122f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -7,7 +7,7 @@ on: push: branches: [ main, develop ] pull_request: - branches: [ main ] + branches: [ main, develop ] jobs: build: @@ -40,3 +40,6 @@ jobs: machine: urs.earthdata.nasa.gov username: ${{ secrets.EDL_OPS_USERNAME }} password: ${{ secrets.EDL_OPS_PASSWORD }} + - name: Regression Test with pytest + run: | + pytest -m "regression" From c52b2dc27c56906a2ed6b9ed94868fe9b026af2f Mon Sep 17 00:00:00 2001 From: Frank Greguska <89428916+frankinspace@users.noreply.github.com> Date: Thu, 19 May 2022 08:25:16 -0700 Subject: 
[PATCH 19/22] issues/72: Fix package structure and switch to using poetry for the build system. (#73) * issues/72: Fix package structure and switch to using poetry for the build system. * issues/72: Fix package structure and switch to using poetry for the build system. * issues/72: Fix package structure and switch to using poetry for the build system. * issues/72: Fix package structure and switch to using poetry for the build system. * issues/72: Fix package structure and switch to using poetry for the build system. * issues/72: Fix package structure and switch to using poetry for the build system. Co-authored-by: Frank Greguska --- .github/workflows/python-app.yml | 30 +- .github/workflows/release.yml | 14 +- .gitignore | 4 +- BUILD.md | 41 ++- CHANGELOG.md | 4 +- README.md | 11 +- dev-requirements.txt | 1 - poetry.lock | 301 ++++++++++++++++++++ pyproject.toml | 34 ++- requirements.txt | 6 - setup.py | 21 -- subscriber/podaac_access.py | 14 +- tests/test_subscriber_matching_checksums.py | 87 +++--- 13 files changed, 450 insertions(+), 118 deletions(-) delete mode 100644 dev-requirements.txt create mode 100644 poetry.lock delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index c02122f..f56086e 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -11,29 +11,35 @@ on: jobs: build: - - runs-on: ubuntu-latest - + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10" ] + poetry-version: [ "1.1" ] + os: [ ubuntu-18.04, macos-latest, windows-latest ] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - - name: Set up Python 3.9 + - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: ${{ matrix.python-version }} + - name: Install Poetry + uses: abatilo/actions-poetry@v2.0.0 + with: + poetry-version: ${{ matrix.poetry-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + poetry install - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + poetry run flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - pytest -m "not regression" + poetry run pytest -m "not regression" - name: netrc-gen uses: extractions/netrc@v1 with: @@ -42,4 +48,4 @@ jobs: password: ${{ secrets.EDL_OPS_PASSWORD }} - name: Regression Test with pytest run: | - pytest -m "regression" + poetry run pytest -m "regression" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ff0d98f..62d0631 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -12,21 +12,21 @@ jobs: runs-on: ubuntu-latest steps: - - - uses: actions/checkout@v2 - name: Set up Python 3.9 uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: 3.10 + - name: Install Poetry + uses: abatilo/actions-poetry@v2.0.0 + with: + poetry-version: 1.1 - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install --upgrade build - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + poetry install - name: build run: | - python -m build + poetry build - name: Publish a Python distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.gitignore b/.gitignore index 95c1177..6597826 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ build/ dist/ podaac_data_subscriber.egg-info/ .idea/ -venv \ No newline at end of file +venv +.pytest_cache +MUR25-JPL-L4-GLOB-v04.2 \ No newline at end of file diff --git a/BUILD.md b/BUILD.md index 10ba345..e68bc08 100644 --- a/BUILD.md +++ b/BUILD.md @@ -1,21 +1,46 @@ -# Manually building +If you are interested in contributing to this project, you will need to be able to manually build it. -## Make sure we have all the build Dependencies +However, if you just want to download and use this project, please refer back to the [Installation](./README.md#installation) instructions in the README file. + +# Manually building (for development) + +This project is built with [Poetry](https://python-poetry.org/). +In order to build it, please follow the [installation instructions](https://python-poetry.org/docs/#installation) for poetry first. +If you are unfamiliar with poetry as a build tool, it is recommended to review the [Basic Usage](https://python-poetry.org/docs/basic-usage/) documentation before continuing. + +## Installing ``` -python3 -m pip install --upgrade build -python3 -m pip install --upgrade pip -python3 -m pip install --upgrade twine +poetry install ``` +## Running tests +All tests +``` +poetry run pytest +``` + +Exclude regression tests +``` +poetry run pytest -m "not regression" +``` + +Only regression tests +``` +poetry run pytest -m "regression" +``` + + ## Clean, build, and upload ``` rm -r dist -python3 -m build +poetry install +poetry build # pypi test upload -# python3 -m twine upload --repository testpypi dist/* +# poetry config repositories.testpypi https://test.pypi.org/legacy/ +# poetry publish -r testpypi #pypi upload -python3 -m twine upload dist/* +poetry publish ``` ## Install a specific version diff --git a/CHANGELOG.md b/CHANGELOG.md index 294b444..3703636 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,9 @@ All notable changes to this project will be documented in this file. 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
-
+## [Unreleased]
+### Changed
+- Switched to [poetry](https://python-poetry.org/) as the build tool for the project
 ## [1.9.0]
 ### Added
diff --git a/README.md b/README.md
index ab0a35e..853fb92 100644
--- a/README.md
+++ b/README.md
@@ -23,17 +23,12 @@ The Downloader is useful if you need to download PO.DAAC data once in a while or
 The subscriber is useful for users who need to continuously pull the latest data from the PO.DAAC archive. If you feed data into a model or real time process, the subscriber allows you to repeatedly run the script and only download the latest data.
-## Dependencies
-
-Aside from **python 3**, the only dependency is the python 'requests' module, which can be installed via pip. Pip is the `package installer for python`. you don't need to know much of anything about python or pip, as long as you have it installed on the machine you're using.
-
-```
-python -m pip install requests
-```
 ## Installation
-The subscriber and downloader scripes are available in the [pypi python repository](https://pypi.org/project/podaac-data-subscriber/), it can be installed via pip:
+Both the subscriber and the downloader require Python >= 3.8.
+
+The subscriber and downloader scripts are available in the [pypi python repository](https://pypi.org/project/podaac-data-subscriber/) and can be installed via pip:
 ```
 pip install podaac-data-subscriber
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index 49435c9..0000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pytest==7.1.1
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000..4ef628e
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,301 @@
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+description = "Atomic file writes."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "attrs"
+version = "21.4.0"
+description = "Classes Without Boilerplate"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.extras]
+dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"]
+docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
+tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"]
+tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"]
+
+[[package]]
+name = "certifi"
+version = "2022.5.18"
+description = "Python package for providing Mozilla's CA Bundle."
+category = "main"
+optional = false
+python-versions = ">=3.5"
+
+[[package]]
+name = "charset-normalizer"
+version = "2.0.12"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+category = "main"
+optional = false
+python-versions = ">=3.5.0"
+
+[package.extras]
+unicode_backport = ["unicodedata2"]
+
+[[package]]
+name = "colorama"
+version = "0.4.4"
+description = "Cross-platform colored terminal text."
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "flake8" +version = "4.0.1" +description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +mccabe = ">=0.6.0,<0.7.0" +pycodestyle = ">=2.8.0,<2.9.0" +pyflakes = ">=2.4.0,<2.5.0" + +[[package]] +name = "idna" +version = "3.3" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "mccabe" +version = "0.6.1" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pycodestyle" +version = "2.8.0" +description = "Python style guide checker" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pyflakes" +version = "2.4.0" +description = "passive checker of Python programs" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["railroad-diagrams", "jinja2"] + +[[package]] +name = "pytest" +version = "7.1.2" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +tomli = ">=1.0.0" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + +[[package]] +name = "requests" +version = "2.27.1" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} +idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] + +[[package]] +name = "tenacity" +version = "8.0.1" +description = "Retry code until it succeeds" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +doc = ["reno", "sphinx", "tornado (>=4.5)"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "urllib3" +version = "1.26.9" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.8" +content-hash = "5458e3368f3db9e1610832e6cbe1bf58f1a6238697eed761401c685ff5a2cf74" + +[metadata.files] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +certifi = [ + {file = "certifi-2022.5.18-py3-none-any.whl", hash = "sha256:8d15a5a7fde18536a249c49e07e8e462b8fc13de21b3c80e8a68315dfa227c99"}, + {file = "certifi-2022.5.18.tar.gz", hash = "sha256:6ae10321df3e464305a46e997da41ea56c1d311fb9ff1dd4e04d6f14653ec63a"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, + {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, +] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] +flake8 = [ + {file = "flake8-4.0.1-py2.py3-none-any.whl", hash = "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d"}, + {file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"}, +] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = 
"iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +pycodestyle = [ + {file = "pycodestyle-2.8.0-py2.py3-none-any.whl", hash = "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20"}, + {file = "pycodestyle-2.8.0.tar.gz", hash = "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"}, +] +pyflakes = [ + {file = "pyflakes-2.4.0-py2.py3-none-any.whl", hash = "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"}, + {file = "pyflakes-2.4.0.tar.gz", hash = "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c"}, +] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pytest = [ + {file = "pytest-7.1.2-py3-none-any.whl", hash = "sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c"}, + {file = "pytest-7.1.2.tar.gz", hash = "sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45"}, +] +requests = [ + {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, + {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, +] +tenacity = [ + {file = "tenacity-8.0.1-py3-none-any.whl", hash = "sha256:f78f4ea81b0fabc06728c11dc2a8c01277bfc5181b321a4770471902e3eb844a"}, + {file = "tenacity-8.0.1.tar.gz", hash = "sha256:43242a20e3e73291a28bcbcacfd6e000b02d3857a9a9fff56b297a27afdc932f"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] +urllib3 = [ + {file = "urllib3-1.26.9-py2.py3-none-any.whl", hash = "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14"}, + {file = "urllib3-1.26.9.tar.gz", hash = "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"}, +] diff --git a/pyproject.toml b/pyproject.toml index aed08bf..19009bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,34 @@ -[build-system] -requires = [ - "setuptools>=42", - "wheel" +[tool.poetry] +name = "podaac-data-subscriber" +version = "1.9.0" +description = "PO.DAAC 
Data Subscriber Command Line Tool" +authors = ["PO.DAAC "] +readme = "README.md" +license = "Apache-2.0" +repository = "https://github.com/podaac/data-subscriber" +exclude = ['cmr', 'img', 'tests'] +packages = [ + { include = "subscriber" }, ] -build-backend = "setuptools.build_meta" + +[tool.poetry.dependencies] +python = "^3.8" +requests = "^2.27.1" +tenacity = "^8.0.1" + +[tool.poetry.dev-dependencies] +pytest = "^7.1.2" +flake8 = "^4.0.1" + +[tool.poetry.scripts] +podaac-data-subscriber = 'subscriber.podaac_data_subscriber:main' +podaac-data-downloader = 'subscriber.podaac_data_downloader:main' + [tool.pytest.ini_options] markers = [ "regression: marks a test as a regression, requires netrc file (deselect with '-m \"not regresion\"')" ] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9319567..0000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -certifi==2020.12.5 -chardet==4.0.0 -idna==2.10 -requests==2.25.1 -urllib3>=1.26.5 -tenacity>=8.0.1 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 40ae947..0000000 --- a/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -from setuptools import setup - -with open("README.md", "r", encoding="utf-8") as fh: - long_description = fh.read() - -setup(name='podaac-data-subscriber', - version='1.9.0', - description='PO.DAAC Data Susbcriber Command Line Tool', - url='https://github.com/podaac/data-subscriber', - long_description=long_description, - long_description_content_type='text/markdown', - author='PO.DAAC', - author_email='podaac@podaac.jpl.nasa.gov', - license='apache-2', - packages=['subscriber'], - entry_points=''' - [console_scripts] - podaac-data-subscriber=subscriber.podaac_data_subscriber:main - podaac-data-downloader=subscriber.podaac_data_downloader:main - ''', - zip_safe=False) diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index b4711df..1533257 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -351,7 +351,6 @@ def parse_cycles(results): return cycles - def extract_checksums(granule_results): """ Create a dictionary containing checksum information from files. 
@@ -415,7 +414,12 @@ def checksum_does_match(file_path, checksums): checksum = checksums.get(filename) if not checksum: return False - return make_checksum(file_path, checksum["Algorithm"]) == checksum["Value"] + + computed_checksum = make_checksum(file_path, checksum["Algorithm"]) + checksums_match = computed_checksum == checksum["Value"] + if not checksums_match: + logging.warning(f'Computed checksum {computed_checksum} does not match expected checksum {checksum["Value"]}') + return checksums_match def make_checksum(file_path, algorithm): @@ -424,9 +428,9 @@ def make_checksum(file_path, algorithm): """ # Based on https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file#answer-3431838 # with modification to handle multiple algorithms - hash = getattr(hashlib, algorithm.lower())() + hash_alg = getattr(hashlib, algorithm.lower())() with open(file_path, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): - hash.update(chunk) - return hash.hexdigest() + hash_alg.update(chunk) + return hash_alg.hexdigest() diff --git a/tests/test_subscriber_matching_checksums.py b/tests/test_subscriber_matching_checksums.py index cd67a80..65b7bd5 100644 --- a/tests/test_subscriber_matching_checksums.py +++ b/tests/test_subscriber_matching_checksums.py @@ -1,72 +1,73 @@ from subscriber.podaac_access import checksum_does_match + def test_checksum_does_match__positive_match_md5(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "f83f9ad1718d9b95220ddd6b18dbcecf", - "Algorithm": "MD5" + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "28d864459bb7628af122ee854439d143", + "Algorithm": "MD5" + } } - } - with open(output_path, 'w') as f: - f.write("This is a temporary test file\n") + with open(output_path, 'w') as f: + f.write("This is a temporary test file") - assert checksum_does_match(output_path, checksums) + assert checksum_does_match(output_path, checksums) def test_checksum_does_match__negative_match_md5(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "f83f9ad1718d9b95220ddd6b18dbcecf", - "Algorithm": "MD5" + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "28d864459bb7628af122ee854439d143", + "Algorithm": "MD5" + } } - } - with open(output_path, 'w') as f: - f.write("This is a different temporary test file\n") + with open(output_path, 'w') as f: + f.write("This is a different temporary test file") - assert not checksum_does_match(output_path, checksums) + assert not checksum_does_match(output_path, checksums) def test_checksum_does_match__positive_match_sha512(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "3f5bda96115a5d8fcbcbd71bc28ade2de24bba5f48ce485012f933c877d279d78be3ad028f69af620325a010ce34bd19be78c8b6bf083b0d523165ede8669483", - "Algorithm": "SHA512" + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "439de7997fe599d7af6d108534cae418ac95f70f614e3c2fda7a26b03e599211ffbfc85eede5dd933aa7a3c5cfe87d6b3de30ab2d9b4fd45162a5e22b71fffe8", + "Algorithm": "SHA512" + } } - } - with open(output_path, 'w') as f: - f.write("This is a temporary test file\n") + with open(output_path, 'w') as f: + f.write("This is a temporary test file") - assert checksum_does_match(output_path, checksums) + assert checksum_does_match(output_path, checksums) def test_checksum_does_match__negative_match_sha512(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - 
"Value": "3f5bda96115a5d8fcbcbd71bc28ade2de24bba5f48ce485012f933c877d279d78be3ad028f69af620325a010ce34bd19be78c8b6bf083b0d523165ede8669483", - "Algorithm": "SHA512" + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": { + "Value": "439de7997fe599d7af6d108534cae418ac95f70f614e3c2fda7a26b03e599211ffbfc85eede5dd933aa7a3c5cfe87d6b3de30ab2d9b4fd45162a5e22b71fffe8", + "Algorithm": "SHA512" + } } - } - with open(output_path, 'w') as f: - f.write("This is a different temporary test file\n") + with open(output_path, 'w') as f: + f.write("This is a different temporary test file") - assert not checksum_does_match(output_path, checksums) + assert not checksum_does_match(output_path, checksums) def test_checksum_does_match__with_no_checksum(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": None - } + output_path = str(tmpdir) + '/tmp.nc' + checksums = { + "tmp.nc": None + } - with open(output_path, 'w') as f: - f.write("This is a temporary test file\n") + with open(output_path, 'w') as f: + f.write("This is a temporary test file\n") - assert not checksum_does_match(output_path, checksums) \ No newline at end of file + assert not checksum_does_match(output_path, checksums) From a04420149a3e199cee570d8e4cf92f19799e937f Mon Sep 17 00:00:00 2001 From: mike-gangl <59702631+mike-gangl@users.noreply.github.com> Date: Thu, 19 May 2022 08:35:16 -0700 Subject: [PATCH 20/22] Release 1.9.1 (#74) * updated release to 1.9.1 * Update podaac_access.py * Update CHANGELOG.md --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- subscriber/podaac_access.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3703636..13a238d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) -## [Unreleased] +## [1.9.1] ### Changed - Switched to [poetry](https://python-poetry.org/) as the build tool for the project diff --git a/pyproject.toml b/pyproject.toml index 19009bf..ac86840 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "podaac-data-subscriber" -version = "1.9.0" +version = "1.9.1" description = "PO.DAAC Data Subscriber Command Line Tool" authors = ["PO.DAAC "] readme = "README.md" diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 1533257..7379122 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -21,7 +21,7 @@ import tenacity from datetime import datetime -__version__ = "1.9.0" +__version__ = "1.9.1" extensions = [".nc", ".h5", ".zip", ".tar.gz"] edl = "urs.earthdata.nasa.gov" cmr = "cmr.earthdata.nasa.gov" From 3576e059aedacdfe814ce044179daf395ddc4d48 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 19 May 2022 08:48:59 -0700 Subject: [PATCH 21/22] leftovers from merge --- CHANGELOG.md | 4 ---- pyproject.toml | 8 -------- subscriber/podaac_access.py | 21 --------------------- 3 files changed, 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1ba5f..13a238d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Changed - Switched to [poetry](https://python-poetry.org/) as the build tool for the project -<<<<<<< HEAD -======= - ->>>>>>> main ## [1.9.0] ### Added - check if file exists before downloading a file. 
[17](https://github.com/podaac/data-subscriber/issues/17) diff --git a/pyproject.toml b/pyproject.toml index a96e48b..ac86840 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,15 +28,7 @@ podaac-data-downloader = 'subscriber.podaac_data_downloader:main' markers = [ "regression: marks a test as a regression, requires netrc file (deselect with '-m \"not regresion\"')" ] -<<<<<<< HEAD [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -======= -build-backend = "setuptools.build_meta" -[tool.pytest.ini_options] -markers = [ - "regression: marks a test as a regression, requires netrc file (deselect with '-m \"not regresion\"')" -] ->>>>>>> main diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 016495d..7379122 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -21,11 +21,7 @@ import tenacity from datetime import datetime -<<<<<<< HEAD __version__ = "1.9.1" -======= -__version__ = "1.9.0" ->>>>>>> main extensions = [".nc", ".h5", ".zip", ".tar.gz"] edl = "urs.earthdata.nasa.gov" cmr = "cmr.earthdata.nasa.gov" @@ -355,10 +351,6 @@ def parse_cycles(results): return cycles -<<<<<<< HEAD -======= - ->>>>>>> main def extract_checksums(granule_results): """ Create a dictionary containing checksum information from files. @@ -422,16 +414,12 @@ def checksum_does_match(file_path, checksums): checksum = checksums.get(filename) if not checksum: return False -<<<<<<< HEAD computed_checksum = make_checksum(file_path, checksum["Algorithm"]) checksums_match = computed_checksum == checksum["Value"] if not checksums_match: logging.warning(f'Computed checksum {computed_checksum} does not match expected checksum {checksum["Value"]}') return checksums_match -======= - return make_checksum(file_path, checksum["Algorithm"]) == checksum["Value"] ->>>>>>> main def make_checksum(file_path, algorithm): @@ -440,18 +428,9 @@ def make_checksum(file_path, algorithm): """ # Based on https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file#answer-3431838 # with modification to handle multiple algorithms -<<<<<<< HEAD hash_alg = getattr(hashlib, algorithm.lower())() with open(file_path, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): hash_alg.update(chunk) return hash_alg.hexdigest() -======= - hash = getattr(hashlib, algorithm.lower())() - - with open(file_path, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - hash.update(chunk) - return hash.hexdigest() ->>>>>>> main From 94a6fca27f8a1c27114e7509caba8e108b95ea13 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 19 May 2022 08:56:56 -0700 Subject: [PATCH 22/22] atom not remving the head/main merge issues correctly? cleaning up manually. 
--- tests/test_subscriber_matching_checksums.py | 74 --------------------- 1 file changed, 74 deletions(-) diff --git a/tests/test_subscriber_matching_checksums.py b/tests/test_subscriber_matching_checksums.py index 65524c4..1d40c79 100644 --- a/tests/test_subscriber_matching_checksums.py +++ b/tests/test_subscriber_matching_checksums.py @@ -1,7 +1,5 @@ from subscriber.podaac_access import checksum_does_match -<<<<<<< HEAD - def test_checksum_does_match__positive_match_md5(tmpdir): output_path = str(tmpdir) + '/tmp.nc' checksums = { @@ -72,75 +70,3 @@ def test_checksum_does_match__with_no_checksum(tmpdir): f.write("This is a temporary test file\n") assert not checksum_does_match(output_path, checksums) -======= -def test_checksum_does_match__positive_match_md5(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "f83f9ad1718d9b95220ddd6b18dbcecf", - "Algorithm": "MD5" - } - } - - with open(output_path, 'w') as f: - f.write("This is a temporary test file\n") - - assert checksum_does_match(output_path, checksums) - - -def test_checksum_does_match__negative_match_md5(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "f83f9ad1718d9b95220ddd6b18dbcecf", - "Algorithm": "MD5" - } - } - - with open(output_path, 'w') as f: - f.write("This is a different temporary test file\n") - - assert not checksum_does_match(output_path, checksums) - - -def test_checksum_does_match__positive_match_sha512(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "3f5bda96115a5d8fcbcbd71bc28ade2de24bba5f48ce485012f933c877d279d78be3ad028f69af620325a010ce34bd19be78c8b6bf083b0d523165ede8669483", - "Algorithm": "SHA512" - } - } - - with open(output_path, 'w') as f: - f.write("This is a temporary test file\n") - - assert checksum_does_match(output_path, checksums) - - -def test_checksum_does_match__negative_match_sha512(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": { - "Value": "3f5bda96115a5d8fcbcbd71bc28ade2de24bba5f48ce485012f933c877d279d78be3ad028f69af620325a010ce34bd19be78c8b6bf083b0d523165ede8669483", - "Algorithm": "SHA512" - } - } - - with open(output_path, 'w') as f: - f.write("This is a different temporary test file\n") - - assert not checksum_does_match(output_path, checksums) - - -def test_checksum_does_match__with_no_checksum(tmpdir): - output_path = str(tmpdir) + '/tmp.nc' - checksums = { - "tmp.nc": None - } - - with open(output_path, 'w') as f: - f.write("This is a temporary test file\n") - - assert not checksum_does_match(output_path, checksums) ->>>>>>> main
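
Patches 19 through 22 all revolve around the checksum verification in `subscriber/podaac_access.py` and its tests. As a self-contained illustration of that flow, the snippet below is a minimal sketch assembled from the `make_checksum`/`checksum_does_match` hunks shown above; the filename lookup (basename of the path) and the demo file name `tmp.nc` are simplifications for this example, not the packaged module's exact wiring.

```python
# Standalone sketch of the checksum verification exercised by the tests above.
# It mirrors the {"<filename>": {"Value": ..., "Algorithm": ...}} dictionary shape
# used in the diffs, but this is an illustration, not the packaged module.
import hashlib
import logging
from os.path import basename


def make_checksum(file_path, algorithm):
    """Hash a file in 4 KiB chunks with the named hashlib algorithm (e.g. MD5, SHA512)."""
    hash_alg = getattr(hashlib, algorithm.lower())()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_alg.update(chunk)
    return hash_alg.hexdigest()


def checksum_does_match(file_path, checksums):
    """Return True only when a checksum entry exists for the file and matches on disk."""
    checksum = checksums.get(basename(file_path))  # simplified filename lookup
    if not checksum:
        return False
    computed_checksum = make_checksum(file_path, checksum["Algorithm"])
    if computed_checksum != checksum["Value"]:
        logging.warning(f'Computed checksum {computed_checksum} does not match expected checksum {checksum["Value"]}')
    return computed_checksum == checksum["Value"]


if __name__ == "__main__":
    # Write a small file, record its MD5, then verify it round-trips.
    path = "tmp.nc"
    with open(path, "w") as f:
        f.write("This is a temporary test file")
    checksums = {"tmp.nc": {"Value": make_checksum(path, "MD5"), "Algorithm": "MD5"}}
    assert checksum_does_match(path, checksums)
```

Run directly, the sketch writes `tmp.nc`, records its MD5, and the final assertion passes silently when the computed and expected digests agree; a mismatch takes the `logging.warning` branch, which is the behavior the negative-match tests above rely on.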