From f86163a362a711f3da09902dee23b9778516682b Mon Sep 17 00:00:00 2001 From: zoewalschots <97630001+zoewalschots@users.noreply.github.com> Date: Thu, 22 Jun 2023 14:31:43 -0700 Subject: [PATCH 1/2] Edit Search and Download SWOT Notebook Implemented earthaccess package to simplify authentication, search, and download of data --- notebooks/SearchDownload_SWOTviaCMR.ipynb | 486 ++++++++++++---------- 1 file changed, 274 insertions(+), 212 deletions(-) diff --git a/notebooks/SearchDownload_SWOTviaCMR.ipynb b/notebooks/SearchDownload_SWOTviaCMR.ipynb index c0fa02ca..cfd35970 100644 --- a/notebooks/SearchDownload_SWOTviaCMR.ipynb +++ b/notebooks/SearchDownload_SWOTviaCMR.ipynb @@ -11,26 +11,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Search and Download Simulated SWOT Data via the Common Metadata Repository (CMR)\n", + "# Search and Download Simulated SWOT Data via `earthaccess`\n", "#### *Author: Cassandra Nickles, PO.DAAC*\n", "\n", "## Summary\n", - "This notebook will find and download simulated SWOT data programmatically via CMR. It searches for the desired data by shapefile extent but can be modified to do otherwise.\n", + "This notebook will find and download simulated SWOT data programmatically via earthaccess. For more information about earthaccess visit: https://nsidc.github.io/earthaccess/\n", "\n", "## Requirements\n", "### 1. Compute environment \n", "This tutorial can be run in the following environments:\n", - "- **AWS instance running in us-west-2**: NASA Earthdata Cloud data in S3 can be directly accessed via temporary credentials; this access is limited to requests made within the US West (Oregon) (code: `us-west-2`) AWS region.\n", "- **Local compute environment** e.g. laptop, server: this tutorial can be run on your local machine\n", "\n", "### 2. Earthdata Login\n", "\n", "An Earthdata Login account is required to access data, as well as discover restricted data, from the NASA Earthdata system. 
Thus, to access NASA data, you need Earthdata Login. Please visit https://urs.earthdata.nasa.gov to register and manage your Earthdata Login account. This account is free to create and only takes a moment to set up.\n", "\n", - "### 3. netrc File\n", - "\n", - "You will need a `.netrc` file containing your NASA Earthdata Login credentials. A `.netrc` file can be created manually within text editor and saved to your home directory. For additional information see: [Authentication for NASA Earthdata tutorial](https://nasa-openscapes.github.io/2021-Cloud-Workshop-AGU/tutorials/02_NASA_Earthdata_Authentication.html). If you do not have this file, a code block has been added below as a work around.\n", - "\n", "### Import libraries" ] }, @@ -49,72 +44,33 @@ "import os\n", "import zipfile\n", "from urllib.request import urlretrieve\n", - "from json import dumps" + "from json import dumps\n", + "import earthaccess\n", + "from earthaccess import Auth, DataCollections, DataGranules, Store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this notebook, we will be calling the authentication in the below cell, a work around if you do not yet have a netrc file." + "In this notebook, we authenticate with Earthdata Login in the cell below." 
] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please provide your Earthdata Login credentials for access.\n", - "Your info will only be passed to urs.earthdata.nasa.gov and will not be exposed in Jupyter.\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Username: nickles\n", - "Password: ···········\n" - ] - } - ], + "outputs": [], "source": [ - "from urllib import request\n", - "from http.cookiejar import CookieJar\n", - "from getpass import getpass\n", - "import netrc\n", - "from platform import system\n", - "from os.path import join, isfile, basename, abspath, expanduser\n", - "\n", - "def setup_earthdata_login_auth(endpoint: str='urs.earthdata.nasa.gov'):\n", - " netrc_name = \"_netrc\" if system()==\"Windows\" else \".netrc\"\n", - " try:\n", - " username, _, password = netrc(file=join(expanduser('~'), netrc_name)).authenticators(endpoint)\n", - " except (FileNotFoundError, TypeError):\n", - " print('Please provide your Earthdata Login credentials for access.')\n", - " print('Your info will only be passed to %s and will not be exposed in Jupyter.' 
% (endpoint))\n", - " username = input('Username: ')\n", - " password = getpass('Password: ')\n", - " manager = request.HTTPPasswordMgrWithDefaultRealm()\n", - " manager.add_password(None, endpoint, username, password)\n", - " auth = request.HTTPBasicAuthHandler(manager)\n", - " jar = CookieJar()\n", - " processor = request.HTTPCookieProcessor(jar)\n", - " opener = request.build_opener(auth, processor)\n", - " request.install_opener(opener)\n", - " \n", - "setup_earthdata_login_auth('urs.earthdata.nasa.gov')" + "auth = earthaccess.login(strategy=\"interactive\", persist=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Search Common Metadata Repository (CMR) for SWOT sample data links by Shapefile\n", - "We want to find the SWOT sample files that will cross over our region of interest. For this tutorial, we use a shapefile of the United States, finding 44 total granules over the land. Each dataset has it's own unique collection ID. For the SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1 dataset, we find the collection ID [here](https://podaac.jpl.nasa.gov/dataset/SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1).\n", + "### Search for SWOT sample data links\n", + "We want to find the SWOT sample files that will cross over our region of interest. Each dataset has it's own unique collection ID. For the SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1 dataset, we find the collection ID [here](https://podaac.jpl.nasa.gov/dataset/SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1).\n", "\n", "**Sample SWOT Hydrology Datasets and Associated Collection IDs:**\n", "1. 
**River Vector Shapefile** - SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1 - **C2263384307-POCLOUD**\n", @@ -137,68 +93,93 @@ "name": "stdout", "output_type": "stream", "text": [ - "44\n" + "Granule hits: 46\n" ] } ], "source": [ - "# the URL of the CMR service\n", - "cmr_url = 'https://cmr.earthdata.nasa.gov/search/granules.json'\n", - "\n", - "#The shapefile we want to use in our search\n", - "shp_file = open('../resources/US_shapefile.zip', 'rb')\n", - "\n", - "#need to declare the file and the type we are uploading\n", - "files = {'shapefile':('US_shapefile.zip',shp_file, 'application/shapefile+zip')}\n", - "\n", - "#used to define parameters such as the concept-id and things like temporal searches\n", - "parameters = {'collection_concept_id':'C2263384307-POCLOUD', #insert desired collection ID here\n", - " 'page_size': 2000}#, #default will only return 10 granules, so we set it to the max\n", - "\n", - "#request the granules from this collection that align with the shapefile\n", - "response = requests.post(cmr_url, params=parameters, files=files)\n", - "\n", - "#If you want to search by bounding box instead of shapefile, use the following instead:\n", - "#parameters = {'collection_concept_id':'C2263384307-POCLOUD',\n", - "# 'page_size': 2000, \n", - "# 'bounding_box':\"-124.848974,24.396308,-66.885444,49.384358\"} \n", - "#response = requests.post(cmr_url, params=parameters)\n", - "\n", - "if len(response.json()['feed']['entry'])>0:\n", - " print(len(response.json()['feed']['entry'])) #print out number of files found\n", - " #print(dumps(response.json()['feed']['entry'][0], indent=2)) #print out the first file information" + "#earthaccess data search\n", + "Query = DataGranules().concept_id(\"C2263384307-POCLOUD\").bounding_box(-124.848974,24.396308,-66.885444,49.384358)\n", + "print(f\"Granule hits: {Query.hits()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "granules = Query.get()" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#extract the data links from the granules\n", + "data_links = [g.data_links(access=\"on_prem\") for g in granules]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1/SWOT_L2_HR_RiverSP_Node_007_022_NA_20220804T224145_20220804T224402_PGA0_01.zip']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_links[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Get Download links from CMR search results" + "### Get Download links from `earthaccess` search results" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "22\n" + "23\n" ] } ], "source": [ + "#add desired links to a list\n", + "#if the link has \"Reach\" instead of \"Node\" in the name, we want to download it for the swath use case\n", "downloads = []\n", - "for r in response.json()['feed']['entry']:\n", - " for l in r['links']:\n", - " #if the link starts with the following, it is the download link we want\n", - " if 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/' in l['href']: \n", - " #if the link has \"Reach\" instead of \"Node\" in the name, we want to download it for the swath use case\n", - " if 'Reach' in l['href']:\n", - " downloads.append(l['href'])\n", - "print(len(downloads)) #should end up with half the number of files above since we only need reach files, not node files" + "for r in data_links:\n", + " for l in r:\n", + " if 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/' in l:\n", + " if 'Reach' in l:\n", + " downloads.append(l)\n", + " \n", + 
"print(len(downloads))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This leaves us with half of the original links from our search." ] }, { @@ -210,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -223,12 +204,87 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b6aa5d4e12b74d7cbdb328482ee40beb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "QUEUEING TASKS | : 0%| | 0/23 [00:00 Date: Thu, 22 Jun 2023 14:48:44 -0700 Subject: [PATCH 2/2] Add ref to US bbox Added line of text to refer to region of interest. --- notebooks/SearchDownload_SWOTviaCMR.ipynb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/notebooks/SearchDownload_SWOTviaCMR.ipynb b/notebooks/SearchDownload_SWOTviaCMR.ipynb index cfd35970..2bfe6602 100644 --- a/notebooks/SearchDownload_SWOTviaCMR.ipynb +++ b/notebooks/SearchDownload_SWOTviaCMR.ipynb @@ -70,7 +70,9 @@ "metadata": {}, "source": [ "### Search for SWOT sample data links\n", - "We want to find the SWOT sample files that will cross over our region of interest. Each dataset has it's own unique collection ID. For the SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1 dataset, we find the collection ID [here](https://podaac.jpl.nasa.gov/dataset/SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1).\n", + "We want to find the SWOT sample files that will cross over our region of interest, in this case, a bounding box of the United States. \n", + "\n", + "Each dataset has its own unique collection ID. 
For the SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1 dataset, we find the collection ID [here](https://podaac.jpl.nasa.gov/dataset/SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1).\n", "\n", "**Sample SWOT Hydrology Datasets and Associated Collection IDs:**\n", "1. **River Vector Shapefile** - SWOT_SIMULATED_NA_CONTINENT_L2_HR_RIVERSP_V1 - **C2263384307-POCLOUD**\n", @@ -481,7 +483,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.8.13" }, "vscode": { "interpreter": {