From 6b6ad6a14b3ca0656b3ca40cd85dd2231e2a49dc Mon Sep 17 00:00:00 2001 From: Edwin <34405119+edsaac@users.noreply.github.com> Date: Fri, 25 Aug 2023 09:44:21 -0500 Subject: [PATCH] Improve Metadata class (#110) * make Metadata a dataclass * Remove `__name__ == "__main__"` from module * Refactor Metadata, take response in initiliazer * Check where Metadata objects are constructed * Refactor `set_metadata` functions within the `__init__` of `BaseMetadata` subclasses. * Add a `__repr__` to BaseMetadata * Remove dataclass import * Update children Metadata docstrings. * Implement site_info and variable_info * site_info and variable_info as property, not as callable * Implement site_info property * Update tests to handle site_info as property * Add tests to BaseMetadata * site_info and variable_info are not callables anymore * Complete docstring with class attributes * Add docstring to WQP metadata class --- dataretrieval/nwis.py | 159 ++++++++++++++++++++++-------------- dataretrieval/utils.py | 84 +++++++++++++------ dataretrieval/wqp.py | 78 +++++++++++++----- tests/nwis_test.py | 61 +++++++------- tests/utils_test.py | 20 +++++ tests/waterservices_test.py | 4 +- 6 files changed, 264 insertions(+), 142 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 64f95b2..acb1af6 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -18,7 +18,7 @@ import re from dataretrieval.utils import to_str, format_datetime, update_merge -from dataretrieval.utils import set_metadata as set_md +from dataretrieval.utils import BaseMetadata from .utils import query WATERDATA_BASE_URL = 'https://nwis.waterdata.usgs.gov/' @@ -211,7 +211,7 @@ def _qwdata(datetime_index=True, ssl_check=True, **kwargs): df = format_datetime(df, 'sample_dt', 'sample_tm', 'sample_start_time_datum_cd') - return format_response(df, **kwargs), _set_metadata(response, **kwargs) + return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) def get_discharge_measurements(sites=None, start=None, end=None, @@ -267,7 +267,7 @@ def get_discharge_measurements(sites=None, start=None, end=None, def _discharge_measurements(ssl_check=True, **kwargs): response = query_waterdata('measurements', format='rdb', ssl_check=ssl_check, **kwargs) - return _read_rdb(response.text), _set_metadata(response, **kwargs) + return _read_rdb(response.text), NWIS_Metadata(response, **kwargs) def get_discharge_peaks(sites=None, start=None, end=None, @@ -328,7 +328,7 @@ def _discharge_peaks(ssl_check=True, **kwargs): df = _read_rdb(response.text) - return format_response(df, service='peaks', **kwargs), _set_metadata( + return format_response(df, service='peaks', **kwargs), NWIS_Metadata( response, **kwargs) @@ -388,7 +388,7 @@ def _gwlevels(datetime_index=True, ssl_check=True, **kwargs): if datetime_index == True: df = format_datetime(df, 'lev_dt', 'lev_tm', 'lev_tz_cd') - return format_response(df, **kwargs), _set_metadata(response, **kwargs) + return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) def get_stats(sites, ssl_check=True, **kwargs): @@ -443,7 +443,7 @@ def get_stats(sites, ssl_check=True, **kwargs): response = query_waterservices('stat', sites=sites, ssl_check=ssl_check, **kwargs) - return _read_rdb(response.text), _set_metadata(response, **kwargs) + return _read_rdb(response.text), NWIS_Metadata(response, **kwargs) def query_waterdata(service, ssl_check=True, **kwargs): @@ -585,7 +585,7 @@ def _dv(ssl_check=True, **kwargs): ssl_check=ssl_check, **kwargs) df = _read_json(response.json()) - return format_response(df, **kwargs), _set_metadata(response, **kwargs) + return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) def get_info(ssl_check=True, **kwargs): @@ -691,7 +691,7 @@ def get_info(ssl_check=True, **kwargs): response = query_waterservices('site', ssl_check=ssl_check, **kwargs) - return _read_rdb(response.text), _set_metadata(response, **kwargs) + return _read_rdb(response.text), NWIS_Metadata(response, **kwargs) def get_iv(sites=None, start=None, end=None, multi_index=True, @@ -743,7 +743,7 @@ def _iv(ssl_check=True, **kwargs): response = query_waterservices('iv', format='json', ssl_check=ssl_check, **kwargs) df = _read_json(response.json()) - return format_response(df, **kwargs), _set_metadata(response, **kwargs) + return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) def get_pmcodes(parameterCd='All', partial=True, ssl_check=True): @@ -792,7 +792,7 @@ def get_pmcodes(parameterCd='All', partial=True, ssl_check=True): payload.update({'group_cd': '%'}) url = ALLPARAMCODES_URL response = query(url, payload, ssl_check=ssl_check) - return _read_rdb(response.text), _set_metadata(response) + return _read_rdb(response.text), NWIS_Metadata(response) else: parameterCd = [parameterCd] @@ -814,7 +814,7 @@ def get_pmcodes(parameterCd='All', partial=True, ssl_check=True): l.append(_read_rdb(response.text)) else: raise TypeError('Parameter information (code or name) must be type string') - return pd.concat(l), _set_metadata(response) + return pd.concat(l), NWIS_Metadata(response) def get_water_use(years="ALL", state=None, counties="ALL", categories="ALL", @@ -874,7 +874,7 @@ def get_water_use(years="ALL", state=None, counties="ALL", categories="ALL", url = WATERDATA_BASE_URL + state + "/nwis/water_use" payload.update({"wu_area": "county"}) response = query(url, payload, ssl_check=ssl_check) - return _read_rdb(response.text), _set_metadata(response) + return _read_rdb(response.text), NWIS_Metadata(response) def get_ratings(site=None, file_type="base", ssl_check=True, **kwargs): @@ -932,7 +932,7 @@ def _ratings(site, file_type, ssl_check=True): raise ValueError('Unrecognized file_type: {}, must be "base", "corr" or "exsa"'.format(file_type)) payload.update({"file_type" : file_type}) response = query(url, payload, ssl_check=ssl_check) - return _read_rdb(response.text), _set_metadata(response, site_no=site) + return _read_rdb(response.text), NWIS_Metadata(response, site_no=site) def what_sites(ssl_check=True, **kwargs): @@ -965,7 +965,7 @@ def what_sites(ssl_check=True, **kwargs): df = _read_rdb(response.text) - return df, _set_metadata(response, **kwargs) + return df, NWIS_Metadata(response, **kwargs) def get_record(sites=None, start=None, end=None, @@ -1236,51 +1236,90 @@ def _read_rdb(rdb): df = format_response(df) return df - -def _set_metadata(response, **parameters): - """Generates a standard set of metadata informed by the response. - - Parameters +class NWIS_Metadata(BaseMetadata): + """Metadata class for NWIS service, derived from BaseMetadata. + + Attributes ---------- - response: Response - Response object from requests module - parameters: unpacked dictionary - Unpacked dictionary of the parameters supplied in the request - - Returns - ------- - md: :obj:`dataretrieval.utils.Metadata` - A ``dataretrieval`` custom :obj:`dataretrieval.utils.Metadata` object. - + url : str + Response url + query_time: datetme.timedelta + Response elapsed time + header: requests.structures.CaseInsensitiveDict + Response headers + comments: str | None + Metadata comments, if any + site_info: tuple[pd.DataFrame, NWIS_Metadata] | None + Site information if the query included `site_no`, `sites`, `stateCd`, + `huc`, `countyCd` or `bBox`. `site_no` is preferred over `sites` if + both are present. + variable_info: tuple[pd.DataFrame, NWIS_Metadata] | None + Variable information if the query included `parameterCd`. + """ - md = set_md(response) - # site_no is preferred over sites to set site_info if both are present, - # matching behavior of the get_rating() function - if 'site_no' in parameters: - md.site_info = lambda: what_sites(sites=parameters['site_no']) - elif 'sites' in parameters: - md.site_info = lambda: what_sites(sites=parameters['sites']) - elif 'stateCd' in parameters: - md.site_info = lambda: what_sites(stateCd=parameters['stateCd']) - elif 'huc' in parameters: - md.site_info = lambda: what_sites(huc=parameters['huc']) - elif 'countyCd' in parameters: - md.site_info = lambda: what_sites(countyCd=parameters['countyCd']) - elif 'bBox' in parameters: - md.site_info = lambda: what_sites(bBox=parameters['bBox']) - else: - pass # don't set metadata site_info attribute - - # define variable_info metadata based on parameterCd if available - if 'parameterCd' in parameters: - md.variable_info = lambda: get_pmcodes( - parameterCd=parameters['parameterCd']) - - comments = "" - for line in response.text.splitlines(): - if line.startswith("#"): - comments += line.lstrip("#") + "\n" - if comments != "": - md.comment = comments - - return md + def __init__(self, response, **parameters) -> None: + """Generates a standard set of metadata informed by the response with specific + metadata for NWIS data. + + Parameters + ---------- + response: Response + Response object from requests module + parameters: unpacked dictionary + Unpacked dictionary of the parameters supplied in the request + + Returns + ------- + md: :obj:`dataretrieval.nwis.NWIS_Metadata` + A ``dataretrieval`` custom :obj:`dataretrieval.nwis.NWIS_Metadata` object. + + """ + super().__init__(response) + + comments = "" + for line in response.text.splitlines(): + if line.startswith("#"): + comments += line.lstrip("#") + "\n" + if comments: + self.comment = comments + + self._parameters = parameters + + @property + def site_info(self): + """ + Return + ------ + df: ``pandas.DataFrame`` + Formatted requested data from calling `nwis.what_sites` + md: :obj:`dataretrieval.nwis.NWIS_Metadata` + A NWIS_Metadata object + """ + if 'site_no' in self._parameters: + return what_sites(sites=self._parameters['site_no']) + + elif 'sites' in self._parameters: + return what_sites(sites=self._parameters['sites']) + + elif 'stateCd' in self._parameters: + return what_sites(stateCd=self._parameters['stateCd']) + + elif 'huc' in self._parameters: + return what_sites(huc=self._parameters['huc']) + + elif 'countyCd' in self._parameters: + return what_sites(countyCd=self._parameters['countyCd']) + + elif 'bBox' in self._parameters: + return what_sites(bBox=self._parameters['bBox']) + + else: + return None # don't set metadata site_info attribute + + @property + def variable_info(self): + + # define variable_info metadata based on parameterCd if available + if 'parameterCd' in self._parameters: + return get_pmcodes(parameterCd=self._parameters['parameterCd']) + diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 01e755c..d9c27eb 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -7,7 +7,6 @@ import dataretrieval from dataretrieval.codes import tz - def to_str(listlike, delimiter=','): """Translates list-like objects into strings. @@ -135,32 +134,64 @@ def update_merge(left, right, na_only=False, on=None, **kwargs): return df - -class Metadata: - """Custom class for metadata. - """ - url = None - query_time = None - site_info = None - header = None - variable_info = None - comment = None - - # note sure what statistic_info is - statistic_info = None - # disclaimer seems to be only part of importWaterML1 - disclaimer = None - - -def set_metadata(response): - """Function to initialize and set metadata from an API response. +class BaseMetadata: + """Base class for metadata. + + Attributes + ---------- + url : str + Response url + query_time: datetme.timedelta + Response elapsed time + header: requests.structures.CaseInsensitiveDict + Response headers + """ - md = Metadata() - md.url = response.url - md.query_time = response.elapsed - md.header = response.headers - return md - + + def __init__(self, response) -> None: + """Generates a standard set of metadata informed by the response. + + Parameters + ---------- + response: Response + Response object from requests module + + Returns + ------- + md: :obj:`dataretrieval.utils.BaseMetadata` + A ``dataretrieval`` custom :obj:`dataretrieval.utils.BaseMetadata` object. + + """ + + # These are built from the API response + self.url = response.url + self.query_time = response.elapsed + self.header = response.headers + self.comment = None + + # # not sure what statistic_info is + # self.statistic_info = None + + # # disclaimer seems to be only part of importWaterML1 + # self.disclaimer = None + + # These properties are to be set by `nwis` or `wqp`-specific metadata classes. + @property + def site_info(self): + raise NotImplementedError( + "site_info must be implemented by utils.BaseMetadata children" + ) + + @property + def variable_info(self): + raise NotImplementedError( + "variable_info must be implemented by utils.BaseMetadata children" + ) + + + def __repr__(self) -> str: + return f"{type(self).__name__}(url={self.url})" + def query(url, payload, delimiter=',', ssl_check=True): """Send a query. @@ -234,3 +265,4 @@ def __init__(self, url): def __str__(self): return "No sites/data found using the selection criteria specified in url: {}".format(self.url) + diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index e7df387..d79f630 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -10,7 +10,7 @@ """ import pandas as pd from io import StringIO -from .utils import query, set_metadata as set_md +from .utils import query, BaseMetadata import warnings @@ -85,7 +85,7 @@ def get_results(ssl_check=True, **kwargs): ssl_check=ssl_check) df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_sites(ssl_check=True, **kwargs): @@ -122,7 +122,7 @@ def what_sites(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_organizations(ssl_check=True, **kwargs): @@ -158,7 +158,7 @@ def what_organizations(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_projects(ssl_check=True, **kwargs): @@ -194,7 +194,7 @@ def what_projects(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_activities(ssl_check=True, **kwargs): @@ -233,7 +233,7 @@ def what_activities(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_detection_limits(ssl_check=True, **kwargs): @@ -273,7 +273,7 @@ def what_detection_limits(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_habitat_metrics(ssl_check=True, **kwargs): @@ -310,7 +310,7 @@ def what_habitat_metrics(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_project_weights(ssl_check=True, **kwargs): @@ -349,7 +349,7 @@ def what_project_weights(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def what_activity_metrics(ssl_check=True, **kwargs): @@ -388,7 +388,7 @@ def what_activity_metrics(ssl_check=True, **kwargs): df = pd.read_csv(StringIO(response.text), delimiter=',') - return df, set_metadata(response) + return df, WQP_Metadata(response) def wqp_url(service): @@ -398,18 +398,54 @@ def wqp_url(service): return '{}{}/Search?'.format(base_url, service) -def set_metadata(response, **parameters): - """ Set metadata for WQP data. +class WQP_Metadata(BaseMetadata): + """Metadata class for WQP service, derived from BaseMetadata. + + Attributes + ---------- + url : str + Response url + query_time: datetme.timedelta + Response elapsed time + header: requests.structures.CaseInsensitiveDict + Response headers + comments: None + Metadata comments. WQP does not return comments. + site_info: tuple[pd.DataFrame, NWIS_Metadata] | None + Site information if the query included `sites`, `site` or `site_no`. """ - md = set_md(response) # initialize dataretrieval metadata object - # populate the metadata using NWIS site information - if 'sites' in parameters: - md.site_info = lambda: what_sites(sites=parameters['sites']) - elif 'site' in parameters: - md.site_info = lambda: what_sites(sites=parameters['site']) - elif 'site_no' in parameters: - md.site_info = lambda: what_sites(sites=parameters['site_no']) - return md + + def __init__(self, response, **parameters) -> None: + """Generates a standard set of metadata informed by the response with specific + metadata for WQP data. + + Parameters + ---------- + response: Response + Response object from requests module + + parameters: unpacked dictionary + Unpacked dictionary of the parameters supplied in the request + + Returns + ------- + md: :obj:`dataretrieval.wqp.WQP_Metadata` + A ``dataretrieval`` custom :obj:`dataretrieval.wqp.WQP_Metadata` object. + + """ + + super().__init__(response) + + self._parameters = parameters + + @property + def site_info(self): + if 'sites' in self._parameters: + return what_sites(sites=parameters['sites']) + elif 'site' in self._parameters: + return what_sites(sites=parameters['site']) + elif 'site_no' in self._parameters: + return what_sites(sites=parameters['site_no']) def _alter_kwargs(kwargs): diff --git a/tests/nwis_test.py b/tests/nwis_test.py index e95c807..15a3640 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -5,7 +5,7 @@ import datetime from dataretrieval.nwis import get_record, preformat_peaks_response, get_info from dataretrieval.nwis import what_sites, get_iv, get_dv, get_discharge_peaks -from dataretrieval.nwis import _set_metadata +from dataretrieval.nwis import NWIS_Metadata import unittest.mock as mock @@ -208,70 +208,65 @@ def test_empty_timeseries(): class TestMetaData: - """Tests of NWIS metadata setting, based on GitHub Issue #73.""" + """Tests of NWIS metadata setting, + + Notes + ----- + + - Originally based on GitHub Issue #73. + - Modified to site_info and variable_info as properties, not callables. + """ def test_set_metadata_info_site(self): """Test metadata info is set when site parameter is supplied.""" # mock the query response response = mock.MagicMock() # make metadata call - md = _set_metadata(response, sites='01491000') - # assert that metadata info exists but don't execute lambda function - assert md.site_info is not None - # assert metadata site_info is callable - assert hasattr(md.site_info, '__call__') - + md = NWIS_Metadata(response, sites='01491000') + # assert that site_info is implemented + assert md.site_info + def test_set_metadata_info_site_no(self): """Test metadata info is set when site_no parameter is supplied.""" # mock the query response response = mock.MagicMock() # make metadata call - md = _set_metadata(response, site_no='01491000') - # assert that metadata info exists but don't execute lambda function - assert md.site_info is not None - # assert metadata site_info is callable - assert hasattr(md.site_info, '__call__') + md = NWIS_Metadata(response, site_no='01491000') + # assert that site_info is implemented + assert md.site_info def test_set_metadata_info_stateCd(self): """Test metadata info is set when stateCd parameter is supplied.""" # mock the query response response = mock.MagicMock() # make metadata call - md = _set_metadata(response, stateCd='RI') - # assert that metadata info exists but don't execute lambda function - assert md.site_info is not None - # assert metadata site_info is callable - assert hasattr(md.site_info, '__call__') + md = NWIS_Metadata(response, stateCd='RI') + # assert that site_info is implemented + assert md.site_info def test_set_metadata_info_huc(self): """Test metadata info is set when huc parameter is supplied.""" # mock the query response response = mock.MagicMock() # make metadata call - md = _set_metadata(response, huc='01') - # assert that metadata info exists but don't execute lambda function - assert md.site_info is not None - # assert metadata site_info is callable - assert hasattr(md.site_info, '__call__') + md = NWIS_Metadata(response, huc='01') + # assert that site_info is implemented + assert md.site_info def test_set_metadata_info_bbox(self): """Test metadata info is set when bbox parameter is supplied.""" # mock the query response response = mock.MagicMock() # make metadata call - md = _set_metadata(response, bBox='-92.8,44.2,-88.9,46.0') - # assert that metadata info exists but don't execute lambda function - assert md.site_info is not None - # assert metadata site_info is callable - assert hasattr(md.site_info, '__call__') + md = NWIS_Metadata(response, bBox='-92.8,44.2,-88.9,46.0') + # assert that site_info is implemented + assert md.site_info def test_set_metadata_info_countyCd(self): """Test metadata info is set when countyCd parameter is supplied.""" # mock the query response response = mock.MagicMock() # make metadata call - md = _set_metadata(response, countyCd='01001') - # assert that metadata info exists but don't execute lambda function - assert md.site_info is not None - # assert metadata site_info is callable - assert hasattr(md.site_info, '__call__') + md = NWIS_Metadata(response, countyCd='01001') + # assert that site_info is implemented + assert md.site_info diff --git a/tests/utils_test.py b/tests/utils_test.py index 32d534f..76522c5 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -2,6 +2,7 @@ import pytest from dataretrieval import utils import dataretrieval.nwis as nwis +import unittest.mock as mock class Test_query: @@ -31,3 +32,22 @@ def test_header(self): response = utils.query(url, payload) assert response.status_code == 200 # GET was successful assert 'user-agent' in response.request.headers + +class Test_BaseMetadata: + """Tests of BaseMetadata""" + + def test_init_with_response(self): + response = mock.MagicMock() + md = utils.BaseMetadata(response) + + ## Test parameters initialized from the API response + assert md.url is not None + assert md.query_time is not None + assert md.header is not None + + ## Test NotImplementedError parameters + with pytest.raises(NotImplementedError): + md.site_info + with pytest.raises(NotImplementedError): + md.variable_info + diff --git a/tests/waterservices_test.py b/tests/waterservices_test.py index 7dc780e..2b15a39 100755 --- a/tests/waterservices_test.py +++ b/tests/waterservices_test.py @@ -271,7 +271,7 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format): site_request_url = "https://waterservices.usgs.gov/nwis/site?sites={}&format=rdb".format(site) with open('data/waterservices_site.txt') as text: requests_mock.get(site_request_url, text=text.read()) - site_info, _ = md.site_info() + site_info, _ = md.site_info assert type(site_info) is DataFrame if parameter_cd is None: assert md.variable_info is None @@ -280,7 +280,7 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format): pcode_request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%25{}%25".format(param) with open('data/waterdata_pmcodes.txt') as text: requests_mock.get(pcode_request_url, text=text.read()) - variable_info, _ = md.variable_info() + variable_info, _ = md.variable_info assert type(variable_info) is DataFrame if format == "rdb":