Skip to content

Commit

Permalink
add ability to coerce incomplete datetime info + tests (#58)
Browse files Browse the repository at this point in the history
* add ability to coerce incomplete datetime info + tests

* handle datetime and make args consistent across functions

* minor version increment
  • Loading branch information
elbeejay authored Dec 7, 2022
1 parent d72012b commit 4318d07
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 38 deletions.
87 changes: 57 additions & 30 deletions dataretrieval/nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ def preformat_peaks_response(df):
return df


def get_qwdata(datetime_index=True, wide_format=True, sites=None,
start=None, end=None, multi_index=True,**kwargs):
def get_qwdata(sites=None, start=None, end=None,
multi_index=True, wide_format=True, datetime_index=True,
**kwargs):
"""
Get water sample data from qwdata service.
Expand All @@ -77,10 +78,6 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None,
Parameters
----------
datetime_index : boolean
If True, create a datetime index
wide_format : boolean
If True, return data in wide format with multiple samples per row and one row per time.
sites: array of strings
If the qwdata parameter site_no is supplied, it will overwrite the sites parameter
start: string
Expand All @@ -89,6 +86,10 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None,
If the qwdata parameter end_date is supplied, it will overwrite the end parameter
multi_index: boolean
If False, a dataframe with a single-level index (datetime) is returned
wide_format : boolean
If True, return data in wide format with multiple samples per row and one row per time
datetime_index : boolean
If True, create a datetime index
**kwargs: optional
If supplied, will be used as query parameters
Expand All @@ -100,8 +101,10 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None,
start = kwargs.pop('begin_date', start)
end = kwargs.pop('end_date', end)
sites = kwargs.pop('site_no', sites)
return _qwdata(site_no=sites, begin_date=start, end_date=end, datetime_index=datetime_index,
multi_index=multi_index, ** kwargs)
return _qwdata(site_no=sites, begin_date=start, end_date=end,
datetime_index=datetime_index,
multi_index=multi_index, **kwargs)


def _qwdata(datetime_index=True, **kwargs):
# check number of sites, may need to create multiindex
Expand Down Expand Up @@ -181,7 +184,8 @@ def _discharge_measurements(**kwargs):
return _read_rdb(response.text), _set_metadata(response, **kwargs)


def get_discharge_peaks(sites=None, start=None, end=None, multi_index=True, **kwargs):
def get_discharge_peaks(sites=None, start=None, end=None,
multi_index=True, **kwargs):
"""
Get discharge peaks from the waterdata service.
Expand All @@ -193,6 +197,8 @@ def get_discharge_peaks(sites=None, start=None, end=None, multi_index=True, **k
If the waterdata parameter begin_date is supplied, it will overwrite the start parameter
end: string
If the waterdata parameter end_date is supplied, it will overwrite the end parameter
multi_index: boolean
If False, a dataframe with a single-level index (datetime) is returned
**kwargs: optional
If supplied, will be used as query parameters
Expand All @@ -202,7 +208,8 @@ def get_discharge_peaks(sites=None, start=None, end=None, multi_index=True, **k
start = kwargs.pop('begin_date', start)
end = kwargs.pop('end_date', end)
sites = kwargs.pop('site_no', sites)
return _discharge_peaks(site_no=sites, begin_date=start, end_date=end, multi_index=multi_index, **kwargs)
return _discharge_peaks(site_no=sites, begin_date=start, end_date=end,
multi_index=multi_index, **kwargs)


def _discharge_peaks(**kwargs):
Expand All @@ -213,7 +220,8 @@ def _discharge_peaks(**kwargs):
return format_response(df, service='peaks', **kwargs), _set_metadata(response, **kwargs)


def get_gwlevels(start='1851-01-01', end=None, multi_index=True, **kwargs):
def get_gwlevels(sites=None, start='1851-01-01', end=None,
multi_index=True, datetime_index=True, **kwargs):
"""
Queries the groundwater level service from waterservices
Expand All @@ -224,6 +232,10 @@ def get_gwlevels(start='1851-01-01', end=None, multi_index=True, **kwargs):
parameter (defaults to '1851-01-01')
end: string
If the waterdata parameter end_date is supplied, it will overwrite the end parameter
multi_index: boolean
If False, a dataframe with a single-level index (datetime) is returned
datetime_index : boolean
If True, create a datetime index
**kwargs: optional
If supplied, will be used as query parameters
Expand All @@ -232,15 +244,20 @@ def get_gwlevels(start='1851-01-01', end=None, multi_index=True, **kwargs):
"""
start = kwargs.pop('startDT', start)
end = kwargs.pop('endDT', end)
return _gwlevels(startDT=start, endDT=end, multi_index=multi_index, **kwargs)
sites = kwargs.pop('sites', sites)
return _gwlevels(startDT=start, endDT=end,
datetime_index=datetime_index, sites=sites,
multi_index=multi_index, **kwargs)


def _gwlevels(**kwargs):
def _gwlevels(datetime_index=True, **kwargs):

response = query_waterservices('gwlevels', **kwargs)

df = _read_rdb(response.text)
df = format_datetime(df, 'lev_dt', 'lev_tm', 'lev_tz_cd')

if datetime_index == True:
df = format_datetime(df, 'lev_dt', 'lev_tm', 'lev_tz_cd')

return format_response(df, **kwargs), _set_metadata(response, **kwargs)

Expand Down Expand Up @@ -332,7 +349,7 @@ def query_waterservices(service, **kwargs):
return query(url, payload=kwargs)


def get_dv(start=None, end=None, multi_index=True, **kwargs):
def get_dv(sites=None, start=None, end=None, multi_index=True, **kwargs):
"""
Get daily values data from NWIS and return it as a ``pandas.DataFrame``.
Expand All @@ -352,7 +369,9 @@ def get_dv(start=None, end=None, multi_index=True, **kwargs):
"""
start = kwargs.pop('startDT', start)
end = kwargs.pop('endDT', end)
return _dv(startDT=start, endDT=end, multi_index=multi_index, **kwargs)
sites = kwargs.pop('sites', sites)
return _dv(startDT=start, endDT=end, sites=sites,
multi_index=multi_index, **kwargs)


def _dv(**kwargs):
Expand All @@ -371,7 +390,7 @@ def get_info(**kwargs):
Parameters
----------
sites : string or list
A list of site numters. Sites may be prefixed with an optional agency
A list of site numbers. Sites may be prefixed with an optional agency
code followed by a colon.
stateCd : string
Expand Down Expand Up @@ -453,7 +472,7 @@ def get_info(**kwargs):
return _read_rdb(response.text), _set_metadata(response, **kwargs)


def get_iv(start=None, end=None, multi_index=True, **kwargs):
def get_iv(sites=None, start=None, end=None, multi_index=True, **kwargs):
"""Get instantaneous values data from NWIS and return it as a DataFrame.
Note: If no start or end date are provided, only the most recent record is returned.
Expand All @@ -470,7 +489,9 @@ def get_iv(start=None, end=None, multi_index=True, **kwargs):
"""
start = kwargs.pop('startDT', start)
end = kwargs.pop('endDT', end)
return _iv(startDT=start, endDT=end, multi_index=multi_index, **kwargs)
sites = kwargs.pop('sites', sites)
return _iv(startDT=start, endDT=end, sites=sites,
multi_index=multi_index, **kwargs)


def _iv(**kwargs):
Expand All @@ -479,7 +500,7 @@ def _iv(**kwargs):
return format_response(df, **kwargs), _set_metadata(response, **kwargs)


def get_pmcodes(parameterCd = 'All', partial = True):
def get_pmcodes(parameterCd='All', partial=True):
"""
Return a ``pandas.DataFrame`` containing all NWIS parameter codes.
Expand Down Expand Up @@ -621,8 +642,9 @@ def what_sites(**kwargs):
return df, _set_metadata(response, **kwargs)


def get_record(sites=None, start=None, end=None, state=None,
service='iv', *args, **kwargs):
def get_record(sites=None, start=None, end=None,
multi_index=True, wide_format=True, datetime_index=True,
state=None, service='iv', **kwargs):
"""
Get data from NWIS and return it as a ``pandas.DataFrame``.
Expand Down Expand Up @@ -654,16 +676,19 @@ def get_record(sites=None, start=None, end=None, state=None,
raise TypeError('Unrecognized service: {}'.format(service))

if service == 'iv':
df, _ = get_iv(sites=sites, startDT=start, endDT=end, **kwargs)
df, _ = get_iv(sites=sites, startDT=start, endDT=end,
multi_index=multi_index, **kwargs)
return df

elif service == 'dv':
df, _ = get_dv(sites=sites, startDT=start, endDT=end, **kwargs)
df, _ = get_dv(sites=sites, startDT=start, endDT=end,
multi_index=multi_index, **kwargs)
return df

elif service == 'qwdata':
df, _ = get_qwdata(site_no=sites, begin_date=start, end_date=end,
qw_sample_wide='separated_wide', **kwargs)
multi_index=multi_index,
wide_format=wide_format, **kwargs)
return df

elif service == 'site':
Expand All @@ -677,12 +702,14 @@ def get_record(sites=None, start=None, end=None, state=None,

elif service == 'peaks':
df, _ = get_discharge_peaks(site_no=sites, begin_date=start,
end_date=end, **kwargs)
end_date=end,
multi_index=multi_index, **kwargs)
return df

elif service == 'gwlevels':
df, _ = get_gwlevels(sites=sites, startDT=start, endDT=end,
**kwargs)
multi_index=multi_index,
datetime_index=datetime_index, **kwargs)
return df

elif service == 'pmcodes':
Expand All @@ -694,7 +721,7 @@ def get_record(sites=None, start=None, end=None, state=None,
return df

elif service == 'ratings':
df, _ = get_ratings(**kwargs)
df, _ = get_ratings(site=sites, **kwargs)
return df

else:
Expand Down Expand Up @@ -788,8 +815,8 @@ def _read_rdb(rdb):
break

fields = re.split("[\t]", rdb.splitlines()[count])
fields = [field.replace(",", "") for field in fields]
dtypes = {'site_no': str, 'dec_long_va': float, 'dec_lat_va': float, 'parm_cd': str, 'parameter_cd':str}
fields = [field.replace(",", "") for field in fields]
dtypes = {'site_no': str, 'dec_long_va': float, 'dec_lat_va': float, 'parm_cd': str, 'parameter_cd': str}

df = pd.read_csv(StringIO(rdb), delimiter='\t', skiprows=count + 2,
names=fields, na_values='NaN', dtype=dtypes)
Expand Down
13 changes: 11 additions & 2 deletions dataretrieval/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""
Useful utilities for data munging.
"""
import warnings
import numpy as np
import pandas as pd
import requests
from dataretrieval.codes import tz
Expand Down Expand Up @@ -51,15 +53,22 @@ def format_datetime(df, date_field, time_field, tz_field):
df : ``pandas.DataFrame``
"""

#create a datetime index from the columns in qwdata response
# create a datetime index from the columns in qwdata response
df[tz_field] = df[tz_field].map(tz)

df['datetime'] = pd.to_datetime(df[date_field] + ' ' +
df[time_field] + ' ' +
df[tz_field],
format = '%Y-%m-%d %H:%M',
format='%Y-%m-%d %H:%M',
utc=True)

# if there are any incomplete dates, warn the user
if any(pd.isna(df['datetime'])):
count = sum(pd.isna(df['datetime']) == True)
warnings.warn(
f'Warning: {count} incomplete dates found, ' +
'consider setting datetime_index to False.', UserWarning)

return df


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup

setup(name='dataretrieval',
version='0.7',
version='0.8',
description='',
url='',
author='Timothy Hodson',
Expand Down
63 changes: 62 additions & 1 deletion tests/nwis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_iv_service():
start = START_DATE
end = END_DATE
service = 'iv'
site = ['03339000','05447500','03346500']
site = ['03339000', '05447500', '03346500']
return get_record(site, start, end, service=service)

def test_iv_service_answer():
Expand Down Expand Up @@ -61,3 +61,64 @@ def test_preformat_peaks_response():
if __name__=='__main__':
test_measurements_service_answer()
test_iv_service_answer()


# tests using real queries to USGS webservices
# these specific queries represent edge cases; the tests below check the
# handling of incomplete date-time information

def test_inc_date_01():
    """Test based on GitHub Issue #47 - lack of timestamp for measurement.

    Runs the same 'gwlevels' query twice through ``get_record``: once with
    the default datetime index (expected to warn about incomplete dates) and
    once with ``datetime_index=False``.
    """
    site = "403451073585601"
    start, end = "1980-01-01", "1990-01-01"

    # make call expecting a warning to be thrown due to incomplete dates
    with pytest.warns(UserWarning):
        df = get_record(site, start, end, service='gwlevels')
    # assert that there are indeed incomplete dates (missing index entries)
    assert pd.isna(df.index).any()
    # assert that the datetime index is there
    assert df.index.name == 'datetime'

    # make call without defining a datetime index and check that it isn't there
    df2 = get_record(site, start, end, service='gwlevels',
                     datetime_index=False)
    # assert shape of both dataframes is the same (contain the same data)
    assert df.shape == df2.shape
    # assert that the datetime index is not there
    assert df2.index.name != 'datetime'


def test_inc_date_02():
    """Test based on GitHub Issue #47 - lack of month, day, or time.

    Runs the same 'gwlevels' query twice through ``get_record``: once with
    the default datetime index (expected to warn about incomplete dates) and
    once with ``datetime_index=False``.
    """
    site = "180049066381200"
    start, end = "1900-01-01", "2013-01-01"

    # make call expecting a warning to be thrown due to incomplete dates
    with pytest.warns(UserWarning):
        df = get_record(site, start, end, service='gwlevels')
    # assert that there are indeed incomplete dates (missing index entries)
    assert pd.isna(df.index).any()
    # assert that the datetime index is there
    assert df.index.name == 'datetime'

    # make call without defining a datetime index and check that it isn't there
    df2 = get_record(site, start, end, service='gwlevels',
                     datetime_index=False)
    # assert shape of both dataframes is the same (contain the same data)
    assert df.shape == df2.shape
    # assert that the datetime index is not there
    assert df2.index.name != 'datetime'


def test_inc_date_03():
    """Test based on GitHub Issue #47 - lack of day, and times.

    Runs the same 'gwlevels' query twice through ``get_record``: once with
    the default datetime index (expected to warn about incomplete dates) and
    once with ``datetime_index=False``.
    """
    site = "290000095192602"
    start, end = "1975-01-01", "2000-01-01"

    # make call expecting a warning to be thrown due to incomplete dates
    with pytest.warns(UserWarning):
        df = get_record(site, start, end, service='gwlevels')
    # assert that there are indeed incomplete dates (missing index entries)
    assert pd.isna(df.index).any()
    # assert that the datetime index is there
    assert df.index.name == 'datetime'

    # make call without defining a datetime index and check that it isn't there
    df2 = get_record(site, start, end, service='gwlevels',
                     datetime_index=False)
    # assert shape of both dataframes is the same (contain the same data)
    assert df.shape == df2.shape
    # assert that the datetime index is not there
    assert df2.index.name != 'datetime'
8 changes: 4 additions & 4 deletions tests/waterservices_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def test_get_qwdata(requests_mock):
'&date_format=YYYY-MM-DD&rdb_compression=value&submmitted_form=brief_list'.format(site, format)
response_file_path = 'data/waterdata_qwdata.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_qwdata(sites=["01491000", "01645000"])
with pytest.warns(DeprecationWarning):
df, md = get_qwdata(sites=["01491000", "01645000"])
assert type(df) is DataFrame
assert df.size == 1821472
assert_metadata(requests_mock, request_url, md, site, None, format)
Expand Down Expand Up @@ -279,17 +280,16 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format):
site_info, _ = md.site_info()
assert type(site_info) is DataFrame
if parameter_cd is None:
assert md.variable_info is None
assert md.variable_info is None
else:
for param in parameter_cd:
pcode_request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%25{}%25".format(param)
with open('data/waterdata_pmcodes.txt') as text:
requests_mock.get(pcode_request_url, text=text.read())
variable_info, _ = md.variable_info()
assert type(variable_info) is DataFrame

if format == "rdb":
assert md.comment is not None
else:
assert md.comment is None

0 comments on commit 4318d07

Please sign in to comment.