Commit

Fix update_dataframe
thodson-usgs committed Aug 28, 2024
1 parent 88c5d4c commit f17bc64
Showing 2 changed files with 11 additions and 11 deletions.
4 changes: 2 additions & 2 deletions demos/nwqn_data_pull/README.md
@@ -1,7 +1,7 @@
  # Retrieva data from the National Water Quality Network (NWQN)

- This examples walks through using lithops to retrieve data from every NWQN
- monitoring site, then writes the results to a parquet files on s3. Each
+ This example walks through using lithops to retrieve data from every NWQN
+ monitoring site, then writes the results to parquet files on s3. Each
  retrieval also searches the NLDI for neighboring sites with NWQN data and
  merges those data. In the streamflow example, the neighborhood search is
  used to progressively fill in gaps in the record by taking data from the
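
For orientation, the following is a minimal sketch of the pattern the README excerpt above describes: mapping a retrieval function over NWQN site IDs with lithops and writing each result to Parquet on S3. It is illustrative rather than the demo's actual code; the site list, bucket name, start date, and the `map_retrieval_sketch` helper are hypothetical, and the `nwis.get_record` call stands in for whatever retrieval the script performs.

```python
# Minimal sketch of the lithops fan-out described above (not the demo's exact code).
# Assumes the lithops, dataretrieval, s3fs, and pyarrow packages are installed and
# that a default lithops backend is configured; the bucket and sites are placeholders.
import lithops
from dataretrieval import nwis

SITES = ["01646500", "09380000"]  # hypothetical NWQN site IDs
BUCKET = "my-nwqn-bucket"         # hypothetical S3 bucket

def map_retrieval_sketch(site: str) -> str:
    """Pull daily streamflow for one site and write it to Parquet on S3."""
    df = nwis.get_record(sites=site, service="dv", parameterCd="00060", start="1991-01-01")
    path = f"s3://{BUCKET}/streamflow/{site}.parquet"
    df.to_parquet(path)  # pandas hands s3:// paths to s3fs
    return path

if __name__ == "__main__":
    fexec = lithops.FunctionExecutor()      # one serverless task per site
    fexec.map(map_retrieval_sketch, SITES)
    print(fexec.get_result())
```
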
18 changes: 9 additions & 9 deletions demos/nwqn_data_pull/retrieve_nwqn_streamflow.py
@@ -34,6 +34,9 @@ def map_retrieval(site):
          parameterCd="00060",
      )

+     # drop rows with missing values; neglect other 00060_* columns
+     df = df.dropna(subset=["00060_Mean"])
+
      # by default, site_no is not in the index if a single site is queried
      if "site_no" in df.columns:
          index_name = df.index.names[0]
@@ -65,7 +68,7 @@ def map_retrieval(site):
      df = pd.merge(df, site_info, left_index=True, right_index=True)
      df["00060_Mean"] *= site_info.loc[df.index.get_level_values("site_no"), "drain_fraction"].values

-     # order sites for filling in missing data
+     # order sites by the difference in drainage area fraction
      fill_order = site_info.sort_values("fraction_diff", ascending=True)
      fill_order = fill_order.index.values

@@ -74,7 +77,8 @@ def map_retrieval(site):

      output = pd.DataFrame()

-     # loop through sites, updating the best data first
+     # loop through sites and fill in missing flow values
+     # going from most to least-similar drainage areas.
      for fill_site in fill_order:
          fill_data = df.loc[fill_site]
          output = update_dataframe(output, fill_data)
@@ -100,18 +104,14 @@ def update_dataframe(
      overwrite: bool = False,
  ) -> pd.DataFrame:
      """Update a DataFrame with values from another DataFrame.
+     NOTE: this fuction does not handle MultiIndex DataFrames.
      """
      # Identify new rows in new_df that are not in original_df
      new_rows = new_df[~new_df.index.isin(original_df.index)]

-     # Concatenate new rows to original_df
-     original_df = pd.concat([original_df, new_rows]).drop_duplicates(keep="first")
-
-     # Sort the DataFrame by index
-     original_df.sort_index(inplace=True)
-
      # Update the original DataFrame with values from the new DataFrame
      original_df.update(new_df, overwrite=overwrite)
+     original_df = pd.concat([original_df, new_rows]).sort_index()

      return original_df

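
To make the fix concrete, here is a self-contained sketch of the corrected helper together with the fill loop it supports: values already present are kept, gaps are filled from the next-most-similar site, and genuinely new rows are appended. The helper mirrors the post-fix logic shown in the diff above; the sample dates, values, and the two-frame loop are made up for illustration.

```python
# Sketch of the fixed gap-filling pattern; sample data are invented.
import pandas as pd

def update_dataframe(original_df: pd.DataFrame,
                     new_df: pd.DataFrame,
                     overwrite: bool = False) -> pd.DataFrame:
    """Update a DataFrame with values from another DataFrame.
    NOTE: this function does not handle MultiIndex DataFrames.
    """
    # rows of new_df whose index is not yet present in original_df
    new_rows = new_df[~new_df.index.isin(original_df.index)]
    # with overwrite=False, DataFrame.update only fills NA values,
    # so earlier (more similar) sites take precedence
    original_df.update(new_df, overwrite=overwrite)
    # append the genuinely new rows and keep the index sorted
    return pd.concat([original_df, new_rows]).sort_index()

# Made-up daily flows: the neighbor fills the primary site's gap and missing day.
dates = pd.date_range("2024-01-01", periods=4, freq="D")
primary = pd.DataFrame({"00060_Mean": [1.0, None, 3.0]}, index=dates[:3])
neighbor = pd.DataFrame({"00060_Mean": [1.1, 2.2, 3.3, 4.4]}, index=dates)

output = pd.DataFrame()
for fill_data in (primary, neighbor):  # most- to least-similar drainage area
    output = update_dataframe(output, fill_data)

print(output)
# 2024-01-02 is filled with 2.2 and 2024-01-04 is appended from the neighbor;
# the primary site's existing values (1.0, 3.0) are left untouched.
```
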
