WDQMS fix for obsproc/v1.2

The ObsProc upgraded from version 1.1 to version 1.2, in which the high-resolution radiosonde observations are included. The radiosonde data from ObsProcv1.2 stored in GSI diagnostic files, in some cases, have multiple observations in the same location/time, and these observations often contain bad data (e.g., temperature > 500K or temperature < 50K). These duplicate observations and bad data caused WDQMS processing to fail. This PR improves the handling of these obscure cases. Refs #2389
NOAA-EMC · May 29, 2024 · 4573f80 · 4573f80
1 parent 54cc0aa
commit 4573f80
Showing 1 changed file with 53 additions and 42 deletions.
diff --git a/ush/wdqms.py b/ush/wdqms.py
@@ -69,8 +69,18 @@ def __init__(self, inputfiles, wdqms_type, outdir,
         # Drop duplicates
         df_total = df_total.drop_duplicates()
 
+        # Create temporary dataframe of only temperature and q values
+        tq_condition = [self.wdqms_type_dict[self.wdqms_type]['variable_ids']['t'],
+                        self.wdqms_type_dict[self.wdqms_type]['variable_ids']['q']]
+
+        tq_df = df_total.loc[df_total['var_id'].isin(tq_condition)]
+        no_tq_df = df_total.loc[~df_total['var_id'].isin(tq_condition)]
+
         # Adjust relative humidity data
-        df_total = self._genqsat(df_total)
+        tq_df = self._genqsat(tq_df)
+
+        # Merge the non t and q values back into returned t and q dataframe
+        df_total = pd.concat([tq_df, no_tq_df])
 
         # Add Status Flag column
         df_total = self._create_status_flag(df_total)
@@ -118,6 +128,10 @@ def _wdqms_type_requirements(self, df):
             df.loc[(df['var_id'] != 110) & (df['Obs_Minus_Forecast_adjusted'].abs() > 500),
                    'Obs_Minus_Forecast_adjusted'] = -999.9999
 
+        if self.wdqms_type in ['TEMP']:
+            # Only include assimilated data as per WDQMS requirement document
+            df = df.loc[df['Analysis_Use_Flag'] == 1]
+
         logging.debug(f"Total observations for {self.wdqms_type} after filter: {len(df)}")
         logging.info("Exiting wdqms_type_requirements()")
 
@@ -448,7 +462,7 @@ def _genqsat(self, df):
                    temperature background error from GSI diagnostic file
 
         Args:
-            df : (df) pandas dataframe populated with data from GSI
+            df : (df) pandas dataframe populated with t and q data from GSI
                  diagnostic files
             wdqms_type : (str) wdqms type file being created
         Returns:
@@ -458,56 +472,53 @@ def _genqsat(self, df):
         logging.info("Working in genqstat()")
 
         # Get variable type specific to WDQMS type
-        q = self.wdqms_type_dict[self.wdqms_type]['variable_ids']['q']
-        t = self.wdqms_type_dict[self.wdqms_type]['variable_ids']['t']
-
-        # Create two dataframes, one for q vales and one for t values
-        q_df = df.loc[(df['var_id'] == q)]
-        t_df = df.loc[(df['var_id'] == t)]
-
-        # Find where stations are the same
-        stn_ids = np.intersect1d(t_df.Station_ID, q_df.Station_ID)
+        q_id = self.wdqms_type_dict[self.wdqms_type]['variable_ids']['q']
+        t_id = self.wdqms_type_dict[self.wdqms_type]['variable_ids']['t']
+
+        df_list = []
 
-        # loop through stations, calculate saturation specific humidity,
-        # and replace background departure values from NetCDF file with
-        # new ones calculated using the temperature obs and best temperature guess
-        for stn in stn_ids:
-            logging.debug(f"Station ID: {stn}")
+        # Groupby Station_ID
+        for stn, stn_df in df.groupby('Station_ID'):
 
-            t_tmp = t_df.loc[(t_df['Station_ID'] == stn)]
-            q_tmp = q_df.loc[(q_df['Station_ID'] == stn)]
+            # Filter the dataframes
+            q_df = stn_df[stn_df['var_id'] == q_id]
+            t_df = stn_df[stn_df['var_id'] == t_id]
 
-            columns_to_extract = ['Latitude', 'Longitude', 'Pressure', 'Time', 'Observation', 'Obs_Minus_Forecast_adjusted']
-            columns_to_compare = ['Latitude', 'Longitude', 'Pressure', 'Time']
+            # Merge dataframes on common keys using an inner join
+            merged_df = pd.merge(q_df, t_df, on=['Station_ID', 'Latitude', 'Longitude', 'Pressure', 'Time'], suffixes=('_q', '_t'), how='inner')
 
-            t_tmp = t_tmp[columns_to_extract]
-            q_tmp = q_tmp[columns_to_extract]
+            # Calculate needed values
+            q_obs = merged_df['Observation_q'].to_numpy() * 1.0e6
+            q_ges = (merged_df['Observation_q'].to_numpy() - merged_df['Obs_Minus_Forecast_adjusted_q'].to_numpy()) * 1.0e6
+            t_obs = merged_df['Observation_t'].to_numpy() - 273.16
+            t_ges = (merged_df['Observation_t'].to_numpy() - merged_df['Obs_Minus_Forecast_adjusted_t'].to_numpy()) - 273.16
+            pressure = merged_df['Pressure'].to_numpy()
 
-            t_tmp = pd.merge(t_tmp, q_tmp, on=columns_to_compare, suffixes=('','_q'), how='inner')
-            t_tmp = t_tmp[columns_to_extract].drop_duplicates()
+            qsat_obs = self._temp_2_saturation_specific_humidity(pressure, t_obs)
+            qsat_ges = self._temp_2_saturation_specific_humidity(pressure, t_ges)
 
-            q_obs = q_tmp['Observation'].to_numpy() * 1.0e6
-            q_ges = (q_tmp['Observation'].to_numpy() -
-                     q_tmp['Obs_Minus_Forecast_adjusted'].to_numpy()) * 1.0e6
-            t_obs = t_tmp['Observation'].to_numpy() - 273.16
-            t_ges = (t_tmp['Observation'].to_numpy() -
-                     t_tmp['Obs_Minus_Forecast_adjusted'].to_numpy()) - 273.16
-            pressure = q_tmp['Pressure'].to_numpy()
+            # Calculate background departure
+            bg_dep = (q_obs / qsat_obs) - (q_ges / qsat_ges)
 
-            qsat_obs = self._temp_2_saturation_specific_humidity(
-                pressure, t_obs)
-            qsat_ges = self._temp_2_saturation_specific_humidity(
-                pressure, t_ges)
+            # Grab conditions from merged_df
+            station_ids = merged_df['Station_ID']
+            pressure_vals = merged_df['Pressure']
+            time_vals = merged_df['Time']
+            latitude = merged_df['Latitude']
+            longitude = merged_df['Longitude']
+            conditions = (q_df['Station_ID'].isin(station_ids)) & \
+                         (q_df['Pressure'].isin(pressure_vals)) & \
+                         (q_df['Time'].isin(time_vals)) & \
+                         (q_df['Latitude'].isin(latitude)) & \
+                         (q_df['Longitude'].isin(longitude))
 
-            bg_dep = (q_obs / qsat_obs) - (q_ges / qsat_ges)
+            # Update the background departure values for q_df
+            q_df = q_df.loc[conditions]
+            q_df['Obs_Minus_Forecast_adjusted'] = bg_dep
 
-            logging.debug(f"Original q background departure: {q_tmp['Obs_Minus_Forecast_adjusted'].values}")
-            logging.debug(f"New q background departure: {bg_dep}")
+            df_list.append(pd.concat([t_df, q_df]))
 
-            # Replace the current background departure with the new calculated one
-            df.loc[(df['var_id'] == self.wdqms_type_dict[self.wdqms_type]['variable_ids']['q']) &
-                   (df['Station_ID'] == stn),
-                   'Obs_Minus_Forecast_adjusted'] = bg_dep
+        df = pd.concat(df_list)
 
         logging.info("Exiting genqstat()")