#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""This module reads in S5P/TROPOMI granules for given dates and regrids
from the S5P time-varying grid to a fixed grid. Regridded data are saved for
every day, if there are granules available for that given day. If there are >1
granules, a daily average file is formed.
Todo:
* Figure out time vs time_utc
* Include netCDF4 attributes/description in output file
* Output file name should state resolution, QA COMPLETE - 5 July 2020
* Convert to molec cm-2? (x 6.02214e+19) COMPLETE - 5 July 2020
* Output fill value instead of NaN COMPLETE - 15 July 2020
* Daily average for all granuales in output file COMPLETE - 15 July 2020
"""
DDIR = '/GWSPH/groups/anenberggrp/ghkerr/data/TROPOMI/newyork/'
DDIR_DATA = DDIR+'raw/'
DDIR_OUT = DDIR+'regridded/'
def load_tropomi(date, fstr, var):
"""Read S5P/TROPOMI granules for a given range of dates and extract
variable of interest along with QA and dimensional information (i.e.,
time, latitude, and longitude)
Parameters
----------
date : str
Day of interest (%Y-%m-%d format)
    fstr : str
        The prefix for the S5P/TROPOMI granules that specifies version (Near
        Real-Time (NRTI) or Offline (OFFL)), dataset, and level
var : str
Variable of interest that will be extracted from S5P/TROPOMI granules
Returns
-------
outdict : dict
Dictionary containing variable(s) of interest, QA, and dimensional
information
"""
import numpy as np
import netCDF4 as nc
from datetime import datetime
import os
    # Change date to convention of TROPOMI files (i.e., YYYYMMDD)
    date = datetime.strptime(date, '%Y-%m-%d').strftime('%Y%m%d')
# Find all TROPOMI files for day of interest (must remove fstr, which
# specifies product/level/gas)
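    # (e.g., with fstr = 'S5P_OFFL_L2__NO2____', an illustrative granule name
    # like S5P_OFFL_L2__NO2____20190401T165959_<...>.nc would match the date
    # 20190401; the timestamp here is hypothetical, not a real granule)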
    files = [fn for fn in os.listdir(DDIR_DATA) if
        fn.replace(fstr, '').startswith(date)]
# Variables of interest will be extracted and stored in a dictionary
outdict = {key:[] for key in ['qa_value', 'latitude', 'longitude',
'time', var]}
for file in files:
infile = nc.Dataset(DDIR_DATA+file,'r')
# Grid changes with each granule, so this needs to be saved off at
# every iteration
lat = infile.groups['PRODUCT'].variables['latitude'][0].data
# n.b., longitude is in units of -180 to 180 deg
lng = infile.groups['PRODUCT'].variables['longitude'][0].data
# Extract time; units are seconds since 2010-01-01
time = infile.groups['PRODUCT'].variables['time'][:]
time_unit = infile.groups['PRODUCT'].variables['time'].units
try:
time_cal = infile.groups['PRODUCT'].variables['time'].calendar
except AttributeError: # Attribute doesn't exist
time_cal = u'gregorian' # Or standard
time = nc.num2date(time, units=time_unit, calendar=time_cal)[0]
qa_value = infile.groups['PRODUCT'].variables['qa_value'][0].data
# Extract variables of interest and append to dictionary containing
# dimensional information
vararr = infile.groups['PRODUCT'].variables[var][0].data
fill_value = infile.groups['PRODUCT'].variables[var][0].fill_value
vararr[vararr==fill_value] = np.nan
outdict[var].append(vararr)
outdict['latitude'].append(lat)
outdict['longitude'].append(lng)
outdict['time'].append(time)
outdict['qa_value'].append(qa_value)
return outdict
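
# A minimal usage sketch (hypothetical date; assumes matching granules exist
# in DDIR_DATA):
# tropomi = load_tropomi('2019-04-01', 'S5P_OFFL_L2__NO2____',
#     'nitrogendioxide_tropospheric_column')
# len(tropomi['time'])  # number of granules found for that day
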
def apply_qa_value(tropomi, var, threshold):
    """Search data quality values (which vary between 0 (no data) and 1
    (full quality data)) for occurrences under the threshold of interest.
    TROPOMI retrievals at indices under the threshold are set to NaN
Parameters
----------
tropomi : dict
Dictionary containing TROPOMI/S5P variable(s) of interest, QA, and
dimensional information
var : str
Variable of interest that will be extracted from S5P/TROPOMI granules
    threshold : float
        Data quality threshold below which data will be screened
        (recommended to ignore data with QA values < 0.5)
Returns
-------
tropomi : dict
Dictionary containing TROPOMI/S5P variable(s) of interest, QA, and
dimensional information screened for non-quality data
"""
import numpy as np
# Loop through variables/time of interest
tmp = tropomi[var]
    for g in range(len(tropomi['time'])):
tmp_g = tmp[g]
qa_value_g = tropomi['qa_value'][g]
# Set as NaN where QA array is less than threshold
underthresh = np.where(qa_value_g < threshold)
tmp_g[underthresh] = np.nan
# Replace in dictionary
tropomi[var][g] = tmp_g
return tropomi
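
# A minimal sketch of the screening behavior (synthetic arrays, not real
# granules):
# import numpy as np
# tropomi = {'time': [None], 'qa_value': [np.array([0.9, 0.3])],
#     'no2': [np.array([1., 2.])]}
# apply_qa_value(tropomi, 'no2', 0.75)['no2'][0]  # -> array([1., nan])
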
def regrid(tropomi, var, crop, lonss, latss, nchunk):
    """Regrid TROPOMI data to a consistent rectilinear grid separated by a
    constant spacing
Parameters
----------
tropomi : dict
Dictionary containing TROPOMI/S5P variable(s) of interest, QA, and
dimensional information
var : str
TROPOMI/S5P variable name (should be a key in variable "tropomi")
crop : list
Bounds (x0,x1,y0,y1) roughly corresponding to TROPOMI focus region
lonss : float
Longitude step size, i.e. grid resolution, units of degrees
latss : float
Latitude step size, i.e. grid resolution, units of degrees
    nchunk : int
        Number of chunks that the time-varying lat/lon grid for the satellite
        retrievals will be split into. Note that this function splits the
        lat/lon grid by the y dimension since the satellite swaths are
        predominantly N/S (if split on the x dimension, there would be
        instances with no retrievals in a chunk and xESMF would throw an
        error).
Returns
-------
interpolated : numpy.ndarray
Interpolated TROPOMI retrievals, [time, lat, lon]
lat_out : numpy.ndarray
Rectilinear latitude spine with grid spacing given by variable
"latss," [lat,]
lon_out : numpy.ndarray
Rectilinear longitude spine with grid spacing given by variable
"lonss," [lon,]
"""
from timeit import default_timer as timer
from datetime import timedelta
start = timer()
import numpy as np
import xarray as xr
import xesmf as xe
import cartopy.crs as ccrs
interpolated = []
def chunk_it(length, num):
"""For a given dimension size, get the start/stop indices of quasi equal-
sized chunks. Adapted from stackoverflow.com/questions/2130016/splitting-
a-list-into-n-parts-of-approximately-equal-length
"""
import numpy as np
avg = length / float(num)
seq = np.arange(0,length,1)
out = []
last = 0.0
while last < length:
out.append(seq[int(last):int(last + avg)])
last += avg
return [(x[0], x[-1]+1) for x in out]
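    # (for instance, chunk_it(10, 3) returns [(0, 3), (3, 6), (6, 10)])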
    # Construct 2D rectilinear grid, cropped over region of interest so there
    # are no issues with buffers and to avoid creating large ESMF grid objects
    # (see https://github.com/JiaweiZhuang/xESMF/issues/29)
ds_out = xe.util.grid_2d(crop[0], crop[1], lonss, crop[2], crop[3], latss)
ds_out = ds_out.drop(labels=['lon_b','lat_b'])
# Extract latitude/longitude spines
lat_out = np.nanmean(ds_out.lat.data, axis=1)
lon_out = np.nanmean(ds_out.lon.data, axis=0)
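    # (e.g., the New York crop below, 2 deg x 1.2 deg at 0.01 deg resolution,
    # yields spines of roughly 200 and 120 points, respectively)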
# Loop through files (each one representing a specific granule; see
# https://sentinel.esa.int/documents/247904/2474726/Sentinel-5P-Level-2-
# Product-User-Manual-NPP-Cloud-product)
    for g in range(len(tropomi['time'])):
print('Granule %d...'%g)
var_g = tropomi[var][g]
lat_g = tropomi['latitude'][g]
lng_g = tropomi['longitude'][g]
# List to fill with interpolated values for particular granule "g"
interpolated_g = []
# Make xarray dataset for variable/time of interest
ds = xr.Dataset({var: (['x', 'y'], var_g)},
coords={'lon': (['x', 'y'], lng_g), 'lat': (['x', 'y'], lat_g)})
# Get rid of points outside domain/crop box
ds = ds.where((ds['lat']>=crop[2]) &
(ds['lat']<=crop[3]) & (ds['lon']>=crop[0]) &
(ds['lon']<=crop[1]), drop=True)
# Chunk up TROPOMI data so xESMF doesn't crash
for chunk in chunk_it(ds_out.y.shape[0], nchunk):
ds_out_chunk = ds_out.isel(y=slice(chunk[0],chunk[-1]))
dr = ds[var]
            # Only perform regridding if there are points in the chunk to
            # regrid; there are issues when chunked sections of the high-
            # resolution grid don't overlap with any TROPOMI retrievals
            # (see https://github.com/JiaweiZhuang/xESMF/issues/36), i.e.,
            # when the shape of the retrieval within the chunk is (0, 0).
            # In that case, fill the output interpolated array with NaNs
try:
                # Note that xESMF thinks that some grid cells (maybe near the
                # edges of the domain) are triangles instead of quadrilaterals
                # (i.e., degenerate); see
                # https://github.com/JiaweiZhuang/xESMF/issues/60
regridder = xe.Regridder(ds, ds_out_chunk, 'bilinear',
ignore_degenerate=True)
dr_out = regridder(dr)
interpolated_g.append(dr_out.data)
regridder.clean_weight_file() # Clean-up
except ValueError:
print('ValueError encountered!')
dr_out = np.empty(shape=(ds_out_chunk.y.shape[0],
ds_out_chunk.x.shape[0]))
dr_out[:] = np.nan
interpolated_g.append(dr_out)
        # There appear to be issues with cells that are outside the old
        # grid's domain (see https://github.com/JiaweiZhuang/xESMF/issues/15):
        # they are set to 0.0 when regridding rather than NaN. Force any
        # interpolated grid cell that is identically equal to 0.0 to NaN
interpolated_g = np.vstack(interpolated_g)
interpolated_g[interpolated_g==0.0] = np.nan
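        # (n.b., this also masks any retrieval that is genuinely, identically
        # 0.0, which should be rare for a continuous column quantity)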
# # # # Optional: check regridding
# import matplotlib.pyplot as plt
# fig = plt.figure(figsize=(8,4))
# ax1 = plt.subplot2grid((2,2),(0,0),colspan=2,
# projection=ccrs.PlateCarree())
# ax2 = plt.subplot2grid((2,2),(1,0),colspan=2,
# projection=ccrs.PlateCarree())
        # mb1 = ax1.pcolormesh(lng_g, lat_g, var_g, cmap=plt.get_cmap(
        #     'gist_earth'), vmin=np.nanpercentile(interpolated_g, 20),
        #     vmax=np.nanpercentile(interpolated_g, 80))
        # ax1.coastlines()
        # plt.colorbar(mb1, ax=ax1)
        # mb2 = ax2.pcolormesh(lon_out, lat_out, interpolated_g,
        #     cmap=plt.get_cmap('gist_earth'), vmin=
        #     np.nanpercentile(interpolated_g, 20),
        #     vmax=np.nanpercentile(interpolated_g, 80))
# plt.colorbar(mb2, ax=ax2, extend='both')
# ax2.coastlines()
# for ax in [ax1, ax2]:
# ax.set_extent(crop)
# plt.show()
# # # #
# Append interpolated grid to multi-granule list
interpolated.append(interpolated_g)
interpolated = np.stack(interpolated)
end = timer()
print('Interpolated in...', timedelta(seconds=end-start))
return interpolated, lat_out, lon_out
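
# A minimal usage sketch (assumes "tropomi" comes from load_tropomi and
# apply_qa_value above; the crop bounds and chunk count are illustrative):
# interpolated, lat, lon = regrid(tropomi,
#     'nitrogendioxide_tropospheric_column', [-75., -73., 40.2, 41.4],
#     0.01, 0.01, 2)
# interpolated.shape  # -> (n_granules, len(lat), len(lon))
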
def regrid_tropomi(fstr, var, varname, startdate, enddate, crop, region, lonss,
    latss, nchunk, qa=0.75):
    """Read S5P/TROPOMI L2 OFFL NO2, conduct quality assurance on the data,
    and interpolate to a standard rectilinear grid. The daily average of all
    interpolated granules and the date on which they were retrieved are saved
    in the output file.
Parameters
----------
fstr : str
TROPOMI data file descriptor (e.g., S5P_OFFL_L2__NO2____); should have
len = 20. See http://www.tropomi.eu/data-products/level-2-products
for more information
var : str
Variable of interest from TROPOMI data (e.g.,
nitrogendioxide_tropospheric_column)
varname : str
Variable name (e.g., NO2) for output file
startdate : str
Start date of time period of interest
enddate : str
End date of time period of interest
crop : list
Bounds (x0,x1,y0,y1) roughly corresponding to TROPOMI focus region
region : str
Description of focus region (contained in the bounding box given in
"crop") that is used in the output file name; e.g. conus, azores
lonss : float
Longitude step size, i.e. grid resolution, units of degrees
latss : float
Latitude step size, i.e. grid resolution, units of degrees
nchunk : int
Number of chunks to split fine resolution grid into for regridding
(if the resolution of output grid is, say, 1 deg, then this could be
1)
    qa : float, optional
        Data quality threshold below which data will be screened
        (recommended to ignore data with QA values < 0.5)
Returns
-------
None
"""
import calendar
import numpy as np
import netCDF4 as nc
from datetime import datetime
import pandas as pd
for date in pd.date_range(startdate, enddate):
date = date.strftime('%Y-%m-%d')
print(date)
# Load data for selected day
tropomi = load_tropomi(date, fstr, var)
# Filter out bad/erroneous data
tropomi = apply_qa_value(tropomi, var, qa)
# Interpolate to rectilinear grid
try:
interpolated, lat, lon = regrid(tropomi, var, crop, lonss, latss, nchunk)
            # Create output file whose name specifies the constituent,
            # region, date, grid resolution, and QA threshold
root_grp = nc.Dataset(DDIR_OUT+
'S5P_%s_%s_%s_%.2fgrid_QA%d.nc'%(varname, region,
datetime.strftime(datetime.strptime(date,'%Y-%m-%d'),'%Y%m%d'),
latss, int(qa*100)), 'w', format='NETCDF4')
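            # (with the New York settings below, this yields a name like
            # S5P_NO2_newyork_20200101_0.01grid_QA75.nc)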
root_grp.title = u'TROPOMI/S5P %s '%(var)
root_grp.history = 'created %d %s %d by Gaige Kerr ([email protected])'\
%(datetime.today().day, calendar.month_abbr[datetime.today().month],
datetime.today().year)
root_grp.description = 'Data are for %s '%(date)+\
'and are filtered such that only qa_value > %.2f are included. '%(qa)+\
'Data regridded to %.2f deg longitude x %.2f deg latitude grid'\
%(lonss, latss)
# Dimensions
root_grp.createDimension('time', None)
root_grp.createDimension('longitude', len(lon))
root_grp.createDimension('latitude', len(lat))
# Variables
# Longitude
var_lon = root_grp.createVariable('longitude', 'f4', ('longitude',))
var_lon[:] = lon
var_lon.long_name = 'longitude'
var_lon.units = 'degrees east'
# Latitude
var_lat = root_grp.createVariable('latitude', 'f4', ('latitude',))
var_lat[:] = lat
var_lat.long_name = 'latitude'
var_lat.units = 'degrees north'
# TROPOMI extract
var_out = root_grp.createVariable(var,
'f8', ('time', 'latitude', 'longitude',),
fill_value=nc.default_fillvals['f8'])
# Calculate daily average, convert to molec/cm2, and replace
# NaNs with fill value
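            # (TROPOMI columns are reported in mol m-2; multiplying by
            # Avogadro's number, 6.02214e23 molec mol-1, and dividing by
            # 1e4 cm2 m-2 gives molec cm-2, hence the factor of 6.02214e19)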
interpolated = np.nanmean(interpolated, axis=0)
interpolated = interpolated*6.02214e+19
            interpolated[np.isnan(interpolated)] = nc.default_fillvals['f8']
var_out[:] = interpolated.reshape(1,
interpolated.shape[0], interpolated.shape[1])
var_out.long_name = var
var_out.units = 'molecules_percm2'
# Time
var_t = root_grp.createVariable('time', 'int32', ('time',))
var_t.setncattr('units', 'seconds since 2010-01-01 00:00:00 UTC')
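            # n.b., only the reference time of the first granule of the day
            # is written, even when the daily mean spans several granules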
ntime = nc.date2num(tropomi['time'][0], var_t.units)
var_t[:] = ntime
# Closing
root_grp.close()
        # In case there are no granules for day/region of interest
except ValueError:
pass
return
# # CONUS
# fstr = 'S5P_OFFL_L2__NO2____'
# var = 'nitrogendioxide_tropospheric_column'
# varname = 'NO2'
# startdate = '2019-04-01'
# enddate = '2019-04-30'
# crop = [-124.75, -66.75, 24.5, 49.5]
# lonss = 0.01
# latss = 0.01
# nchunk = 3
# region = 'conus'
# regrid_tropomi(fstr, var, varname, startdate, enddate, crop, region, lonss,
# latss, nchunk)
# # Milan
# fstr = 'S5P_OFFL_L2__NO2____'
# var = 'nitrogendioxide_tropospheric_column'
# varname = 'NO2'
# startdate = '2020-01-01'
# enddate = '2020-05-31'
# crop = [7.8, 10.8, 43.8, 46.6]
# lonss = 0.01
# latss = 0.01
# nchunk = 2
# region = 'milan'
# regrid_tropomi(fstr, var, varname, startdate, enddate, crop, region, lonss,
# latss, nchunk)
# New York
fstr = 'S5P_OFFL_L2__NO2____'
var = 'nitrogendioxide_tropospheric_column'
varname = 'NO2'
startdate = '2020-01-01'
enddate = '2020-05-31'
crop = [-75., -73., 40.2, 41.4]
lonss = 0.01
latss = 0.01
nchunk = 2
region = 'newyork'
regrid_tropomi(fstr, var, varname, startdate, enddate, crop, region, lonss,
latss, nchunk)