-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
199 lines (162 loc) · 5.73 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import numpy as np
from pathlib import Path
import yaml
class run_dataset:
    """
    Container that pairs run ids with the date on which each run was taken.

    Attributes
    ----------
    id : list, np.ndarray or scalar
        Run ids. Element ``i`` belongs to the date stored in ``date[i]``.
    date : list, np.ndarray or scalar
        Dates (format YYYYMMDD) aligned element-wise with ``id``.
    """

    def __init__(self, run_id=None, date=None, initialize_separate_lists=False):
        """
        Initialize the class run_dataset

        Parameters
        ----------
        run_id : list
        date : list
        initialize_separate_lists : bool
            If initialize_separate_lists is True:
            - run_id is a list of lists; each inner list holds the run ids
              taken on one date.
            - date is a single list with the dates. Entry ``i`` is the date
              of the runs in ``run_id[i]``.
            If initialize_separate_lists is False:
            - run_id is a single list with all the run ids.
            - date is a single list with the date of the run id at the same
              position.

        Example
        -------
        Use initialize_separate_lists as True

        >>> # dates of the runs
        >>> days = [20201120, 20210904, 20210905]
        >>> # list with the IDs of the runs for each day
        >>> runs1 = [2965, 2966]
        >>> runs2 = [6044]
        >>> runs3 = [6071, 6072]
        >>> # group all lists of run IDs in the same list
        >>> day_runs = [runs1, runs2, runs3]
        >>> # create the dataset
        >>> dataset = run_dataset(day_runs, days, True)

        Use initialize_separate_lists as False. Using the previous example

        >>> # produce two lists where the index gives the date and run ID
        >>> array_runs = []
        >>> for i, dayrun in enumerate(day_runs):
        ...     for run in dayrun:
        ...         array_runs.append([days[i], run])
        >>> array_runs = np.array(array_runs)
        >>> # fill the run_dataset class
        >>> dataset = run_dataset(array_runs[:, 1], array_runs[:, 0])
        """
        # None sentinels instead of mutable defaults: a shared default list
        # would leak state between instances created without arguments.
        run_id = [] if run_id is None else run_id
        date = [] if date is None else date

        if initialize_separate_lists:
            # Flatten the per-day grouping into aligned (date, run) pairs.
            pairs = [
                [date[i], run]
                for i, dayrun in enumerate(run_id)
                for run in dayrun
            ]
            if pairs:
                flat = np.array(pairs)
                run_id = flat[:, 1]
                date = flat[:, 0]
            else:
                # No runs at all: a 2-D column slice is impossible on an
                # empty array, so build the empty columns explicitly.
                run_id = np.array([], dtype=int)
                date = np.array([], dtype=int)

        self.id = run_id
        self.date = date

    def sort_runs_by_date(self, day):
        """
        Find the runs taken for a specific date.

        Parameters
        ----------
        day: int
            Date of interest. Format: YYYYMMDD.

        Returns
        -------
        list_run_id: np.array
            Run ids that were taken on the date "day". When the dataset
            stores a single scalar run id, that run is returned without
            comparing its date against "day".
        """
        # A scalar id means the dataset holds exactly one run.
        if np.ndim(self.id) == 0:
            return np.array([self.id])
        return np.array(
            [run for run, run_date in zip(self.id, self.date) if run_date == day]
        )

    def sort_date_by_run(self, run_id):
        """
        Find the date of a specific run id.

        Parameters
        ----------
        run_id: int
            Run id of interest.

        Returns
        -------
        date: int or None
            Date on which run_id was taken (first match), or None when the
            run id is not part of the dataset. When the dataset stores a
            single scalar date, that date is returned without checking the
            run id.
        """
        if np.ndim(self.date) == 0:
            return self.date
        for candidate, run_date in zip(self.id, self.date):
            if candidate == run_id:
                return run_date
        return None

    def number_of_days(self):
        """
        Obtain the distinct days present in the dataset.

        Returns
        -------
        days: np.array
            Sorted array of the unique dates (not a count; use ``len`` on
            the result to obtain the number of days).
        """
        # atleast_1d lets a scalar date and a sequence share one code path.
        return np.unique(np.atleast_1d(self.date))

    def number_of_runs(self):
        """
        Obtain the distinct run ids present in the dataset.

        Returns
        -------
        runs: np.array
            Sorted array of the unique run ids (not a count; use ``len`` on
            the result to obtain the number of runs).
        """
        return np.unique(np.atleast_1d(self.id))

    def read_dataset(self, file_name):
        """
        Read a dataset from a yaml file, replacing the current content.

        The file maps each date to the list of run ids taken on that date,
        i.e. the layout produced by ``write_dataset``.

        Parameters
        ----------
        file_name: str
            Path and file name of the yaml file with the dataset

        Raises
        ------
        FileNotFoundError
            If file_name does not exist.
        """
        if not Path(file_name).exists():
            raise FileNotFoundError(f"File {file_name} does not exist")

        with open(file_name, 'r') as file:
            data = yaml.safe_load(file)

        # safe_load returns None for an empty document; treat it as a
        # dataset without runs instead of failing on None.keys().
        if data is None:
            data = {}

        dates = list(data.keys())
        run_list = [data[key] for key in dates]

        # Re-initialize this object with the data read from the file.
        self.__init__(run_list, dates, True)

    def write_dataset(self, file_name):
        """
        Write to yaml file the dataset. The keys are the dates
        and the values in the keys are the run_ids for that
        date.

        Parameters
        ----------
        file_name: str
            Path and file name of the yaml file with the dataset
        """
        # int()/tolist() turn numpy scalars into plain Python values so the
        # yaml output holds ordinary integers rather than numpy object tags.
        dataset_dict = {
            int(day): self.sort_runs_by_date(day).tolist()
            for day in self.number_of_days()
        }
        with open(file_name, 'w') as file:
            yaml.dump(dataset_dict, file, indent=4, default_flow_style=False)