-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dataset.py
66 lines (50 loc) · 1.67 KB
/
Dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
from math import radians, sin, cos, atan2, sqrt
# Load the raw dataset. Malformed rows are skipped with a warning instead of
# aborting the load. ``on_bad_lines='warn'`` is the replacement for the former
# ``error_bad_lines=False`` (with its default ``warn_bad_lines=True``), which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(
    'PauPau.csv',
    delimiter=',',
    on_bad_lines='warn',
    encoding='ISO-8859-1',
)
# Sphere radius used by distance(); 6371.0 is the mean Earth radius in
# kilometres, so returned distances are in kilometres.
EARTH_RADIUS_KM = 6371.0

# Reference coordinates as (latitude, longitude) in decimal degrees, keyed by
# the selector that distance() receives as ``n``.
_REFERENCE_POINTS = {
    1: (41.548331, -8.421298),
    2: (41.551356, -8.420001),
    3: (41.546639, -8.433517),
}

# Reference used for every selector other than 1, 2 or 3.
_FALLBACK_REFERENCE = (41.508849, -8.462299)


def distance(p1, n):
    """Return the great-circle distance from ``p1`` to a fixed reference point.

    The distance is computed with the haversine formula on a sphere of radius
    ``EARTH_RADIUS_KM``.

    Args:
        p1: Pair ``(latitude, longitude)`` in decimal degrees.
        n: Selector for the reference point. Values equal to 1, 2 or 3 pick
            the matching hard-coded coordinates; any other value (including
            NaN or ``None``) selects the fallback reference.

    Returns:
        The distance as a float, in the same unit as ``EARTH_RADIUS_KM``.
        NaN coordinates yield NaN.
    """
    try:
        ref_lat, ref_lon = _REFERENCE_POINTS.get(n, _FALLBACK_REFERENCE)
    except TypeError:
        # An unhashable selector cannot equal 1, 2 or 3, so it gets the
        # fallback just like any other unmatched value.
        ref_lat, ref_lon = _FALLBACK_REFERENCE

    lat1, lon1 = radians(p1[0]), radians(p1[1])
    lat2, lon2 = radians(ref_lat), radians(ref_lon)

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Mathematically a <= 1, but rounding can push it marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Only the upper bound is clamped: NaN still propagates, and a negative
    # ``a`` (possible only with out-of-range latitudes) still raises.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return EARTH_RADIUS_KM * c
def _row_distance(row):
    """Return distance() for one DataFrame row.

    Reads the row's 'latitude', 'longitude' and 'road_num' entries and
    forwards them to distance().
    """
    position = (row['latitude'], row['longitude'])
    return distance(position, row['road_num'])


# One distance per row, stored in a new 'Distance' column.
df['Distance'] = df.apply(_row_distance, axis=1)

# Summary statistics before filtering; disabled by being kept inside a bare
# string literal.
'''print('max:', df['Distance'].max())
print('min:', df['Distance'].min())
print('mean:', df['Distance'].mean())
print('standard deviation:', df['Distance'].std())'''

# Row ids for a LARGE distance:
#df.index[df['Distance'] == 6313.251265773197].tolist()
#df.iloc[17807]
# Did the professor put in USA coordinates?
# Now we have to look for fake data!!! meh
# Find the rows whose distance exceeds the threshold
# and remove them.

print(len(df))
print('oi')

thresh = 1.5  # Braga is small
is_too_far = df['Distance'] > thresh
ind_dados_errados = df.index[is_too_far].tolist()
print(len(ind_dados_errados))
print('oi')

# Drop the flagged rows in place, then export what is left.
df.drop(index=ind_dados_errados, inplace=True)
print(len(df))
df.to_csv('Dataset1.5.csv', index=False)

# Summary statistics after filtering; disabled the same way as above.
'''print('max:', df['Distance'].max())
print('min:', df['Distance'].min())
print('mean:', df['Distance'].mean())
print('standard deviation:', df['Distance'].std())'''