-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanAndSort.py
138 lines (88 loc) · 3.74 KB
/
cleanAndSort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import csv, re
from os import name
import json
import subprocess
import sys
# GeoText comes from the third-party `geotext` package. When the package is
# missing, install it with pip (using the interpreter that is running this
# script) and retry the import.
try:
    from geotext import GeoText
except ModuleNotFoundError:
    print("Warning: geotext module not found!")
    pip_install_cmd = [sys.executable, '-m', 'pip', 'install', 'geotext']
    subprocess.check_call(pip_install_cmd)
    from geotext import GeoText
# Country lookup table, read once when the module is imported.
# NOTE(review): keys are presumably the country codes that GeoText reports in
# `country_mentions` (they are used that way in fixCountry) and values the
# country names - confirm against data/countries.json.
with open('data/countries.json', mode='r', encoding='utf-8') as json_file:
    country_dict = json.load(json_file)

# Every value of the lookup table, as a plain list.
country_list = [*country_dict.values()]
def write_to_csv(new_data, filename='data/clean_tweets.csv'):
    """Append a single row to a CSV file.

    Args:
        new_data: Iterable of field values that make up the row.
        filename: Path of the CSV file to append to. The file is created
            if it does not exist yet.

    The file is opened as UTF-8 explicitly so that non-ASCII fields are
    written the same way on every platform instead of depending on the
    locale's default encoding (the raw input is read as UTF-8 too).
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'a', newline='', encoding='utf-8') as outfile:
        csv.writer(outfile).writerow(new_data)
def fixCountry(mispell):
    """Resolve a free-text location string to a country name.

    The first letter of every whitespace-separated word is upper-cased
    before the text is handed to GeoText (presumably because GeoText only
    matches capitalised place names - confirm against the geotext docs).

    Args:
        mispell: Raw location text.

    Returns:
        The ``country_dict`` entry for the first country code listed in
        GeoText's ``country_mentions``. An empty string is returned when
        GeoText finds neither a city nor a country, or when the reported
        code has no entry in ``country_dict``.
    """
    capitalised = " ".join(word[0].upper() + word[1:] for word in mispell.split())
    places = GeoText(capitalised)

    if not places.cities and not places.countries:
        return ''

    # Only the first reported code is used; None stands in when the mapping
    # is unexpectedly empty so the lookup below falls through to ''.
    first_code = next(iter(places.country_mentions), None)

    # .get() instead of [] so one code missing from countries.json does not
    # abort the whole cleaning run with a KeyError.
    return country_dict.get(first_code, '')
# Patterns used by clean_with_regex, compiled once at import time.
_URL_RE = re.compile(r'\w+://\S+')             # scheme://rest-of-token
_EMAIL_RE = re.compile(r'\S*@\S*\s?')          # any token containing '@'
_USERNAME_RE = re.compile(r'@\S+')             # @mention
_NON_ALNUM_RE = re.compile(r'[^0-9A-Za-z \t]')  # anything but ASCII alnum/space/tab
_DIGITS_RE = re.compile(r'[0-9]+')


def clean_with_regex(sentence):
    """Reduce a tweet to plain ASCII words.

    Steps, in order:
      1. Commas become spaces.
      2. URLs, e-mail addresses and @usernames are removed as whole tokens.
      3. Every character that is not an ASCII letter, digit, space or tab
         is dropped (this also removes emoji and other non-ASCII symbols).
      4. Digit runs are removed.
      5. Whitespace is collapsed to single spaces and trimmed.

    URLs, e-mails and usernames must be removed *before* step 3: stripping
    punctuation first deletes the '@' and '://' markers, which would leave
    the handle / address text behind in the output.

    Args:
        sentence: The text to clean. Non-string values are converted with
            ``str()`` first.

    Returns:
        The cleaned text as a single-space-separated string (possibly empty).
    """
    text = str(sentence).replace(',', ' ')

    # Replace whole tokens with a space so neighbouring words never fuse.
    for token_pattern in (_URL_RE, _EMAIL_RE, _USERNAME_RE):
        text = token_pattern.sub(' ', text)

    text = _NON_ALNUM_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # Collapse the gaps left behind by the removals above.
    return ' '.join(text.split())
def merge_vaccine_names(vaccine_name):
    """Collapse known aliases of a vaccine onto one canonical label.

    Args:
        vaccine_name: Vaccine name to normalise.

    Returns:
        The canonical label when ``vaccine_name`` is one of the known
        aliases; otherwise ``vaccine_name`` itself, unchanged.
    """
    # (aliases, canonical label) pairs, checked in order.
    alias_groups = (
        (('Pfizer Vaccine', 'BioNTech', 'Comirnaty Vaccine'), 'Pfizer BioNTech'),
        (('AstraZeneca', 'Covishield'), 'AstraZeneca'),
        (('Johnson & Johnson vaccine', 'Vaccine Janssen'), 'Johnson & Johnson'),
        (('Sinovac vaccine', 'CoronaVac'), 'CoronaVac'),
        (('BBIBP-CorV', 'Sinopharm vaccine'), 'BBIBP-CorV'),
        (('PakVac', 'Ad5-nCoV', 'Convidicea'), 'Convidicea'),
        (('Covaxin Vaccine', 'Bharat Biotech Vaccine'), 'Covaxin'),
    )
    for aliases, canonical in alias_groups:
        if vaccine_name in aliases:
            return canonical
    return vaccine_name
def cleanAndSort(filtr):
    """Clean the rows of ``data/raw_tweets.csv`` and append them to the output CSV.

    For every row whose second column is longer than ``filtr`` characters:
      * column 0 is passed through ``merge_vaccine_names``,
      * column 1 is passed through ``clean_with_regex``,
      * column 2 is passed through ``fixCountry``,
    and the resulting row (any further columns unchanged) is appended via
    ``write_to_csv``. Despite the name, rows are written in input order;
    no sorting is performed here.

    Args:
        filtr: Minimum length threshold; a row is kept only when the raw
            text in column 1 has strictly more characters than this.
    """
    # newline='' is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open('data/raw_tweets.csv', 'r', encoding='utf-8', newline='') as file_object:
        for row in csv.reader(file_object):
            # csv.reader yields [] for blank lines; those and any other row
            # with fewer than three columns cannot be processed, so skip
            # them instead of failing with IndexError.
            if len(row) < 3:
                continue
            if len(row[1]) <= filtr:
                continue

            new_row = row[:]
            new_row[0] = merge_vaccine_names(row[0])
            new_row[1] = clean_with_regex(row[1])
            new_row[2] = fixCountry(row[2])
            write_to_csv(new_row)
if __name__ == '__main__':
    # Ask for the minimum raw-tweet length. The conversion happens inside
    # the try so that non-numeric input falls back to the default instead
    # of terminating the script with a ValueError.
    default_min_length = 25
    try:
        filtr = int(input("Input minimum length filter: \t"))
    except ValueError:
        filtr = default_min_length
        print('using default minimum length: {}'.format(filtr))
    cleanAndSort(filtr)