-
Notifications
You must be signed in to change notification settings - Fork 0
/
retdata.py
56 lines (52 loc) · 2.21 KB
/
retdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import json
import helpers
def retriveword2vecdata(game='pubg'):
    """Read ``<game>-dataset.txt`` and return its sentences as token lists.

    Each line of the file is split on ``'.'`` into sentence fragments, and
    each fragment is split on whitespace into tokens. Fragments that are
    empty or consist only of whitespace are dropped.

    Args:
        game: Prefix of the dataset file; ``'-dataset.txt'`` is appended to
            it and the result is opened relative to the current working
            directory (unless the prefix itself is an absolute path).

    Returns:
        A list of sentences, each a non-empty list of string tokens, in
        file order.

    Raises:
        OSError: If the dataset file cannot be opened.
    """
    sentences = []
    # TODO: replace with bucket storage.
    with open(game + '-dataset.txt', 'r') as f:
        for line in f:
            # Drop the line terminator, then break the line into sentences.
            for fragment in line.strip('\n').split('.'):
                tokens = fragment.split()
                # split() yields [] for empty and whitespace-only fragments,
                # so one truthiness test covers both cases.
                if tokens:
                    sentences.append(tokens)
    return sentences
def retriveTMdata(start, end):
    """Collect cleaned documents from the ``Dataset/<t>.json`` files in a range.

    Beginning at ``start + 1`` and advancing in steps of 86400 (one day in
    seconds; presumably the arguments are epoch timestamps -- confirm with
    callers), every value ``t`` strictly below ``end`` is mapped to the file
    ``Dataset/<t>.json``. Each file is expected to hold a JSON list of
    documents with the keys ``id``, ``title``, ``selftext``, ``gencomments``
    and ``classifierresults``.

    Documents whose ``gencomments`` contain the text ``"im a bot"`` are
    skipped. For every other document the title, selftext and comments are
    joined with ``' . '``, passed through ``helpers.clean`` and lowercased.

    Loading is best effort: a missing file is skipped silently, and any
    other error while reading or processing a file abandons the rest of
    that file (documents already collected from it are kept) and logs a
    warning.

    Args:
        start: Lower bound of the range; the first file looked up is
            ``start + 1``.
        end: Exclusive upper bound of the range.

    Returns:
        A dict with four index-aligned lists:
        ``'keys'`` (document ids), ``'docs'`` (cleaned lowercase text),
        ``'classify'`` (``classifierresults`` values) and ``'dicts'`` (one
        record per document with the keys ``key``, ``content``, ``class``,
        ``title``, ``selftext``, ``comments``, ``uncleaned`` and ``time``).
    """
    import logging

    logger = logging.getLogger(__name__)
    one_day = 86400

    keys = []
    plain_text = []
    classifications = []
    dicts = []

    stamp = start + 1
    while stamp < end:
        # TODO: replace with bucket storage.
        # NOTE(review): this path was previously built from
        # os.path.dirname('__file__') -- a string literal, whose dirname is
        # '' -- so the lookup has always been relative to the current working
        # directory. That behavior is kept; confirm whether module-relative
        # lookup was intended.
        dataset_path = os.path.join("Dataset", str(stamp) + ".json")
        try:
            with open(dataset_path) as f:
                documents = json.load(f)
            for document in documents:
                comments = document['gencomments']
                if "im a bot" in comments:
                    continue
                title = document['title']
                selftext = document['selftext']
                uncleaned = title + ' . ' + selftext + ' . ' + comments
                content = str.lower(helpers.clean(uncleaned))
                doc_id = document['id']
                label = document['classifierresults']
                # Every fallible lookup is done above, so the four parallel
                # lists below can never end up with different lengths.
                plain_text.append(content)
                keys.append(doc_id)
                classifications.append(label)
                dicts.append({
                    'key': doc_id,
                    'content': content,
                    'class': label,
                    'title': title.lower(),
                    'selftext': selftext.lower(),
                    'comments': comments.lower(),
                    'uncleaned': uncleaned.lower(),
                    'time': stamp,
                })
        except FileNotFoundError:
            # No dump exists for this step; nothing to load.
            pass
        except Exception:
            # Stay best effort, but surface the failure and let
            # KeyboardInterrupt / SystemExit propagate.
            logger.warning("Skipping rest of %s", dataset_path, exc_info=True)
        stamp += one_day

    return {'keys': keys, 'docs': plain_text, 'classify': classifications, 'dicts': dicts}