-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataReader.py
244 lines (216 loc) · 9.3 KB
/
DataReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import networkx as nx
from DgraphRecommendation import Person, DgraphInterface
from DgraphRecommendation.DataLoader import download_stored_nodes
from DgraphRecommendation.Feature import Feature
import easygui
from os import listdir
from os import path
import os
from tqdm import tqdm
import json
class DataReader:
    """
    Helper class that reads ego-network data files (facebook or twitter
    layout) into ``Person`` objects and exports them as .rdf files.

    Attributes:
        persons: every person seen so far, keyed by the id string.
        all_features: feature names seen so far, in first-seen order.
        iteration_persons: persons touched while processing the files of the
            ego id currently being read; reset by ``clearIteration``.
    """

    def __init__(self):
        self.persons = dict()
        self.all_features = []
        self.iteration_persons = dict()

    @staticmethod
    def _split_fields(line):
        """Return the whitespace-separated tokens of ``line`` ([] if blank)."""
        return line.split()

    @staticmethod
    def _is_nonempty_file(filepath):
        """Return True if ``filepath`` is an existing file with content."""
        return path.isfile(filepath) and path.getsize(filepath) > 0

    def getPerson(self, id) -> "Person":
        """
        Return the person with the given id.

        If the person is not recorded yet, it is created and recorded. The
        person is also registered in ``iteration_persons``.

        :param id: person id (key of ``self.persons``)
        :returns: the recorded ``Person``
        """
        if id not in self.persons:
            self.persons[id] = Person(id)
        person = self.persons[id]
        self.iteration_persons.setdefault(id, person)
        return person

    def clearIteration(self):
        """
        Finish working with the files of one ego id.

        Calls ``clear_raw_features()`` on every person touched during the
        iteration and starts a fresh ``iteration_persons`` mapping.
        """
        for person in self.iteration_persons.values():
            person.clear_raw_features()
        self.iteration_persons = dict()

    def read_from_facebook(self):
        """
        Read facebook (or twitter) files into ``persons`` and ``all_features``.

        Asks for the data folder, processes every ``<id>.<kind>`` file in it
        in sorted order, then asks for the combined edge file and adds its
        links between already known persons. Returns ``None``; returns early
        if either dialog is cancelled.
        """
        easygui.msgbox("Please, select the data folder ('facebook') using the file dialog")
        data_dir = easygui.diropenbox()  # where to take files from
        if data_dir is None:
            return

        # set mirror of all_features so the membership test is O(1)
        known_features = set(self.all_features)

        # Sorted order keeps the files of one id next to each other:
        # circles -> edges -> egofeat -> feat -> featnames
        for file in tqdm(sorted(listdir(data_dir))):
            # first part contains id, second part item kind
            parts = file.split('.')
            if len(parts) < 2:
                continue  # not an "<id>.<kind>" data file
            person_id, kind = parts[0], parts[1]
            filepath = path.join(data_dir, file)
            person = self.getPerson(person_id)

            if kind == "edges":
                self._read_edges(filepath)
            elif kind == "egofeat":
                self._read_ego_features(filepath, person)
            elif kind == "feat":
                self._read_node_features(filepath)
            elif kind == "featnames":
                self._read_feature_names(filepath, known_features)
                # featnames is the last file with this id
                self.clearIteration()
            # "circles" (and any other kind) is not of interest for now

        easygui.msgbox("Files processed. Please select the combined file ('*_combined.txt') for further processing..")
        filepath = easygui.fileopenbox()
        if filepath is None:
            return
        self._read_combined_edges(filepath)

    def _read_edges(self, filepath):
        """Read an ``edges`` file and link both persons of every line."""
        with open(filepath, 'r') as f:
            for line in f:
                data = self._split_fields(line)
                if len(data) < 2:
                    continue  # blank or malformed line
                # might contain nodes not mentioned in filenames
                follower = self.getPerson(data[0])
                follows = self.getPerson(data[1])
                # facebook data is undirected
                follower.follow(follows)
                follows.follow(follower)

    def _read_ego_features(self, filepath, person):
        """Read an ``egofeat`` file: raw integer features of the ego person."""
        with open(filepath, 'r') as f:
            for line in f:
                data = self._split_fields(line)
                if not data:
                    continue  # blank line
                person.add_raw_features([int(x) for x in data])

    def _read_node_features(self, filepath):
        """Read a ``feat`` file: each line is a node id then its raw features."""
        with open(filepath, 'r') as f:
            for line in f:
                data = self._split_fields(line)
                if not data:
                    continue  # blank line
                target = self.getPerson(data[0])
                target.add_raw_features([int(x) for x in data[1:]])

    def _read_feature_names(self, filepath, known_features):
        """Read a ``featnames`` file and name the features of this iteration's persons."""
        with open(filepath, 'r') as f:
            for line in f:
                data = self._split_fields(line)
                if not data:
                    continue  # blank line
                feature_order = int(data[0])
                feature_name = data[-1]  # last token of the line is used as the name
                if feature_name not in known_features:
                    known_features.add(feature_name)
                    self.all_features.append(feature_name)
                for iter_person in self.iteration_persons.values():
                    if iter_person.hasFeature(feature_order):
                        iter_person.add_feature(feature_name)

    def _read_combined_edges(self, filepath):
        """Read the combined edge file, linking only persons that are already known."""
        with open(filepath, 'r') as f:
            for line in f:
                data = self._split_fields(line)
                if len(data) < 2:
                    continue  # blank or malformed line
                if data[0] in self.persons and data[1] in self.persons:
                    follower = self.persons[data[0]]
                    follows = self.persons[data[1]]
                    follower.follow(follows)
                    follows.follow(follower)

    def write_data_to_rdf(self):
        """
        Write persons and all_features to .rdf files (no links).

        The files are created in the current working directory. If both
        files already exist and are non-empty they are reused as they are.

        :returns: tuple ``(persons_file, features_file)`` with the file
            paths, or ``None`` if no data has been read yet
        """
        filedir = os.getcwd()
        features_file = path.join(filedir, "features_facebook.rdf")
        persons_file = path.join(filedir, "persons_facebook.rdf")

        # check if required files exist already, if so return them
        if self._is_nonempty_file(features_file) and self._is_nonempty_file(persons_file):
            return persons_file, features_file

        if len(self.persons) == 0 or len(self.all_features) == 0:
            easygui.msgbox("Please, perform data reading first")
            return

        # WRITE FEATURES
        # dict.fromkeys removes duplicates while keeping a stable order
        lines = []
        for feature in tqdm(list(dict.fromkeys(self.all_features))):
            lines.append(f'<_:{feature}> <dgraph.type> "Feature" .\n')
            lines.append(f'<_:{feature}> <name> "{feature}" .\n')
        with open(features_file, 'w') as f:  # 'w' replaces any old file
            f.writelines(lines)

        # WRITE PERSONS
        lines = []
        for person in tqdm(self.persons.values()):
            lines.append(f'<_:{person.id}> <dgraph.type> "Person" .\n')
            lines.append(f'<_:{person.id}> <id> "{person.id}" .\n')
        with open(persons_file, 'w') as f:  # 'w' replaces any old file
            f.writelines(lines)

        return persons_file, features_file

    def write_links_to_rdf(self, stored_persons_loc: str, stored_features_loc: str):
        """
        Write links between persons, and between persons and features, to .rdf files.

        Both input files are JSON documents whose ``['data']['total']`` entry
        is a list of rows: ``id`` and ``uid`` for persons, ``name`` and
        ``uid`` for features.

        :param stored_persons_loc: where the stored persons file can be found
        :param stored_features_loc: where the stored features file can be found
        :returns: tuple ``(follows_rdffile, tracks_rdffile)``, or ``None`` if
            a person or feature has no uid (a message box is shown first)
        """
        with open(stored_persons_loc, 'r') as f:
            data = json.load(f)
        for row in tqdm(data['data']['total']):
            # rows for persons that were not read in this session are skipped
            person = self.persons.get(row['id'])
            if person is not None:
                person.uid = row['uid']

        with open(stored_features_loc, 'r') as f:
            data = json.load(f)
        all_features = dict()
        for row in tqdm(data['data']['total']):
            row_name = row['name']
            all_features[row_name] = Feature(row_name, row['uid'])

        # save relations
        wdir = os.getcwd()
        follows_rdffile = path.join(wdir, "follows_facebook.rdf")
        tracks_rdffile = path.join(wdir, "tracks_facebook.rdf")
        # Remove old files up front so an aborted run leaves no stale links.
        for old_file in (follows_rdffile, tracks_rdffile):
            if path.exists(old_file):
                os.remove(old_file)

        follows_lines = []
        tracks_lines = []
        for person in tqdm(self.persons.values()):
            person_uid = person.uid
            if person_uid is None:
                # todo change to log, throw an error
                easygui.msgbox("Person not stored in dgraph: " + person.id + ", exiting...")
                print("Exiting...")
                return
            for followed in person.get_follows():
                target_uid = self.persons[followed.id].uid
                if target_uid is None:
                    easygui.msgbox("Person not stored in dgraph: " + followed.id + ", exiting..")
                    print("Exiting...")
                    return
                follows_lines.append(f'<{person_uid}> <follows> <{target_uid}> .\n')
            for feature in person.get_features():
                # .get so that a feature missing from the stored file is
                # reported below instead of raising KeyError
                stored_feature = all_features.get(feature)
                feature_uid = None if stored_feature is None else stored_feature.uid
                if feature_uid is None:
                    easygui.msgbox("Feature not stored in dgraph: " + feature + ", exiting...")
                    print("Exiting...")
                    return
                tracks_lines.append(f'<{person_uid}> <tracks> <{feature_uid}> .\n')

        # write lines to the .rdf files
        with open(follows_rdffile, 'w') as f:
            f.writelines(follows_lines)
        with open(tracks_rdffile, 'w') as f:
            f.writelines(tracks_lines)
        return follows_rdffile, tracks_rdffile