-
Notifications
You must be signed in to change notification settings - Fork 1
/
parseAmazon.py
128 lines (100 loc) · 4.22 KB
/
parseAmazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import json

import numpy as np
import pandas as pd
def parse_amazon(path="data/amazon.json"):
    """
    Parse the Amazon review subset (~200,000 reviews) into training arrays, where positive is
    5 star reviews and negative is 1 star reviews.

    The input is a JSON-lines file holding reviews of every rating (1-5 stars) plus other
    metadata. Only the ``overall`` (rating) and ``reviewText`` columns are used; reviews rated
    2-4 stars are dropped.

    Every 1 star review is kept and at most the same number of 5 star reviews (the first ones
    in file order) is taken. If the file holds fewer 5 star than 1 star reviews the result is
    not balanced.

    :param path: location of the JSON-lines review file
    :return: x_train, y_train -- review texts and their labels (0 = negative, 1 = positive),
             all negatives first, followed by the positives
    """
    # one JSON object per line -> one dataframe row per review
    data = pd.read_json(path, lines=True)

    # data["overall"] holds the rating of the review stored at the same index in
    # data["reviewText"]: 1 means negative response and 5 means positive response
    ratings = data["overall"]
    neg = data.loc[ratings == 1]
    # cap the positives at the number of negatives
    pos = data.loc[ratings == 5].iloc[:len(neg)]

    # np.concatenate allocates fresh arrays, so the dataframe is never aliased
    x_train = np.concatenate([neg["reviewText"].to_numpy(), pos["reviewText"].to_numpy()])
    # shift the star ratings onto the labels: 1 star -> 0 and 5 stars -> 1
    y_train = np.concatenate([(neg["overall"] - 1).to_numpy(), (pos["overall"] - 4).to_numpy()])
    return x_train, y_train
def parse_amazon_medium(pos_path='data/amazon-500k-pos.json',
                        neg_path='data/amazon-500k-neg.json'):
    """
    Parse the medium, pre-split Amazon review set into training arrays, where positive is
    5 star reviews and negative is 1 star reviews.

    Input is slightly preprocessed amazon data: two JSON-lines files (one review object per
    line), by default amazon-500k-pos.json for the positive reviews and amazon-500k-neg.json
    for the negative reviews (presumably ~500,000 reviews each, going by the file names).

    Labels are computed as ``overall - 4`` for the positive file and ``overall - 1`` for the
    negative file, so they are 1 and 0 only if the files really contain nothing but 5 star
    and 1 star reviews respectively.

    :param pos_path: JSON-lines file with the positive reviews
    :param neg_path: JSON-lines file with the negative reviews
    :return: x_train, y_train -- review texts and their labels, negatives first
    """
    # The lines are decoded with the standard-library json module because the top-level
    # ``pandas.json`` module used previously no longer exists in current pandas releases.
    # Iterating the file handle streams the lines instead of holding the raw text and the
    # parsed records in memory at the same time; blank lines are skipped.
    with open(pos_path, encoding='utf-8') as json_file:
        pos = pd.DataFrame([json.loads(line) for line in json_file if line.strip()])
    print("pos loaded")

    with open(neg_path, encoding='utf-8') as json_file:
        neg = pd.DataFrame([json.loads(line) for line in json_file if line.strip()])
    print("neg loaded")

    # store the reviewText in x_train and the shifted rating in y_train
    x_train = np.concatenate([neg["reviewText"].to_numpy(), pos["reviewText"].to_numpy()])
    print("over")
    y_train = np.concatenate([(neg["overall"] - 1).to_numpy(), (pos["overall"] - 4).to_numpy()])
    return x_train, y_train
def parse_amazon_large(pos_path='data/amazon-1m-pos.json',
                       neg_path='data/amazon-1m-neg.json'):
    """
    Parse the large, pre-split Amazon review set into training arrays, where positive is
    5 star reviews and negative is 1 star reviews.

    Input is slightly preprocessed amazon data: two JSON-lines files (one review object per
    line), by default amazon-1m-pos.json which is 1 million positive reviews and
    amazon-1m-neg.json which is 1 million negative reviews.

    Labels are computed as ``overall - 4`` for the positive file and ``overall - 1`` for the
    negative file, so they are 1 and 0 only if the files really contain nothing but 5 star
    and 1 star reviews respectively.

    :param pos_path: JSON-lines file with the positive reviews
    :param neg_path: JSON-lines file with the negative reviews
    :return: x_train, y_train -- review texts and their labels, negatives first
    """
    # The lines are decoded with the standard-library json module because the top-level
    # ``pandas.json`` module used previously no longer exists in current pandas releases.
    # Iterating the file handle streams the lines instead of holding the raw text and the
    # parsed records in memory at the same time; blank lines are skipped.
    with open(pos_path, encoding='utf-8') as json_file:
        pos = pd.DataFrame([json.loads(line) for line in json_file if line.strip()])
    print("pos loaded")

    with open(neg_path, encoding='utf-8') as json_file:
        neg = pd.DataFrame([json.loads(line) for line in json_file if line.strip()])
    print("neg loaded")

    # store the reviewText in x_train and the shifted rating in y_train
    x_train = np.concatenate([neg["reviewText"].to_numpy(), pos["reviewText"].to_numpy()])
    print("over")
    y_train = np.concatenate([(neg["overall"] - 1).to_numpy(), (pos["overall"] - 4).to_numpy()])
    return x_train, y_train
def main():
    """Script entry point: run the small-subset parser once and discard the returned arrays."""
    parse_amazon()


# only run the parser when the file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()