-
Notifications
You must be signed in to change notification settings - Fork 1
/
parseKaggle.py
56 lines (42 loc) · 1.63 KB
/
parseKaggle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import numpy as np
def parse_kaggle(path="data/kaggle.csv"):
    """
    Build a binary sentiment data set from kaggle's women's e-commerce
    clothing reviews CSV.

    Only 1 star reviews (negative, label 0) and 5 star reviews (positive,
    label 1) are used; reviews rated 2-4 are ignored. Rows with a missing
    "Rating" or "Review Text" are dropped before selection.

    The positive class is truncated to the number of negative reviews, keeping
    the first 5 star reviews in file order (no shuffling or random sampling).
    If the file holds fewer 5 star than 1 star reviews, the classes are NOT
    balanced, because only the positives are ever truncated.

    :param path: location of the CSV file; it must provide the "Rating" and
        "Review Text" columns. Defaults to "data/kaggle.csv".
    :return: x_train, y_train - two 1-D numpy arrays of equal length. x_train
        holds the review texts, all negatives first and then the positives;
        y_train holds the matching labels (0 = negative, 1 = positive).
    """
    data = pd.read_csv(path)

    # A row is only usable when both the label source and the text exist.
    data = data.dropna(subset=["Rating", "Review Text"])

    neg = data.loc[data["Rating"] == 1]
    # Cap the positives at the number of negatives to balance the classes.
    pos = data.loc[data["Rating"] == 5].iloc[:len(neg)]

    x_train = np.concatenate(
        [neg["Review Text"].to_numpy(), pos["Review Text"].to_numpy()]
    )
    # Shift the star ratings onto binary labels: 1 star -> 0, 5 stars -> 1.
    y_train = np.concatenate(
        [neg["Rating"].to_numpy() - 1, pos["Rating"].to_numpy() - 4]
    )
    return x_train, y_train
def main():
    """
    Script entry point: run parse_kaggle once with its defaults.

    The parsed arrays are discarded; nothing is printed or returned.
    """
    _ = parse_kaggle()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()