-
Notifications
You must be signed in to change notification settings - Fork 0
/
yelp.py
60 lines (43 loc) · 2.14 KB
/
yelp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import pandas as pd
import kaggle
import os
from kaggle.api.kaggle_api_extended import KaggleApi
# The dataset is downloaded into the directory the script is launched from.
current = os.getcwd()

print('Downloading dataset...')

# Authenticate the Kaggle client, then fetch the dataset archive into
# `current` and unzip it there.
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    'yelp-dataset/yelp-dataset',
    path=current,
    unzip=True,
)

print('Dataset downloaded successfully')
# Keys to keep from the records of each json file (one list per file).
bus_columns = [
    'business_id',
    'name',
    'city',
    'stars',
    'review_count',
]
reviews_columns = [
    'review_id',
    'user_id',
    'business_id',
    'stars',
    'date',
    'text',
    'useful',
    'funny',
    'cool',
]
users_columns = [
    'user_id',
    'review_count',
    'yelping_since',
    'useful',
    'funny',
    'cool',
    'fans',
    'elite',
    'average_stars',
]
def add_to_dict(columns, file_path):
    '''
    Build a pandas dataframe from a JSON-lines file, keeping only selected keys.

    Each non-blank line of the file is parsed as one JSON object; for every
    name in columns the matching value is collected into that column.

    file_path: Path to the json file (one JSON object per line)
    columns: List of keys to extract from each JSON object

    Returns a pandas DataFrame with one column per entry in columns and one
    row per parsed line.

    Raises KeyError if a record lacks one of the requested keys, and
    json.JSONDecodeError if a non-blank line is not valid JSON.
    '''
    # One value list per requested column, in the order given.
    extracted = {name: [] for name in columns}

    # JSON text is UTF-8; decoding it with a legacy code page garbles every
    # non-ASCII character. The 'with' block closes the file even on error.
    with open(file_path, "r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            record = json.loads(line)
            # Keep only the wanted values so the full record can be freed
            # right away instead of holding the whole file in memory.
            for name, values in extracted.items():
                values.append(record[name])

    return pd.DataFrame.from_dict(extracted)
# Joins the dataframes created for each json file. DataFrame.join aligns rows
# on the index, so the frames may be of different lengths.
# NOTE(review): join defaults to how='left', so rows of the review and user
# frames beyond the length of the business frame are dropped -- confirm that
# this is intended.
print('Processing dataset...')
# The archive was unzipped into `current`, so the paths are built from it
# instead of hard-coded backslash literals ('\y' is an invalid escape and the
# leading separator pointed at the filesystem root).
# Assumes the archive unpacks its json files directly into that directory.
root_df = add_to_dict(
    bus_columns,
    os.path.join(current, 'yelp_academic_dataset_business.json'),
)
root_df = root_df.join(
    add_to_dict(
        reviews_columns,
        os.path.join(current, 'yelp_academic_dataset_review.json'),
    ),
    lsuffix='_business',
    rsuffix='_review',
)
root_df = root_df.join(
    add_to_dict(
        users_columns,
        os.path.join(current, 'yelp_academic_dataset_user.json'),
    ),
    lsuffix='_review',
    rsuffix='_user',
)
print('Processed')

# Converts dataframe to csv. to_csv creates (or overwrites) the file itself,
# so no shell 'touch' call is needed beforehand.
print('Converting to csv...')
root_df.to_csv('data.csv')
print('Converted')