# stock-market-scraper.py
import urllib.request, json , time, os, difflib, itertools
import pandas as pd
from multiprocessing.dummy import Pool
from datetime import datetime
# Bind the HTTP client module under the name `httplib` regardless of which
# module name the running interpreter provides. Only a failed import triggers
# the fallback; any other error raised while importing is allowed to surface.
try:
    import httplib
except ImportError:
    import http.client as httplib
def check_internet(host="www.google.com", timeout=5):
    """Report whether an HTTP request can be sent to *host*.

    A ``HEAD /`` request is sent over a fresh connection; the response is
    never read, so "success" only means the connection was opened and the
    request was written without a connection-level error.

    Args:
        host: Host name (optionally ``"host:port"``) to probe.
        timeout: Socket timeout in seconds for the connection attempt.

    Returns:
        True if the request was sent, False if connecting or sending failed.
    """
    conn = httplib.HTTPConnection(host, timeout=timeout)
    try:
        conn.request("HEAD", "/")
    except (OSError, httplib.HTTPException):
        # DNS failures, refused connections and timeouts are OSError
        # subclasses; protocol-level failures are HTTPException.
        # KeyboardInterrupt/SystemExit are deliberately NOT swallowed.
        return False
    else:
        return True
    finally:
        # Single cleanup point for both outcomes.
        conn.close()
def get_historic_price(query_url, json_path, csv_path):
    """Download one ticker's price history and store it as JSON and CSV.

    The ticker id is taken from the ``symbol=`` query parameter of
    *query_url* (the text between ``symbol=`` and ``&period``). Output files
    are named ``<json_path><ticker>.json`` and ``<csv_path><ticker>.csv``;
    both path arguments are used as plain string prefixes, so they must
    already end with a path separator.

    Nothing is downloaded when a non-empty CSV for the ticker already exists.
    Otherwise the function blocks until ``check_internet()`` succeeds, fetches
    the URL, dumps the raw JSON, and writes a CSV with the columns
    Date, Low, Open, Volume, High, Close, Adjusted Close.

    Args:
        query_url: Chart URL containing ``symbol=<ticker>&period...``.
        json_path: Directory prefix for the raw JSON dump.
        csv_path: Directory prefix for the CSV output.

    Returns:
        None. Progress and failures are reported via ``print``.

    Raises:
        IndexError: If *query_url* contains no ``symbol=`` parameter.
        OSError: If the JSON dump cannot be written (not caught here).
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    stock_id = query_url.split("&period")[0].split("symbol=")[1]
    csv_file = csv_path + stock_id + '.csv'
    json_file = json_path + stock_id + '.json'

    # Skip tickers that were already fetched; an empty CSV counts as missing.
    if os.path.exists(csv_file) and os.stat(csv_file).st_size != 0:
        print("<<< Historical data of "+stock_id+" already exists")
        return

    # Wait (indefinitely) for connectivity before attempting the download.
    while not check_internet():
        print("Could not connect, trying again in 5 seconds...")
        time.sleep(5)

    try:
        with urllib.request.urlopen(query_url) as response:
            parsed = json.loads(response.read().decode())
    except Exception:
        # Best-effort worker: any fetch/decode failure skips this ticker.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
        print("||| Historical data of "+stock_id+" doesn't exist")
        return

    # Mode 'w' truncates an existing file, so no prior removal is needed.
    with open(json_file, 'w') as outfile:
        json.dump(parsed, outfile, indent=4)

    try:
        result = parsed['chart']['result'][0]
        quote = result['indicators']['quote'][0]
        adjusted_close = result['indicators']['adjclose'][0]['adjclose']
        # Aware UTC conversion; yields the same dd-mm-YYYY string as the
        # deprecated datetime.utcfromtimestamp().
        dates = [
            datetime.fromtimestamp(int(ts), tz=timezone.utc).strftime('%d-%m-%Y')
            for ts in result['timestamp']
        ]
        # zip() stops at the shortest series, so ragged inputs are truncated.
        rows = list(zip(
            dates,
            quote['low'],
            quote['open'],
            quote['volume'],
            quote['high'],
            quote['close'],
            adjusted_close,
        ))
        df = pd.DataFrame(
            rows,
            columns=['Date', 'Low', 'Open', 'Volume', 'High', 'Close', 'Adjusted Close'],
        )
        df.to_csv(csv_file, sep=',', index=None)
        print(">>> Historical data of "+stock_id+" saved")
    except Exception:
        # Missing keys, a null `result`, or a CSV write error all end up here.
        print(">>> Historical data of "+stock_id+" could not be saved")
    return
def main():
    """Fetch historical prices for every ticker listed in the spreadsheet.

    Output goes to ``../historic_data/json/`` and ``../historic_data/csv/``
    relative to the current working directory; both are created if missing.
    Tickers are read from the ``Ticker`` column of the Excel workbook under
    ``Assets/`` and downloaded by a pool of 10 worker threads
    (``multiprocessing.dummy.Pool``).

    Returns:
        None.
    """
    data_root = os.getcwd()+os.sep+".."+os.sep+"historic_data"+os.sep
    # Trailing separators matter: get_historic_price() appends the file name
    # to these strings directly.
    json_path = data_root+"json"+os.sep
    csv_path = data_root+"csv"+os.sep
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(json_path, exist_ok=True)
    os.makedirs(csv_path, exist_ok=True)

    ticker_file_path = "Assets"+os.sep+"Yahoo Ticker Symbols - September 2017.xlsx"
    temp_df = pd.read_excel(ticker_file_path)
    # Drop the columns at positions 5-7, take the row at index 2 as the
    # header row, and treat everything from index 3 on as data.
    temp_df = temp_df.drop(temp_df.columns[[5, 6, 7]], axis=1)
    headers = temp_df.iloc[2]
    df = pd.DataFrame(temp_df.values[3:], columns=headers)
    print("Total stocks:", len(df))

    # Blank cells come back as NaN and numeric-looking cells as numbers;
    # either would make the string concatenation raise TypeError, so skip
    # the former and coerce the latter.
    # NOTE(review): assumes str() of a non-text cell is the intended symbol.
    tickers = [str(ticker) for ticker in df['Ticker'] if pd.notna(ticker)]
    query_urls = [
        "https://query1.finance.yahoo.com/v8/finance/chart/"+ticker+"?symbol="+ticker+"&period1=0&period2=9999999999&interval=1d&includePrePost=true&events=div%2Csplit"
        for ticker in tickers
    ]

    with Pool(processes=10) as pool:
        pool.starmap(
            get_historic_price,
            zip(query_urls, itertools.repeat(json_path), itertools.repeat(csv_path)),
        )
    print("<|> Historical data of all stocks saved")
    return
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()