-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_new_data.py
150 lines (132 loc) · 6.24 KB
/
get_new_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import requests
import pandas as pd
import numpy as np
import simplejson
import time
import datetime
import re
date = (datetime.datetime.now() + datetime.timedelta(days=-7)).strftime('%Y-%m-%d')
def requests_data(p, TYPE_KEY_WORD):
global date
try:
response = requests.get(
'http://datacenter.eastmoney.com/api/data/get?st=NOTICE_DATE%2CSECURITY_CODE&sr=-1%2C-1&ps=500&p='
+ str(p) + '&type=' + TYPE_KEY_WORD + '&sty=ALL&filter=(NOTICE_DATE>=%27' + date + '%27)')
except:
time.sleep(5)
response = requests_data(p, TYPE_KEY_WORD)
return response
def get_data(TYPE_KEY_WORD):
df = pd.DataFrame()
response = requests_data(1, TYPE_KEY_WORD)
pageNum = int(re.sub(r'.*"pages":([0-9]+).*', r'\1', response.text[:200]))
l_cols = np.array(
[re.sub(r'.*"(.+)".*', r'\1', s) for s in re.findall(r'"[^:,]*?":', re.finditer(r'{.+?},', re.sub(
r'.*"data":(\[.+]).*', r'\1', response.text)).__next__()[0])])
for p in range(1, pageNum + 1):
if p > 1:
response = requests_data(p, TYPE_KEY_WORD)
while response.status_code != 200 or len(re.findall(r'"data":', response.text)) == 0:
response = requests_data(p, TYPE_KEY_WORD)
l_str = re.sub(r'"[^:,]*?":', r'',
re.sub(r'{', r'[', re.sub(r'}', r']', re.sub(r'.*"data":(\[.*]).*', r'\1', response.text))))
dataContent = np.array(simplejson.loads(l_str))
df_temp = pd.DataFrame(dataContent, columns=l_cols)
df = pd.concat([df, df_temp], axis=0)
print('已获取' + str(p) + '页')
return df
def extract_data(df, dirName):
for index, row in df.iterrows():
try:
df_target = pd.read_excel('./' + dirName + '/' + row['SECUCODE'] + '.xlsx', dtype='object', index_col=0)
except FileNotFoundError:
df_target = df.iloc[[index]]
df_target.to_excel('./' + dirName + '/' + row['SECURITY_CODE'] + '.xlsx', index=False)
continue
df_target = df_target.append(row)
df_target.drop_duplicates()
df_target.to_excel('./' + dirName + '/' + row['SECURITY_CODE'] + '.xlsx', index=False)
d = {'RPT_LICO_FN_CPD': '业绩报表', 'RPT_DMSK_FN_BALANCE': '资产负债表',
'RPT_DMSK_FN_INCOME': '利润表', 'RPT_DMSK_FN_CASHFLOW': '现金流量表'}
df_calc = pd.DataFrame()
for key, value in d.items():
df = get_data(key)
if value == '业绩报表' or value == '资产负债表':
if df_calc.empty:
df_calc = df
else:
df_calc = pd.merge(left=df_calc, right=df, left_on=['SECURITY_CODE', 'REPORTDATE'],
right_on=['SECURITY_CODE', 'REPORT_DATE']).reset_index(drop=True)
extract_data(df, value)
df_NO_ADV = df_calc.drop(columns=['ADVANCE_RECEIVABLES']).dropna()
df_NO_ADV = df_NO_ADV.drop(index=df_NO_ADV[df_NO_ADV['TOTAL_OPERATE_INCOME'] == 0].index)
df_NO_ADV = df_NO_ADV.drop(index=df_NO_ADV[df_NO_ADV['BASIC_EPS'] == 0].index)
df_NO_ADV = df_NO_ADV.drop(index=df_NO_ADV[df_NO_ADV['TOTAL_ASSETS'] == 0].index)
new_df_NO_ADV = pd.DataFrame()
new_df_NO_ADV['公告日'] = df_NO_ADV['NOTICE_DATE_x']
new_df_NO_ADV['报告期'] = df_NO_ADV['REPORTDATE']
new_df_NO_ADV['证券名称'] = df_NO_ADV['SECURITY_NAME_ABBR_x']
new_df_NO_ADV['证券代码'] = df_NO_ADV['SECURITY_CODE']
new_df_NO_ADV['净利率'] = df_NO_ADV['PARENT_NETPROFIT'] * 100 / df_NO_ADV['TOTAL_OPERATE_INCOME']
new_df_NO_ADV['毛利率'] = df_NO_ADV['XSMLL']
new_df_NO_ADV['应收账款占营收'] = df_NO_ADV['ACCOUNTS_RECE'] * 100 / df_NO_ADV['TOTAL_OPERATE_INCOME']
new_df_NO_ADV['经营净额比净利润'] = df_NO_ADV['MGJYXJJE'] * 100 / df_NO_ADV['BASIC_EPS'].where(df_NO_ADV['BASIC_EPS'] != 0.0,
0.00000001)
new_df_NO_ADV['净资产收益率ROE'] = df_NO_ADV['WEIGHTAVG_ROE']
new_df_NO_ADV['资产负债比'] = df_NO_ADV['TOTAL_LIABILITIES'] * 100 / df_NO_ADV['TOTAL_ASSETS']
new_df_NO_ADV['投入资本回报率ROIC'] = df_NO_ADV['WEIGHTAVG_ROE'] * 100 / (100 + new_df_NO_ADV['资产负债比'])
df = new_df_NO_ADV
df = df[df['净利率'] >= 10]
df = df[df['毛利率'] >= 40]
df = df[df['应收账款占营收'] <= 20]
df = df[df['经营净额比净利润'] >= 120]
df = df[df['净资产收益率ROE'] >= 15]
df = df[df['投入资本回报率ROIC'] >= 3]
newly = df.apply(lambda y: y.apply(lambda x: str(x).split(' 00:00:00')[0]))
output = '# 近七日新增\r\n'
s = newly.to_markdown(index=False)
output += s
output += '\r\n\r\n'
df_origin = pd.read_excel('筛选结果.xlsx', dtype='object')
df = pd.concat([df_origin, df], axis=0)
df.to_excel('筛选结果.xlsx', index=False)
date = datetime.datetime.now()
month = (date.month - 1) - (date.month - 1) % 3 + 1
newdate = datetime.datetime(date.year, month, 1)
newdate = newdate + datetime.timedelta(days=-1)
near = df[pd.to_datetime(df['报告期']) >= newdate].apply(
lambda y: y.apply(lambda x: str(x).split(' 00:00:00')[0]))
output += '# 最近季度\r\n'
s = near.to_markdown(index=False)
output += s
output += '\r\n\r\n'
date = newdate
month = (date.month - 1) - (date.month - 1) % 3 + 1
newdate = datetime.datetime(date.year, month, 1)
newdate = newdate + datetime.timedelta(days=-1)
nearlast = df[pd.to_datetime(df['报告期']) == newdate].apply(
lambda y: y.apply(lambda x: str(x).split(' 00:00:00')[0]))
output += '# 上一季度\r\n'
s = nearlast.to_markdown(index=False)
output += s
output += '\r\n\r\n'
date = newdate
month = (date.month - 1) - (date.month - 1) % 3 + 1
newdate = datetime.datetime(date.year, month, 1)
newdate = newdate + datetime.timedelta(days=-1)
nearnearlast = df[pd.to_datetime(df['报告期']) == newdate].apply(
lambda y: y.apply(lambda x: str(x).split(' 00:00:00')[0]))
output += '# 上上季度\r\n'
s = nearnearlast.to_markdown(index=False)
output += s
output += '\r\n\r\n'
jnji1 = pd.merge(near, nearlast, on=['证券代码', '证券名称'])[['证券代码', '证券名称']]
jnji2 = pd.merge(near, nearnearlast, on=['证券代码', '证券名称'])[['证券代码', '证券名称']]
jnji = pd.concat([jnji1, jnji2], axis=0).drop_duplicates()
output += '# 交集\r\n'
s = jnji.to_markdown(index=False)
output += s
output += '\r\n\r\n'
f = open('README.md', 'w', encoding='utf-8')
f.write(output)
f.close()