-
Notifications
You must be signed in to change notification settings - Fork 5
/
EopCrawler.py
238 lines (222 loc) · 10.8 KB
/
EopCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import codecs
import gzip
import http.cookiejar
import os.path
import time
import urllib.parse
import urllib.request
import zlib

from bs4 import BeautifulSoup

from EopScorePageItem import EopPageItem
class EopCrawler(object):
    """Crawler for the sheet-music listing and score pages of everyonepiano.cn.

    Typical flow, as far as this class shows: fetch a listing page with
    ``getHtml``, turn it into items with ``getPageItems``, resolve the image
    URLs of each item with ``getImgUrls`` and store them on disk with
    ``doDownLoadImgs``.
    """

    # Listing URL; it ends in ``p=`` so the caller presumably appends a page
    # number (not done inside this class).
    UrlPage = "http://www.everyonepiano.cn/Music.html?canshu=id&paixu=desc&p="
    # Site root, prefixed to the relative links found in the pages.
    UrlHome = "http://www.everyonepiano.cn"
    # Lazily created urllib opener shared by all requests of an instance.
    op = None
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Only gzip is advertised because ungzip() is the only decoder here.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        # Must name the crawled site (it used to name an unrelated host).
        'Host': 'www.everyonepiano.cn',
        'Referer': 'http://www.everyonepiano.cn/Music.html?paixu=desc',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36'
    }

    def __init__(self):
        """Create a crawler; the opener is built on first use."""

    def ungzip(self, data):
        """Return *data* gunzipped, or unchanged when it is not valid gzip.

        Best effort: a body that does not start with the gzip magic bytes is
        returned as is, and a body that fails to decompress is reported on
        stdout and returned as is.
        """
        if not data or data[:2] != b'\x1f\x8b':
            return data
        try:
            return gzip.decompress(data)
        except (OSError, EOFError, zlib.error) as e:
            print(e)
            return data

    def getopener(self, header):
        """Build a cookie-aware opener that sends *header* with every request.

        :param header: mapping of header name to header value.
        :return: a ``urllib.request.OpenerDirector``.
        """
        cookieJar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = list(header.items())
        return opener

    def _ensureOpener(self):
        """Return the shared opener, creating it from ``headers`` if needed."""
        if self.op is None:
            self.op = self.getopener(self.headers)
        return self.op

    def getHtml(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        The page is requested exactly once. Returns ``None`` when the response
        status is not 200. Errors raised by the opener or by the UTF-8
        decoding are not caught here.
        """
        with self._ensureOpener().open(url) as response:
            if response.status != 200:
                return None
            data = response.read()
        return self.ungzip(data).decode('utf-8')

    def getPageItems(self, html):
        """Parse a listing page into a list of ``EopPageItem`` objects.

        Every ``div.MusicIndexBox`` is expected to look roughly like this
        (condensed from a real page)::

            <div class="MusicIndexBox">
              <div class="MITitle">
                <div class="MIMusicNO hidden-xs">0008579</div>
                <a href="/Music-8579-TITLE-AUTHOR.html" class="Title">TITLE-AUTHOR</a>-
                <a href="/Music.html?author=...">AUTHOR</a>
              </div>
              ...
              <div class="MIMusicUpdate">2017/11/11</div>
              ...
            </div>

        :param html: markup of one listing page.
        :return: list of ``EopPageItem(strid, url, date, title, author)``.
        :raises IndexError: when a box lacks one of the elements shown above.
        """
        rootSoup = BeautifulSoup(html, 'lxml')
        items = []
        for box in rootSoup.select('div.MusicIndexBox'):
            links = box.select('div.MITitle > a')
            strid = str(box.select('div.MITitle > div')[0].string)
            author = str(links[1].string)
            # The link text is "<title>-<author>"; drop the author part.
            title = str(links[0].string).replace("-" + author, '')
            url = self.UrlHome + links[0]['href']
            # str() for consistency with the other fields.
            date = str(box.select('div.MIMusicUpdate')[0].string)
            items.append(EopPageItem(strid, url, date, title, author))
        return items

    def _collectImgUrls(self, pageUrl):
        """Return the absolute image URLs listed on the score page *pageUrl*."""
        html = self.getHtml(pageUrl)
        if html is None:
            return []
        rootSoup = BeautifulSoup(html, 'lxml')
        urls = []
        for entry in rootSoup.select('div.PngDiv > ul > li'):
            images = entry.select('img')
            if images:  # a list entry without an image has nothing to fetch
                urls.append(self.UrlHome + images[0]['src'])
        return urls

    def getImgUrls(self, item):
        """Fill ``item.staveImgs`` and ``item.numberImgs`` with image URLs.

        The stave page (``item.staveUrl``) is processed first, then the
        numbered-notation page (``item.numberUrl``).

        :return: the same *item*, for convenience.
        """
        item.staveImgs.extend(self._collectImgUrls(item.staveUrl))
        item.numberImgs.extend(self._collectImgUrls(item.numberUrl))
        return item

    def _logError(self, logPath, error, tag, target):
        """Append one line describing *error* for *target* to the log file."""
        # str() every argument: exception args are often neither str nor int
        # (e.g. URLError carries an OSError), and concatenating them raw would
        # raise inside the error handler.
        details = ''.join(' ' + str(arg) for arg in error.args)
        with codecs.open(logPath, "a", "UTF-8") as log:
            log.write(details + tag + target + "\r\n")

    def _downloadImgs(self, urls, path, baseName, suffix, logTag, logPath, delay):
        """Download *urls* to ``<path>/<baseName><suffix>NNN.jpg``.

        A failure on one image is logged and does not stop the others.
        *delay* seconds are slept after each request that did not raise.
        """
        for index, url in enumerate(urls, start=1):
            # Computed before the request so the error handler can name it.
            imgPath = os.path.join(path, baseName + suffix + str(index).zfill(3) + ".jpg")
            try:
                with self.op.open(url) as response:
                    if response.status == 200:
                        with open(imgPath, 'wb') as out:
                            out.write(response.read())
                        print('成功下载 -> %s' % imgPath)
                if delay:
                    # Crawling too fast is easy to detect.
                    time.sleep(delay)
            except Exception as e:  # deliberate best effort per image
                self._logError(logPath, e, logTag, imgPath)

    def doDownLoadImgs(self, item, parentdir):
        """Download all score images of *item* and write its info file.

        Images go to ``item.getSavePath(parentdir)``; failures are appended to
        ``<parentdir>/log.txt`` instead of being raised.

        :param item: object providing ``getSavePath``, ``rep``, ``title``,
            ``url``, ``strid``, ``date``, ``staveImgs`` and ``numberImgs``.
        :param parentdir: root directory for downloads and for ``log.txt``.
        """
        path = item.getSavePath(parentdir)
        os.makedirs(path, exist_ok=True)
        self._ensureOpener()

        baseName = item.rep(item.title)
        logPath = os.path.join(parentdir, "log.txt")

        # Stave images are fetched back to back; numbered-notation images are
        # throttled (same pacing as before).
        self._downloadImgs(item.staveImgs, path, baseName, "_stave_", ' stavD=> ', logPath, 0)
        self._downloadImgs(item.numberImgs, path, baseName, "_number_", ' numbD=> ', logPath, 0.5)

        # Save the item's metadata next to the images.
        infoPath = os.path.join(path, baseName + ".txt")
        try:
            # newline="" keeps the explicit "\r\n" from becoming "\r\r\n" on
            # Windows; UTF-8 because the URL contains non-ASCII characters.
            with open(infoPath, "w", encoding="utf-8", newline="") as f:
                f.write(" Form:\t\t" + item.url + "\r\n")
                f.write(" ID:\t\t" + item.strid + "\r\n")
                f.write("UpdateDate:\t\t" + item.date + "\r\n")
                f.write(" Sorting:\t\tShawn\r\n")
        except Exception as e:  # deliberate best effort, reported in the log
            self._logError(logPath, e, ' writeInfo=> ', infoPath)

    def isNum(self, value):
        """Return True when ``value + 1`` does not raise ``TypeError``."""
        try:
            value + 1
        except TypeError:
            return False
        return True