# py_get.py (forked from lidingke/go_python_crawler_benchmark)
import threading

import mysql.connector
import scrapy
import sqlalchemy
from sqlalchemy import create_engine, text

# dbline.txt is expected to hold the connection credentials that are
# interpolated into the DSN below (e.g. "user:password").
with open('dbline.txt', 'r') as f:
    dbline = f.read().strip()


class Handler(object):
    """Thread-safe wrapper around a single shared SQLAlchemy engine."""
    # The DB host was obfuscated in the source ("[email protected]"); a local
    # MySQL instance is assumed here.
    engine = create_engine('mysql+mysqlconnector://%s@localhost:3306/trnet' % dbline)
    lock = threading.Lock()
    def insert(self, id_, content, sub):
        # The lock serialises access to the shared engine (kept from the
        # original; scrapy callbacks run single-threaded, so it is a safety net).
        with self.lock:
            # engine.begin() commits on success; bound parameters avoid the
            # quoting bugs of the original string-formatted INSERT.
            with self.engine.begin() as con:
                stmt = text(
                    'INSERT INTO company_test (id, content, sub) '
                    'VALUES (:id, :content, :sub)'
                )
                con.execute(stmt, {'id': id_, 'content': content, 'sub': sub})
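# For reference, a plausible schema for the target table. This DDL is not in
# the original repo; the types and the primary key are assumptions inferred
# from the INSERT above and from the IntegrityError handling in parse():
#
#   CREATE TABLE company_test (
#       id      BIGINT PRIMARY KEY,
#       content VARCHAR(255),
#       sub     VARCHAR(255)
#   );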
# Letter prefixes and per-letter page counts that seed the crawl.
pre = [('B', 25), ('C', 25), ('D', 25), ('E', 25)]
start_urls = [
    'http://shop.99114.com/list/pinyin/{}_{}'.format(a, i)
    for a, b in pre
    for i in range(b)
]
print('len start urls', len(start_urls))
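# The comprehension above expands to 100 seed URLs of the form (shown here
# for reference; this listing is not part of the original file):
#   http://shop.99114.com/list/pinyin/B_0
#   ...
#   http://shop.99114.com/list/pinyin/E_24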
class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = start_urls
    # Note: scrapy does not send a `headers` class attribute automatically;
    # see the sketch below for one way to attach these to each request.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.xxxxxx.com",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) "
                      "AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    }
    head = Handler()
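    # A sketch (not in the original) of how the headers could be sent: override
    # start_requests and pass them per request. Left commented out because the
    # placeholder "Host" entry above would need correcting first.
    #
    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, headers=self.headers)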
    def parse(self, response):
        # Each footer entry is an <a href=".../<id>"><b>name</b></a> node.
        for link in response.xpath('//*[@id="footerTop"]/ul/li/a'):
            sub = link.xpath('@href').extract_first()
            content = link.xpath('b/text()').extract_first()
            id_ = sub.split('/')[-1]
            try:
                self.head.insert(id_, content, sub)
            except (mysql.connector.errors.IntegrityError,
                    sqlalchemy.exc.IntegrityError) as e:
                # Row already inserted on an earlier run; skip it.
                print(e)
            except mysql.connector.errors.ProgrammingError as e:
                # Retry with quotes stripped. The original tested
                # `content.find(',')`, which is truthy even when the comma is
                # absent (find() returns -1); checking for the quote is the
                # intended behaviour.
                print(e)
                if "'" in content:
                    content = content.replace("'", '')
                    self.head.insert(id_, content, sub)
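

# A minimal way to run the spider without the scrapy CLI (a sketch; the
# original was presumably launched with `scrapy runspider py_get.py`):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(BlogSpider)
    process.start()  # blocks until crawling is finished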