Commit: add V2ex node spider

oldshensheep committed Jul 8, 2023
1 parent 65e455d commit e3619dd
Showing 4 changed files with 166 additions and 88 deletions.
4 changes: 2 additions & 2 deletions v2ex_scrapy/items.py
@@ -35,8 +35,8 @@ class TopicItem(Base):
thank_count: Mapped[int] = mapped_column(nullable=False)
favorite_count: Mapped[int] = mapped_column(nullable=False)

@classmethod
def err_topic(cls, topic_id: int):
@staticmethod
def err_topic(topic_id: int):
return TopicItem(
id_=topic_id,
author="",
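
The decorator change above is safe because err_topic never used cls: it constructs TopicItem by name. A minimal sketch of the practical difference (Demo is a hypothetical class, not part of the repo):

class Demo:
    @classmethod
    def with_cls(cls):
        # cls arrives implicitly, so Sub.with_cls() would return a Sub
        return cls()

    @staticmethod
    def without_cls():
        # nothing arrives implicitly; the class must be named directly
        return Demo()

Only the classmethod variant honors subclassing; since err_topic hard-codes TopicItem, @staticmethod states its behavior more plainly.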
104 changes: 104 additions & 0 deletions v2ex_scrapy/spiders/CommonSpider.py
@@ -0,0 +1,104 @@
import math

import scrapy
import scrapy.http.response.html

from v2ex_scrapy import v2ex_parser
from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import MemberItem, TopicItem


class CommonSpider:
def __init__(
self, logger, update_topic=False, update_member=False, update_comment=False
):
self.db = DB()
self.logger = logger
self.UPDATE_MEMBER = update_member
# only takes effect when UPDATE_TOPIC is enabled
self.UPDATE_COMMENT = update_comment

def parse_topic_err(self, failure):
topic_id = failure.request.cb_kwargs["topic_id"]
self.logger.warn(f"Crawl Topic Err {topic_id}")
yield TopicItem.err_topic(topic_id=topic_id)

def parse_topic(
self, response: scrapy.http.response.html.HtmlResponse, topic_id: int
):
self.logger.info(f"Crawl Topic {topic_id}")

if response.status == 302:
# topic requires login, or the account is too young to view it
yield TopicItem.err_topic(topic_id=topic_id)
else:
for i in v2ex_parser.parse_topic_supplement(response, topic_id):
yield i
for topic in v2ex_parser.parse_topic(response, topic_id):
yield topic
for i in self.crawl_member(topic.author, response):
yield i
for i in self.parse_comment(response, topic_id):
yield i
# crawl comments on the remaining pages
topic_reply_count = int(
response.css(
"#Main > div:nth-child(4) > div:nth-child(1) > span::text"
).re_first(r"\d+", "-1")
)
c = self.db.get_topic_comment_count(topic_id)
if (
# partially crawled, and comment updating is enabled
(0 < c < topic_reply_count)
and self.UPDATE_COMMENT
) or (
# not crawled yet, and the topic has comments
topic_reply_count > 0
and c == 0
):
total_page = math.ceil(topic_reply_count / 100)
for i in range(max(2, math.ceil(c / 100)), total_page + 1):
for j in self.crawl_comment(topic_id, i, response):
yield j

def crawl_comment(self, topic_id, page, response):
yield response.follow(
f"/t/{topic_id}?p={page}",
callback=self.parse_comment,
cb_kwargs={"topic_id": topic_id},
)

def parse_comment(self, response: scrapy.http.response.html.HtmlResponse, topic_id):
for comment_item in v2ex_parser.parse_comment(response, topic_id):
yield comment_item
for i in self.crawl_member(comment_item.commenter, response):
yield i

def crawl_member(self, username, response: scrapy.http.response.html.HtmlResponse):
if username != "" and (
self.UPDATE_MEMBER or not self.db.exist(MemberItem, username)
):
yield response.follow(
f"/member/{username}",
callback=self.parse_member,
errback=self.member_err,
cb_kwargs={"username": username},
)

def member_err(self, failure):
username = failure.request.cb_kwargs["username"]
self.logger.warn(f"Crawl Member Err {username}")
yield MemberItem(
username=username,
avatar_url="",
create_at=0,
social_link=[],
uid=-1,
)

def parse_member(
self, response: scrapy.http.response.html.HtmlResponse, username: str
):
self.logger.info(f"Crawl Member {username}")
for i in v2ex_parser.parse_member(response=response):
yield i
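
The paging arithmetic in parse_topic assumes V2EX serves 100 comments per page: page 1 arrives with the topic itself, the page count is ceil(reply_count / 100), and a resumed crawl restarts at max(2, ceil(c / 100)) so the last partially stored page is fetched again. A standalone sketch of that computation, under those assumptions (comment_pages is a hypothetical helper; the 100 mirrors the divisor hard-coded above):

import math

def comment_pages(reply_count: int, crawled: int, per_page: int = 100) -> list[int]:
    # page 1 is parsed together with the topic, so paging starts at 2
    total_page = math.ceil(reply_count / per_page)
    start = max(2, math.ceil(crawled / per_page))
    return list(range(start, total_page + 1))

# 250 replies with 120 comments already stored -> refetch pages 2 and 3
assert comment_pages(250, 120) == [2, 3]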
49 changes: 49 additions & 0 deletions v2ex_scrapy/spiders/V2exNodeTopicSpider.py
@@ -0,0 +1,49 @@
import scrapy
import scrapy.http.response.html

from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import TopicItem
from v2ex_scrapy.spiders.CommonSpider import CommonSpider


class V2exTopicSpider(scrapy.Spider):
name = "v2ex-node"

UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
UPDATE_COMMENT = True  # only takes effect when UPDATE_TOPIC_WHEN_REPLY_CHANGE = True

def __init__(self, node="flamewar", *args, **kwargs):
super().__init__(*args, **kwargs)
self.db = DB()
self.node = node
self.common_spider = CommonSpider(
self.logger, update_comment=self.UPDATE_COMMENT
)

def start_requests(self):
for i in range(552, 0, -1):
yield scrapy.Request(
url=f"https://www.v2ex.com/go/{self.node}?p={i}",
callback=self.parse,
cb_kwargs={"page": i},
)

def parse(self, response: scrapy.http.response.html.HtmlResponse, page: int):
topics = [
(int(x), int(y))
for x, y in zip(
response.xpath('//span[@class="item_title"]/a/@id').re(r"\d+"),
response.xpath('//span[@class="item_title"]/a/@href').re(r"reply(\d+)"),
)
]
for i, reply_count in topics:
if not self.db.exist(TopicItem, i) or (
self.UPDATE_TOPIC_WHEN_REPLY_CHANGE
and self.db.get_topic_comment_count(i) < reply_count
):
yield scrapy.Request(
url=f"https://www.v2ex.com/t/{i}",
callback=self.common_spider.parse_topic,
errback=self.common_spider.parse_topic_err,
cb_kwargs={"topic_id": i},
)
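
Since node is an ordinary spider argument, a different node can be targeted at launch time, e.g. scrapy crawl v2ex-node -a node=python. A programmatic sketch of the same thing (the node name "python" is only an example, and settings are left at their defaults):

from scrapy.crawler import CrawlerProcess

from v2ex_scrapy.spiders.V2exNodeTopicSpider import V2exTopicSpider

process = CrawlerProcess()
# keyword arguments to crawl() are forwarded to the spider's __init__
process.crawl(V2exTopicSpider, node="python")
process.start()

Note the hard-coded range(552, 0, -1) in start_requests: the page count is fixed rather than read from the node's pagination, so a node with more or fewer pages would need that bound adjusted.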
97 changes: 11 additions & 86 deletions v2ex_scrapy/spiders/V2exSpider.py
@@ -1,116 +1,41 @@
import math

import scrapy
import scrapy.http.response.html

from v2ex_scrapy import v2ex_parser
from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import MemberItem, TopicItem
from v2ex_scrapy.items import TopicItem
from v2ex_scrapy.spiders.CommonSpider import CommonSpider


class V2exTopicSpider(scrapy.Spider):
name = "v2ex"
start_id = 1
end_id = 1000000
UPDATE_TOPIC = False
UPDATE_COMMENT = False
UPDATE_MEMBER = False
# only takes effect when UPDATE_TOPIC = True
UPDATE_COMMENT = True

def __init__(self, name=None, **kwargs):
super().__init__(name, **kwargs)
self.db = DB()
self.start_id = self.db.get_max_topic_id()
self.common_spider = CommonSpider(
self.logger, update_comment=self.UPDATE_COMMENT
)
self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")

def start_requests(self):
# comments and member info from the previous run may be incomplete, so re-crawl the topic where we stopped
yield scrapy.Request(
url=f"https://www.v2ex.com/t/{self.start_id}",
callback=self.parse,
callback=self.common_spider.parse_topic,
errback=self.common_spider.parse_topic_err,
cb_kwargs={"topic_id": self.start_id},
)
for i in range(self.start_id + 1, self.end_id + 1):
if self.UPDATE_TOPIC or not self.db.exist(TopicItem, i):
yield scrapy.Request(
url=f"https://www.v2ex.com/t/{i}",
callback=self.parse,
errback=self.parse_topic_err,
callback=self.common_spider.parse_topic,
errback=self.common_spider.parse_topic_err,
cb_kwargs={"topic_id": i},
)

def parse_topic_err(self, failure):
topic_id = failure.request.cb_kwargs["topic_id"]
self.logger.warn(f"Crawl Topic Err {topic_id}")
yield TopicItem.err_topic(topic_id)

def parse(self, response: scrapy.http.response.html.HtmlResponse, topic_id: int):
self.logger.info(f"Crawl Topic {topic_id}")

if response.status == 302:
# need login or account too young
yield TopicItem.err_topic(topic_id=topic_id)
else:
for i in v2ex_parser.parse_topic_supplement(response, topic_id):
yield i
for topic in v2ex_parser.parse_topic(response, topic_id):
yield topic
for i in self.crawl_member(topic.author, response):
yield i
for i in self.parse_comment(response, topic_id):
yield i
# crawl sub page comment
topic_reply_count = int(
response.css(
"#Main > div:nth-child(4) > div:nth-child(1) > span::text"
).re_first(r"\d+", "-1")
)
if (
self.UPDATE_COMMENT
or self.db.get_topic_comment_count(topic_id) < topic_reply_count
):
total_page = math.ceil(topic_reply_count / 100)
for i in range(2, total_page + 1):
for j in self.crawl_comment(topic_id, i, response):
yield j

def crawl_comment(self, topic_id, page, response):
yield response.follow(
f"/t/{topic_id}?p={page}",
callback=self.parse_comment,
cb_kwargs={"topic_id": topic_id},
)

def parse_comment(self, response: scrapy.http.response.html.HtmlResponse, topic_id):
for comment_item in v2ex_parser.parse_comment(response, topic_id):
yield comment_item
for i in self.crawl_member(comment_item.commenter, response):
yield i

def crawl_member(self, username, response: scrapy.http.response.html.HtmlResponse):
if username != "" and (
self.UPDATE_MEMBER or not self.db.exist(MemberItem, username)
):
yield response.follow(
f"/member/{username}",
callback=self.parse_member,
errback=self.member_err,
cb_kwargs={"username": username},
)

def member_err(self, failure):
username = failure.request.cb_kwargs["username"]
self.logger.warn(f"Crawl Member Err {username}")
yield MemberItem(
username=username,
avatar_url="",
create_at=0,
social_link=[],
uid=-1,
)

def parse_member(
self, response: scrapy.http.response.html.HtmlResponse, username: str
):
self.logger.info(f"Crawl Member {username}")
for i in v2ex_parser.parse_member(response=response):
yield i
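
The net effect of this last diff is a refactor to composition: the duplicated topic, comment, and member callbacks move into CommonSpider, and each scrapy.Spider points its Request callbacks at the helper's bound methods, which Scrapy accepts like any other callable. In outline (a sketch only; the spider name, URL, and topic id are placeholders):

import scrapy

from v2ex_scrapy.spiders.CommonSpider import CommonSpider

class AnyTopicSpider(scrapy.Spider):  # hypothetical name
    name = "example"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # shared parsing and error handling live on a helper, not a base class
        self.common_spider = CommonSpider(self.logger)

    def start_requests(self):
        yield scrapy.Request(
            url="https://www.v2ex.com/t/1",
            callback=self.common_spider.parse_topic,
            errback=self.common_spider.parse_topic_err,
            cb_kwargs={"topic_id": 1},
        )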
