From bed01ae7a278f2669a6b5384d7646b9932695de1 Mon Sep 17 00:00:00 2001
From: oldshensheep
Date: Sat, 8 Jul 2023 22:50:52 +0800
Subject: [PATCH] change database schema, auto-detect max page of node

---
 requirements-analysis.txt                  |  2 ++
 requirements.txt                           |  3 +--
 v2ex_scrapy/DB.py                          |  6 ++---
 v2ex_scrapy/items.py                       |  5 +++-
 v2ex_scrapy/middlewares.py                 | 18 ++++++++------
 v2ex_scrapy/pipelines.py                   | 13 +++++-----
 v2ex_scrapy/spiders/CommonSpider.py        |  5 +---
 v2ex_scrapy/spiders/V2exNodeTopicSpider.py | 29 ++++++++++++++++++++--
 v2ex_scrapy/utils.py                       |  7 ++++++
 v2ex_scrapy/v2ex_parser.py                 |  7 +++++-
 10 files changed, 67 insertions(+), 28 deletions(-)
 create mode 100644 requirements-analysis.txt

diff --git a/requirements-analysis.txt b/requirements-analysis.txt
new file mode 100644
index 0000000..8a276d6
--- /dev/null
+++ b/requirements-analysis.txt
@@ -0,0 +1,2 @@
+pandas==2.0.1
+plotly==5.14.1
diff --git a/requirements.txt b/requirements.txt
index cb31c2e..e6fb44b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 arrow==1.2.3
-pandas==2.0.1
-plotly==5.14.1
+httpx==0.24.1
 Scrapy==2.9.0
 SQLAlchemy==2.0.17
diff --git a/v2ex_scrapy/DB.py b/v2ex_scrapy/DB.py
index a0cb1e9..95f6b36 100644
--- a/v2ex_scrapy/DB.py
+++ b/v2ex_scrapy/DB.py
@@ -26,9 +26,9 @@ def __new__(cls):
             cls._instance = super().__new__(cls)
         return cls._instance
 
-    def __init__(self):
+    def __init__(self, database_name="v2ex.sqlite"):
         self.engine = create_engine(
-            "sqlite:///v2ex.sqlite",
+            f"sqlite:///{database_name}",
             echo=False,
             json_serializer=lambda x: json.dumps(x, ensure_ascii=False),
         )
@@ -61,7 +61,7 @@ def get_max_topic_id(self) -> int:
 
     def get_topic_comment_count(self, topic_id) -> int:
         result = self.session.execute(
-            text("select count(*) from comment where topic_id = :q"), {"q": topic_id}
+            text("select reply_count from topic where id = :q"), {"q": topic_id}
         ).fetchone()
         if result is None or result[0] is None:
             return 0
diff --git a/v2ex_scrapy/items.py b/v2ex_scrapy/items.py
index 17c51dc..b356e7e 100644
--- a/v2ex_scrapy/items.py
+++ b/v2ex_scrapy/items.py
@@ -26,7 +26,7 @@ class TopicItem(Base):
     id_: Mapped[int] = mapped_column(name="id", primary_key=True)
     author: Mapped[str] = mapped_column(nullable=False)
     title: Mapped[str] = mapped_column(nullable=False)
-    content: Mapped[str]
+    content: Mapped[str] = mapped_column()
     node: Mapped[str] = mapped_column(nullable=False)
     tag: Mapped[list[str]] = mapped_column(nullable=False)
     clicks: Mapped[int] = mapped_column(nullable=False)
@@ -34,6 +34,7 @@ class TopicItem(Base):
     create_at: Mapped[int] = mapped_column(nullable=False)
     thank_count: Mapped[int] = mapped_column(nullable=False)
     favorite_count: Mapped[int] = mapped_column(nullable=False)
+    reply_count: Mapped[int] = mapped_column(nullable=False)
 
     @staticmethod
     def err_topic(topic_id: int):
@@ -49,6 +50,7 @@ def err_topic(topic_id: int):
             votes=-1,
             thank_count=-1,
             favorite_count=-1,
+            reply_count=-1,
         )
 
 
@@ -72,6 +74,7 @@ class CommentItem(Base):
     content: Mapped[str] = mapped_column(nullable=False)
     thank_count: Mapped[int] = mapped_column(nullable=False)
     create_at: Mapped[int] = mapped_column(nullable=False)
+    no: Mapped[int] = mapped_column(nullable=False)
 
 
 @dataclass(kw_only=True)
diff --git a/v2ex_scrapy/middlewares.py b/v2ex_scrapy/middlewares.py
index f570a4a..8fe6995 100644
--- a/v2ex_scrapy/middlewares.py
+++ b/v2ex_scrapy/middlewares.py
@@ -7,13 +7,13 @@
 
 import random
 import time
-from http.cookies import SimpleCookie
 
 import scrapy
 import scrapy.http.response.html
 from scrapy import signals
-
+from scrapy.exceptions import IgnoreRequest
 from v2ex_scrapy.DB import DB, LogItem
+from v2ex_scrapy import utils
 
 
 class TutorialScrapySpiderMiddleware:
@@ -95,10 +95,14 @@ def process_request(self, request: scrapy.Request, spider):
         return None
 
     def process_response(
-        self, request, response: scrapy.http.response.html.HtmlResponse, spider
+        self,
+        request: scrapy.Request,
+        response: scrapy.http.response.html.HtmlResponse,
+        spider: scrapy.Spider,
     ):
         # Called with the response returned from the downloader.
-
+        if response.status == 403:
+            raise IgnoreRequest(f"403 url {response.url}")
         # Must either;
         # - return a Response object
         # - return a Request object
@@ -118,10 +122,8 @@ def process_exception(self, request, exception, spider):
 
     def spider_opened(self, spider: scrapy.Spider):
         self.proxies = spider.settings.get("PROXIES", [])  # type: ignore
-        if type(cookie_str := spider.settings.get("COOKIES", "")) == str:
-            simple_cookie = SimpleCookie()
-            simple_cookie.load(cookie_str)  # type: ignore
-            self.cookies = {k: v.value for k, v in simple_cookie.items()}
+        cookie_str = spider.settings.get("COOKIES", "")
+        self.cookies = utils.cookie_str2cookie_dict(cookie_str)  # type: ignore
         spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/v2ex_scrapy/pipelines.py b/v2ex_scrapy/pipelines.py
index f226ff2..4b7eccb 100644
--- a/v2ex_scrapy/pipelines.py
+++ b/v2ex_scrapy/pipelines.py
@@ -11,12 +11,7 @@
 # don't remove
 import v2ex_scrapy.insert_ignore
 from v2ex_scrapy.DB import DB
-from v2ex_scrapy.items import (
-    CommentItem,
-    MemberItem,
-    TopicItem,
-    TopicSupplementItem,
-)
+from v2ex_scrapy.items import CommentItem, MemberItem, TopicItem, TopicSupplementItem
 
 ItemsType = Union[TopicItem, CommentItem, MemberItem, TopicSupplementItem]
 
@@ -48,7 +43,11 @@ def process_item(
         self.db.session.commit()
         return item
 
-    def close_spider(self, spider):
+    def save_all(self):
         for _, v in self.data.items():
             self.db.session.add_all(v)
+        self.db.session.commit()
+
+    def close_spider(self, spider):
+        self.save_all()
         self.db.close()
diff --git a/v2ex_scrapy/spiders/CommonSpider.py b/v2ex_scrapy/spiders/CommonSpider.py
index 8c521fe..3bed1c1 100644
--- a/v2ex_scrapy/spiders/CommonSpider.py
+++ b/v2ex_scrapy/spiders/CommonSpider.py
@@ -9,13 +9,10 @@
 
 
 class CommonSpider:
-    def __init__(
-        self, logger, update_topic=False, update_member=False, update_comment=False
-    ):
+    def __init__(self, logger, update_member=False, update_comment=False):
         self.db = DB()
         self.logger = logger
         self.UPDATE_MEMBER = update_member
-        # only work when UPDATE_TOPIC
         self.UPDATE_COMMENT = update_comment
 
     def parse_topic_err(self, failure):
diff --git a/v2ex_scrapy/spiders/V2exNodeTopicSpider.py b/v2ex_scrapy/spiders/V2exNodeTopicSpider.py
index 2c043fa..865081a 100644
--- a/v2ex_scrapy/spiders/V2exNodeTopicSpider.py
+++ b/v2ex_scrapy/spiders/V2exNodeTopicSpider.py
@@ -1,9 +1,13 @@
+import httpx
 import scrapy
 import scrapy.http.response.html
+from parsel import Selector
+from scrapy.utils.project import get_project_settings
 
 from v2ex_scrapy.DB import DB
 from v2ex_scrapy.items import TopicItem
 from v2ex_scrapy.spiders.CommonSpider import CommonSpider
+from v2ex_scrapy import utils
 
 
 class V2exTopicSpider(scrapy.Spider):
@@ -11,6 +15,12 @@ class V2exTopicSpider(scrapy.Spider):
 
     UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
     UPDATE_COMMENT = True  # only work when UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
+    URL = "https://www.v2ex.com/go/"
+
+    """
+    Known issue: topic ordering within a node is dynamic; if an uncrawled topic moves onto an already-crawled page, it will be missed.
+    Possible fix 1: collect all topic IDs before crawling starts; fetching the IDs is fast, so the ordering should not shift much in the meantime.
+    """
 
     def __init__(self, node="flamewar", *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -19,11 +29,25 @@ def __init__(self, node="flamewar", *args, **kwargs):
         self.common_spider = CommonSpider(
             self.logger, update_comment=self.UPDATE_COMMENT
         )
+        settings = get_project_settings()
+        resp = httpx.get(
+            f"{self.URL}{self.node}",
+            timeout=10,
+            follow_redirects=True,
+            cookies=utils.cookie_str2cookie_dict(settings.get("COOKIES", "")),  # type: ignore
+            headers={"User-Agent": settings.get("USER_AGENT", "")},  # type: ignore
+        ).text
+        max_page = (
+            Selector(text=resp)
+            .xpath('//tr/td[@align="left" and @width="92%"]/a[last()]/text()')
+            .get("1")
+        )
+        self.max_page = int(max_page)
 
     def start_requests(self):
-        for i in range(552, 0, -1):
+        for i in range(self.max_page, 0, -1):
             yield scrapy.Request(
-                url=f"https://www.v2ex.com/go/{self.node}?p={i}",
+                url=f"{self.URL}{self.node}?p={i}",
                 callback=self.parse,
                 cb_kwargs={"page": i},
             )
@@ -33,6 +57,7 @@ def parse(self, response: scrapy.http.response.html.HtmlResponse, page: int):
             (int(x), int(y))
             for x, y in zip(
                 response.xpath('//span[@class="item_title"]/a/@id').re(r"\d+"),
+                # the reply count in the href is not accurate when comments have been deleted
                 response.xpath('//span[@class="item_title"]/a/@href').re(r"reply(\d+)"),
             )
         ]
diff --git a/v2ex_scrapy/utils.py b/v2ex_scrapy/utils.py
index bef7d33..c636a58 100644
--- a/v2ex_scrapy/utils.py
+++ b/v2ex_scrapy/utils.py
@@ -1,3 +1,4 @@
+from http.cookies import SimpleCookie
 import json
 from typing import Union
 
@@ -27,6 +28,12 @@ def json_to_str(j):
     return json.dumps(j, ensure_ascii=False)
 
 
+def cookie_str2cookie_dict(cookie_str: str):
+    simple_cookie = SimpleCookie()
+    simple_cookie.load(cookie_str)
+    return {k: v.value for k, v in simple_cookie.items()}
+
+
 if __name__ == "__main__":
     a = ["2022-04-28 13:24:38 +08:00", "287 天前", "1 小时前"]
diff --git a/v2ex_scrapy/v2ex_parser.py b/v2ex_scrapy/v2ex_parser.py
index 51bca9a..da6f2e9 100644
--- a/v2ex_scrapy/v2ex_parser.py
+++ b/v2ex_scrapy/v2ex_parser.py
@@ -42,8 +42,10 @@ def parse_comment(response: scrapy.http.response.html.HtmlResponse, topic_id):
         reply_content = cbox.xpath('.//div[@class="reply_content"]').get("")
         reply_time = cbox.css(".ago::attr(title)").get("")
         thank_count = cbox.css(".fade::text").get("0").strip()
+        no = cbox.css(".no::text").get("-1").strip()
         yield CommentItem(
             id_=int(comment_id),
+            no=int(no),
             commenter=author_name,
             topic_id=topic_id,
             content=reply_content,
@@ -74,7 +76,9 @@ def parse_topic(response: scrapy.http.response.html.HtmlResponse, topic_id):
     )
 
     topic_content = response.css(".cell .topic_content").get("")
-
+    topic_reply_count = response.css(".box > .cell > .gray::text").re_first(
+        r"(\d+) 条回复", "0"
+    )
     yield TopicItem(
         id_=topic_id,
         author=topic_author,
@@ -87,6 +91,7 @@ def parse_topic(response: scrapy.http.response.html.HtmlResponse, topic_id):
         votes=int(topic_vote),
         thank_count=int(topic_thank_count),
         favorite_count=int(topic_favorite_count),
+        reply_count=int(topic_reply_count),
     )
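
A note on the new database_name parameter: DB caches its instance in __new__, but Python still runs __init__ on every DB(...) call, so a later call with a different name quietly rebinds the shared instance's engine rather than being ignored. A minimal sketch of that behaviour (file names are placeholders; it assumes the repo is on PYTHONPATH):

    from v2ex_scrapy.DB import DB

    db1 = DB()                    # engine bound to the default v2ex.sqlite
    db2 = DB("analysis.sqlite")   # same cached instance, but __init__ ran again,
                                  # so the engine now points at analysis.sqlite
    assert db1 is db2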
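
The new utils.cookie_str2cookie_dict helper lifts the SimpleCookie parsing that spider_opened used to do inline, so the middleware and the spider's __init__ can share it. A quick usage sketch (cookie names and values here are made-up placeholders):

    from v2ex_scrapy import utils

    # A raw Cookie header string, as it would appear in settings["COOKIES"].
    cookies = utils.cookie_str2cookie_dict("A2=abc123; PB3_SESSION=xyz789")
    print(cookies)  # {'A2': 'abc123', 'PB3_SESSION': 'xyz789'}

    # The resulting dict plugs straight into httpx.get(..., cookies=cookies),
    # which is how V2exTopicSpider.__init__ uses it.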
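
The max-page auto-detection hinges on a single XPath against the pagination table of the node page. The expression can be sanity-checked offline; the markup below is a simplified stand-in for the real https://www.v2ex.com/go/<node> page, so the structure (not the values) is the assumption here:

    from parsel import Selector

    # Simplified stand-in for the pagination row on a node page.
    html = """
    <table><tr>
      <td align="left" width="92%">
        <a href="?p=1">1</a><a href="?p=2">2</a><a href="?p=552">552</a>
      </td>
      <td align="right" width="8%"></td>
    </tr></table>
    """

    max_page = (
        Selector(text=html)
        .xpath('//tr/td[@align="left" and @width="92%"]/a[last()]/text()')
        .get("1")  # same fallback as the spider: a single-page node has no page links
    )
    print(int(max_page))  # 552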
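
On the schema change itself: get_topic_comment_count now reads the reply_count stored on topic (scraped from the "N 条回复" header) instead of counting rows in comment. The two figures can drift apart once comments are deleted, which the in-diff comment calls out; the new per-comment no column presumably keeps the floor number as displayed, gaps included. A sketch comparing the two counts, assuming a v2ex.sqlite built with the new schema and a placeholder topic id:

    import sqlite3

    conn = sqlite3.connect("v2ex.sqlite")
    topic_id = 1  # placeholder id

    # Old behaviour: count the comment rows that were actually scraped.
    scraped = conn.execute(
        "select count(*) from comment where topic_id = ?", (topic_id,)
    ).fetchone()[0]

    # New behaviour: trust the reply count shown on the topic page.
    row = conn.execute(
        "select reply_count from topic where id = ?", (topic_id,)
    ).fetchone()
    shown = 0 if row is None or row[0] is None else row[0]

    # A mismatch points at deleted comments (or a partially crawled topic).
    print(scraped, shown)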