change database schema, auto detect max page of node
oldshensheep committed Jul 8, 2023
1 parent e3619dd commit bed01ae
Showing 10 changed files with 67 additions and 28 deletions.
2 changes: 2 additions & 0 deletions requirements-analysis.txt
@@ -0,0 +1,2 @@
pandas==2.0.1
plotly==5.14.1
3 changes: 1 addition & 2 deletions requirements.txt
@@ -1,5 +1,4 @@
arrow==1.2.3
pandas==2.0.1
plotly==5.14.1
httpx==0.24.1
Scrapy==2.9.0
SQLAlchemy==2.0.17
6 changes: 3 additions & 3 deletions v2ex_scrapy/DB.py
@@ -26,9 +26,9 @@ def __new__(cls):
cls._instance = super().__new__(cls)
return cls._instance

def __init__(self):
def __init__(self, database_name="v2ex.sqlite"):
self.engine = create_engine(
"sqlite:///v2ex.sqlite",
f"sqlite:///{database_name}",
echo=False,
json_serializer=lambda x: json.dumps(x, ensure_ascii=False),
)
@@ -61,7 +61,7 @@ def get_max_topic_id(self) -> int:

def get_topic_comment_count(self, topic_id) -> int:
result = self.session.execute(
text("select count(*) from comment where topic_id = :q"), {"q": topic_id}
text("select reply_count from topic where id = :q"), {"q": topic_id}
).fetchone()
if result is None or result[0] is None:
return 0
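
Note on the get_topic_comment_count change: the old query counted crawled rows in comment, so it lagged whenever a topic had not been fully crawled; the new query trusts the reply_count stored on the topic row. A minimal sketch of the difference, assuming only an in-memory SQLite with the same column names as the real schema:

from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:")
with engine.begin() as conn:
    conn.execute(text("CREATE TABLE topic (id INTEGER PRIMARY KEY, reply_count INTEGER)"))
    conn.execute(text("CREATE TABLE comment (id INTEGER PRIMARY KEY, topic_id INTEGER)"))
    conn.execute(text("INSERT INTO topic VALUES (1, 3)"))     # site reports 3 replies
    conn.execute(text("INSERT INTO comment VALUES (10, 1)"))  # only 1 crawled so far

with engine.connect() as conn:
    # old query: counts rows already crawled, so it under-reports here
    old = conn.execute(text("select count(*) from comment where topic_id = :q"), {"q": 1}).scalar()
    # new query: reads the count the site itself reported for the topic
    new = conn.execute(text("select reply_count from topic where id = :q"), {"q": 1}).scalar()
    print(old, new)  # 1 3
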
5 changes: 4 additions & 1 deletion v2ex_scrapy/items.py
@@ -26,14 +26,15 @@ class TopicItem(Base):
id_: Mapped[int] = mapped_column(name="id", primary_key=True)
author: Mapped[str] = mapped_column(nullable=False)
title: Mapped[str] = mapped_column(nullable=False)
content: Mapped[str]
content: Mapped[str] = mapped_column()
node: Mapped[str] = mapped_column(nullable=False)
tag: Mapped[list[str]] = mapped_column(nullable=False)
clicks: Mapped[int] = mapped_column(nullable=False)
votes: Mapped[int] = mapped_column(nullable=False)
create_at: Mapped[int] = mapped_column(nullable=False)
thank_count: Mapped[int] = mapped_column(nullable=False)
favorite_count: Mapped[int] = mapped_column(nullable=False)
reply_count: Mapped[int] = mapped_column(nullable=False)

@staticmethod
def err_topic(topic_id: int):
@@ -49,6 +50,7 @@ def err_topic(topic_id: int):
votes=-1,
thank_count=-1,
favorite_count=-1,
reply_count=-1,
)


@@ -72,6 +74,7 @@ class CommentItem(Base):
content: Mapped[str] = mapped_column(nullable=False)
thank_count: Mapped[int] = mapped_column(nullable=False)
create_at: Mapped[int] = mapped_column(nullable=False)
no: Mapped[int] = mapped_column(nullable=False)


@dataclass(kw_only=True)
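
Because reply_count and no are added as nullable=False columns, a v2ex.sqlite produced by the old schema needs a one-off migration before the new models can write to it. An untested sketch; the -1 default mirrors the err_topic sentinel above and is an assumption, not something this commit ships:

import sqlite3

conn = sqlite3.connect("v2ex.sqlite")
# SQLite allows ADD COLUMN with NOT NULL only when a default is supplied;
# "no" is quoted because it is an SQL keyword.
conn.execute("ALTER TABLE topic ADD COLUMN reply_count INTEGER NOT NULL DEFAULT -1")
conn.execute('ALTER TABLE comment ADD COLUMN "no" INTEGER NOT NULL DEFAULT -1')
conn.commit()
conn.close()
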
18 changes: 10 additions & 8 deletions v2ex_scrapy/middlewares.py
@@ -7,13 +7,13 @@

import random
import time
from http.cookies import SimpleCookie

import scrapy
import scrapy.http.response.html
from scrapy import signals

from scrapy.exceptions import IgnoreRequest
from v2ex_scrapy.DB import DB, LogItem
from v2ex_scrapy import utils


class TutorialScrapySpiderMiddleware:
@@ -95,10 +95,14 @@ def process_request(self, request: scrapy.Request, spider):
return None

def process_response(
self, request, response: scrapy.http.response.html.HtmlResponse, spider
self,
request: scrapy.Request,
response: scrapy.http.response.html.HtmlResponse,
spider: scrapy.Spider,
):
# Called with the response returned from the downloader.

if response.status == 403:
raise IgnoreRequest(f"403 url {response.url}")
# Must either;
# - return a Response object
# - return a Request object
@@ -118,10 +122,8 @@ def process_exception(self, request, exception, spider):
def spider_opened(self, spider: scrapy.Spider):
self.proxies = spider.settings.get("PROXIES", []) # type: ignore

if type(cookie_str := spider.settings.get("COOKIES", "")) == str:
simple_cookie = SimpleCookie()
simple_cookie.load(cookie_str) # type: ignore
self.cookies = {k: v.value for k, v in simple_cookie.items()}
cookie_str = spider.settings.get("COOKIES", "")
self.cookies = utils.cookie_str2cookie_dict(cookie_str) # type: ignore

spider.logger.info("Spider opened: %s" % spider.name)

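
The new 403 branch in process_response stops Scrapy from retrying blocked pages: raising IgnoreRequest drops the request and routes it to the request's errback instead. A stripped-down sketch of just that behavior; DropForbiddenMiddleware is a name invented for this sketch, not part of the repo:

from scrapy.exceptions import IgnoreRequest

class DropForbiddenMiddleware:
    def process_response(self, request, response, spider):
        if response.status == 403:
            # Scrapy discards the request; errbacks (e.g. parse_topic_err) still fire.
            raise IgnoreRequest(f"403 url {response.url}")
        return response
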
13 changes: 6 additions & 7 deletions v2ex_scrapy/pipelines.py
@@ -11,12 +11,7 @@
# don't remove
import v2ex_scrapy.insert_ignore
from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import (
CommentItem,
MemberItem,
TopicItem,
TopicSupplementItem,
)
from v2ex_scrapy.items import CommentItem, MemberItem, TopicItem, TopicSupplementItem

ItemsType = Union[TopicItem, CommentItem, MemberItem, TopicSupplementItem]

@@ -48,7 +43,11 @@ def process_item(
self.db.session.commit()
return item

def close_spider(self, spider):
def save_all(self):
for _, v in self.data.items():
self.db.session.add_all(v)
self.db.session.commit()

def close_spider(self, spider):
self.save_all()
self.db.close()
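
Extracting save_all out of close_spider means the buffered items can be flushed from anywhere, not only at shutdown. A self-contained sketch of the buffering pattern the hunk implies; keying self.data by item type is an assumption read off the for-loop, not confirmed by the diff:

class BufferedWriter:
    """Collects ORM objects per type and writes them in one commit."""

    def __init__(self, session):
        self.session = session  # a SQLAlchemy session, as in DB
        self.data: dict[type, list] = {}

    def add(self, item) -> None:
        self.data.setdefault(type(item), []).append(item)

    def save_all(self) -> None:
        for _, v in self.data.items():
            self.session.add_all(v)
        self.session.commit()

    def close(self) -> None:
        # mirrors close_spider: one final flush, then the caller closes the DB
        self.save_all()
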
5 changes: 1 addition & 4 deletions v2ex_scrapy/spiders/CommonSpider.py
@@ -9,13 +9,10 @@


class CommonSpider:
def __init__(
self, logger, update_topic=False, update_member=False, update_comment=False
):
def __init__(self, logger, update_member=False, update_comment=False):
self.db = DB()
self.logger = logger
self.UPDATE_MEMBER = update_member
# only work when UPDATE_TOPIC
self.UPDATE_COMMENT = update_comment

def parse_topic_err(self, failure):
29 changes: 27 additions & 2 deletions v2ex_scrapy/spiders/V2exNodeTopicSpider.py
@@ -1,16 +1,26 @@
import httpx
import scrapy
import scrapy.http.response.html
from parsel import Selector
from scrapy.utils.project import get_project_settings

from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import TopicItem
from v2ex_scrapy.spiders.CommonSpider import CommonSpider
from v2ex_scrapy import utils


class V2exTopicSpider(scrapy.Spider):
name = "v2ex-node"

UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
UPDATE_COMMENT = True  # only works when UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
URL = "https://www.v2ex.com/go/"

"""
Known issue: topics in a node are ordered dynamically, so if a not-yet-crawled topic moves onto a page that has already been crawled, it will be missed.
Possible fix 1: fetch all topic IDs first, then start crawling; collecting IDs is fast, so the ordering should not shift much in the meantime.
"""

def __init__(self, node="flamewar", *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -19,11 +29,25 @@ def __init__(self, node="flamewar", *args, **kwargs):
self.common_spider = CommonSpider(
self.logger, update_comment=self.UPDATE_COMMENT
)
settings = get_project_settings()
resp = httpx.get(
f"{self.URL}{self.node}",
timeout=10,
follow_redirects=True,
cookies=utils.cookie_str2cookie_dict(settings.get("COOKIES", "")), # type: ignore
headers={"User-Agent": settings.get("USER_AGENT", "")}, # type: ignore
).text
max_page = (
Selector(text=resp)
.xpath('//tr/td[@align="left" and @width="92%"]/a[last()]/text()')
.get("1")
)
self.max_page = int(max_page)

def start_requests(self):
for i in range(552, 0, -1):
for i in range(self.max_page, 0, -1):
yield scrapy.Request(
url=f"https://www.v2ex.com/go/{self.node}?p={i}",
url=f"{self.URL}{self.node}?p={i}",
callback=self.parse,
cb_kwargs={"page": i},
)
@@ -33,6 +57,7 @@ def parse(self, response: scrapy.http.response.html.HtmlResponse, page: int):
(int(x), int(y))
for x, y in zip(
response.xpath('//span[@class="item_title"]/a/@id').re(r"\d+"),
# not correct when some comments have been deleted
response.xpath('//span[@class="item_title"]/a/@href').re(r"reply(\d+)"),
)
]
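
The auto-detection that replaces the hard-coded range(552, 0, -1) comes down to one XPath over the node's first page: take the text of the last pagination link, defaulting to "1" for single-page nodes. A standalone sketch, with the markup reduced to the shape the selector expects (the real page wraps this in more tables and attributes):

from parsel import Selector

html = """
<table><tr><td align="left" width="92%">
  <a href="?p=1">1</a> <a href="?p=2">2</a> <a href="?p=552">552</a>
</td></tr></table>
"""

max_page = (
    Selector(text=html)
    .xpath('//tr/td[@align="left" and @width="92%"]/a[last()]/text()')
    .get("1")  # default when a node has no pagination links
)
print(int(max_page))  # 552
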
7 changes: 7 additions & 0 deletions v2ex_scrapy/utils.py
@@ -1,3 +1,4 @@
from http.cookies import SimpleCookie
import json
from typing import Union

@@ -27,6 +28,12 @@ def json_to_str(j):
return json.dumps(j, ensure_ascii=False)


def cookie_str2cookie_dict(cookie_str: str):
simple_cookie = SimpleCookie()
simple_cookie.load(cookie_str)
return {k: v.value for k, v in simple_cookie.items()}


if __name__ == "__main__":
a = ["2022-04-28 13:24:38 +08:00", "287 天前", "1 小时前"]

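
cookie_str2cookie_dict centralizes the SimpleCookie parsing that middlewares.py previously did inline, so the spider's httpx request above can reuse it. Usage is just the following; the cookie names are example values, not real credentials:

cookies = cookie_str2cookie_dict("A2=abc123; PB3_SESSION=xyz")
print(cookies)  # {'A2': 'abc123', 'PB3_SESSION': 'xyz'}
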
7 changes: 6 additions & 1 deletion v2ex_scrapy/v2ex_parser.py
@@ -42,8 +42,10 @@ def parse_comment(response: scrapy.http.response.html.HtmlResponse, topic_id):
reply_content = cbox.xpath('.//div[@class="reply_content"]').get("")
reply_time = cbox.css(".ago::attr(title)").get("")
thank_count = cbox.css(".fade::text").get("0").strip()
no = cbox.css(".no::text").get("-1").strip()
yield CommentItem(
id_=int(comment_id),
no=int(no),
commenter=author_name,
topic_id=topic_id,
content=reply_content,
@@ -74,7 +76,9 @@ def parse_topic(response: scrapy.http.response.html.HtmlResponse, topic_id):
)

topic_content = response.css(".cell .topic_content").get("")

topic_reply_count = response.css(".box > .cell > .gray::text").re_first(
r"(\d+) 条回复", "0"
)
yield TopicItem(
id_=topic_id,
author=topic_author,
@@ -87,6 +91,7 @@ def parse_topic(response: scrapy.http.response.html.HtmlResponse, topic_id):
votes=int(topic_vote),
thank_count=int(topic_thank_count),
favorite_count=int(topic_favorite_count),
reply_count=int(topic_reply_count),
)


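
The new reply_count extraction relies on the topic page's header cell reading "N 条回复" ("N replies"); re_first falls back to "0" for topics with no replies. A minimal reproduction, assuming that header markup:

from parsel import Selector

html = '<div class="box"><div class="cell"><span class="gray">42 条回复</span></div></div>'
count = (
    Selector(text=html)
    .css(".box > .cell > .gray::text")
    .re_first(r"(\d+) 条回复", "0")  # default "0" when the cell is absent
)
print(int(count))  # 42
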
