Feat: process_start_urls in parallel #159

Open · wants to merge 2 commits into master
2 changes: 0 additions & 2 deletions ruia/request.py
@@ -7,15 +7,13 @@

import asyncio
import weakref

from asyncio.locks import Semaphore
from inspect import iscoroutinefunction
from types import AsyncGeneratorType
from typing import Coroutine, Optional, Tuple

import aiohttp
import async_timeout

from ruia.exceptions import InvalidRequestMethod
from ruia.response import Response
from ruia.utils import get_logger
3 changes: 2 additions & 1 deletion ruia/response.py
@@ -6,7 +6,6 @@

import asyncio
import json

from http.cookies import SimpleCookie
from typing import Any, Callable, Optional

@@ -163,6 +162,8 @@ async def text(
) -> str:
"""Read response payload and decode."""
encoding = encoding or self._encoding
if self._aws_text is None:
return ''
self._html = await self._aws_text(encoding=encoding, errors=errors)
return self._html

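For reference, the added guard means Response.text() now returns an empty string when no awaitable body reader is attached, instead of failing on a None call. A minimal, self-contained sketch of that behavior (the stand-in class below is illustrative only, not ruia's actual Response):

import asyncio

class Response:
    """Stripped-down stand-in for ruia.Response, keeping only the guarded text() path."""

    def __init__(self, aws_text=None, encoding="utf-8"):
        self._aws_text = aws_text  # awaitable body reader, or None
        self._encoding = encoding
        self._html = ""

    async def text(self, encoding=None, errors="ignore"):
        encoding = encoding or self._encoding
        if self._aws_text is None:  # the guard added in this PR
            return ""
        self._html = await self._aws_text(encoding=encoding, errors=errors)
        return self._html

async def main():
    # No body reader attached: text() returns '' instead of raising.
    print(repr(await Response().text()))

asyncio.run(main())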
44 changes: 34 additions & 10 deletions ruia/spider.py
@@ -21,7 +21,6 @@
from types import AsyncGeneratorType

from aiohttp import ClientSession

from ruia.exceptions import NothingMatchedError, NotImplementedParseError
from ruia.item import Item
from ruia.middleware import Middleware
@@ -67,6 +66,7 @@ class Spider(SpiderHook):
# Concurrency control
worker_numbers: int = 2
concurrency: int = 3
max_batch_size: int = 30

# Spider entry
start_urls: list = []
@@ -82,6 +82,7 @@ def __init__(
cancel_tasks: bool = True,
**spider_kwargs,
):
print("use hf ruia")
"""
Init spider object.
:param middleware: a list of or a single Middleware
@@ -166,12 +167,13 @@ async def _process_response(self, request: Request, response: Response):
if response:
if response.ok:
# Process succeed response
self.success_counts += 1
await self.process_succeed_response(request, response)
return True
else:
# Process failed response
self.failed_counts += 1
await self.process_failed_response(request, response)
return False
return False

async def _run_request_middleware(self, request: Request):
if self.middleware.request_middleware:
@@ -350,12 +352,15 @@ async def handle_request(
typing.Tuple[AsyncGeneratorType, Request, Response]: Returns a result tuple after each request
"""
callback_result, response = None, None
if_success = False

try:
await self._run_request_middleware(request)
callback_result, response = await request.fetch_callback(self.sem)
await self._run_response_middleware(request, response)
await self._process_response(request=request, response=response)
if_success = await self._process_response(
request=request, response=response
)
except NotImplementedParseError as e:
self.logger.error(e)
except NothingMatchedError as e:
@@ -364,6 +369,11 @@
except Exception as e:
self.logger.error(f"<Callback[{request.callback.__name__}]: {e}")

if if_success:
self.success_counts += 1
else:
self.failed_counts += 1

return callback_result, request, response

async def multiple_request(self, urls, is_gather=False, **kwargs):
@@ -463,31 +473,41 @@ async def start_master(self):
"""
Actually start crawling
"""
async for request_ins in self.process_start_urls():
self.request_queue.put_nowait(self.handle_request(request_ins))
process_urls_task = asyncio.create_task(self.enqueue_start_urls())

workers = [
asyncio.ensure_future(self.start_worker())
asyncio.ensure_future(self.start_worker(i))
for i in range(self.worker_numbers)
]
for worker in workers:
self.logger.info(f"Worker started: {id(worker)}")
await self.request_queue.join()

await asyncio.gather(process_urls_task, self.request_queue.join())

if not self.is_async_start:
await self.stop(SIGINT)
else:
if self.cancel_tasks:
await self.cancel_all_tasks()

async def start_worker(self):
async def enqueue_start_urls(self):
async for request_ins in self.process_start_urls():
await self.request_queue.put(self.handle_request(request_ins))

async def start_worker(self, i: int):
"""
Start spider worker
:return:
"""
while True:
request_item = await self.request_queue.get()
self.worker_tasks.append(request_item)
if self.request_queue.empty():
# TODO: This isn't ideal; it pulls everything off the queue first and only then
# sends the requests. What if the stream of URLs is unbounded?
if (
self.request_queue.empty()
or len(self.worker_tasks) > self.max_batch_size
):
results = await asyncio.gather(
*self.worker_tasks, return_exceptions=True
)
@@ -505,12 +525,16 @@ async def start_worker(self):
self.worker_tasks = []
self.request_queue.task_done()

async def cancel_callback(self):
self.logger.info("Spider Cancaled")

async def stop(self, _signal):
"""
Finish all running tasks, cancel remaining tasks.
:param _signal:
:return:
"""
self.logger.info(f"Stopping spider: {self.name}")
await self.cancel_callback()
await self.cancel_all_tasks()
# self.loop.stop()
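Taken together, the spider.py changes move from "enqueue all start URLs, then start workers" to a producer task that feeds the queue while workers drain it in batches capped by max_batch_size. A minimal standalone sketch of that pattern (fetch, the URL list, and the two-worker count are hypothetical placeholders; only the queue/batch/gather structure mirrors the diff above):

import asyncio

MAX_BATCH_SIZE = 30  # mirrors Spider.max_batch_size introduced in this PR
WORKER_NUMBERS = 2   # mirrors Spider.worker_numbers

async def fetch(url: str) -> str:
    # Placeholder for a real HTTP request (ruia uses aiohttp underneath).
    await asyncio.sleep(0.1)
    return f"fetched {url}"

async def enqueue_start_urls(queue: asyncio.Queue, urls) -> None:
    # Producer: put work on the queue while workers are already running,
    # rather than filling the queue completely before starting them.
    for url in urls:
        await queue.put(fetch(url))

async def worker(queue: asyncio.Queue, worker_id: int) -> None:
    # Consumer: collect coroutines into a batch, then gather the whole batch
    # once the queue is momentarily empty or the batch hits the size cap.
    batch = []
    while True:
        batch.append(await queue.get())
        if queue.empty() or len(batch) > MAX_BATCH_SIZE:
            results = await asyncio.gather(*batch, return_exceptions=True)
            for result in results:
                print(f"worker {worker_id}: {result}")
            batch = []
        queue.task_done()

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    urls = [f"https://example.com/page/{i}" for i in range(10)]  # hypothetical start URLs

    producer = asyncio.create_task(enqueue_start_urls(queue, urls))
    workers = [asyncio.create_task(worker(queue, i)) for i in range(WORKER_NUMBERS)]

    # As in start_master(): wait for the producer and for the queue to drain.
    await asyncio.gather(producer, queue.join())

    for w in workers:
        w.cancel()

asyncio.run(main())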