Fixed search failure due to unexpected parser state
In many plugins the parser's state wasn't reset between pages.

This meant that if a page left the parser in an unexpected state (truncated content, a temporary error, or malformed HTML), all following pages would fail to find results.

The torrentproject plugin had already noticed the issue and worked around it by overriding feed() to reset some of its state between pages.

But creating a new parser for each page is simpler, so I have updated all plugins affected by this issue.
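
The resulting pattern is the same in every plugin. A minimal sketch (illustrative only — the URL scheme, parser class name, and the page-count/threshold values stand in for each engine's own):

    # Sketch of the per-page parser pattern applied in this commit.
    # retrieve_url comes from the plugins' helpers module; the URL format
    # and the thresholds below are placeholders, not any engine's real values.
    def search(self, what, cat='all'):
        for page in range(1, 5):
            html = retrieve_url(f"{self.url}/search/{what}/{page}/")
            parser = self.MyHtmlParser(self.url)  # fresh parser: no state carried over
            parser.feed(html)
            parser.close()
            if parser.page_items < 20:  # a short page means no further results
                break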
ducalex committed Sep 12, 2024
1 parent 3a88c6f commit 360bb86
Showing 5 changed files with 33 additions and 67 deletions.
19 changes: 8 additions & 11 deletions nova3/engines/limetorrents.py
@@ -1,4 +1,4 @@
-#VERSION: 4.7
+#VERSION: 4.8
 # AUTHORS: Lima66
 # CONTRIBUTORS: Diego de las Heras ([email protected])

@@ -38,7 +38,7 @@ def __init__(self, url):
             self.url = url
             self.current_item = {} # dict for found item
             self.item_name = None # key's name in current_item dict
-            self.page_empty = 22000
+            self.page_items = 0
             self.inside_tr = False
             self.findTable = False
             self.parser_class = {"tdnormal": "size", # class
@@ -113,14 +113,11 @@ def search(self, query, cat='all'):
         query = query.replace("%20", "-")
         category = self.supported_categories[cat]

-        parser = self.MyHtmlParser(self.url)
-        page = 1
-        while True:
-            page_url = "{0}/search/{1}/{2}/seeds/{3}/".format(self.url, category, query, page)
+        for page in range(1, 5):
+            page_url = f"{self.url}/search/{category}/{query}/seeds/{page}/"
             html = retrieve_url(page_url)
-            lunghezza_html = len(html)
-            if page > 6 or lunghezza_html <= parser.page_empty:
-                return
+            parser = self.MyHtmlParser(self.url)
             parser.feed(html)
-            page += 1
-        parser.close()
+            parser.close()
+            if parser.page_items < 20:
+                break
30 changes: 7 additions & 23 deletions nova3/engines/solidtorrents.py
@@ -1,4 +1,4 @@
-# VERSION: 2.3
+# VERSION: 2.4
 # AUTHORS: nKlido

 # LICENSING INFORMATION
@@ -24,7 +24,6 @@
 from novaprinter import prettyPrinter
 from html.parser import HTMLParser
 from datetime import datetime
-import math


 class solidtorrents(object):
@@ -47,8 +46,6 @@ def __init__(self, url):
             self.parseDate = False
             self.column = 0
             self.torrentReady = False
-            self.foundSearchStats = False
-            self.parseTotalResults = False
             self.totalResults = 0

             self.torrent_info = self.empty_torrent_info()
@@ -68,13 +65,6 @@ def empty_torrent_info(self):
         def handle_starttag(self, tag, attrs):
             params = dict(attrs)

-            if 'search-stats' in params.get('class', ''):
-                self.foundSearchStats = True
-
-            if (self.foundSearchStats and tag == 'b'):
-                self.parseTotalResults = True
-                self.foundSearchStats = False
-
             if 'search-result' in params.get('class', ''):
                 self.foundResult = True
                 return
@@ -115,13 +105,10 @@ def handle_endtag(self, tag):
                 prettyPrinter(self.torrent_info)
                 self.torrentReady = False
                 self.torrent_info = self.empty_torrent_info()
+                self.totalResults += 1

         def handle_data(self, data):

-            if (self.parseTotalResults):
-                self.totalResults = int(data.strip())
-                self.parseTotalResults = False
-
             if (self.parseTitle):
                 if (bool(data.strip()) and data != '\n'):
                     self.torrent_info['name'] = data
@@ -161,12 +148,9 @@ def request(self, searchTerm, category, page=1):
     def search(self, what, cat='all'):
         category = self.supported_categories[cat]

-        parser = self.TorrentInfoParser(self.url)
-        parser.feed(self.request(what, category, 1))
-
-        totalPages = min(math.ceil(parser.totalResults / 20), 5)
-
-        for page in range(2, totalPages + 1):
+        for page in range(1, 5):
+            parser = self.TorrentInfoParser(self.url)
             parser.feed(self.request(what, category, page))
-
-        parser.close()
+            parser.close()
+            if parser.totalResults < 15:
+                break
27 changes: 10 additions & 17 deletions nova3/engines/torlock.py
@@ -1,8 +1,7 @@
-#VERSION: 2.23
+#VERSION: 2.24
 # AUTHORS: Douman ([email protected])
 # CONTRIBUTORS: Diego de las Heras ([email protected])

-from re import compile as re_compile
 from html.parser import HTMLParser
 from datetime import datetime, timedelta

@@ -35,6 +34,7 @@ def __init__(self, url):
             self.item_bad = False # set to True for malicious links
             self.current_item = None # dict for found item
             self.item_name = None # key's name in current_item dict
+            self.page_items = 0
             self.parser_class = {"td": "pub_date",
                                  "ts": "size",
                                  "tul": "seeds",
@@ -91,26 +91,19 @@ def handle_endtag(self, tag):
                     except Exception:
                         self.current_item["pub_date"] = -1
                 prettyPrinter(self.current_item)
+                self.page_items += 1
                 self.current_item = {}

     def search(self, query, cat='all'):
         """ Performs search """
         query = query.replace("%20", "-")
+        category = self.supported_categories[cat]

-        parser = self.MyHtmlParser(self.url)
-        page = "".join((self.url, "/", self.supported_categories[cat],
-                        "/torrents/", query, ".html?sort=seeds&page=1"))
-        html = retrieve_url(page)
-        parser.feed(html)
-
-        counter = 1
-        additional_pages = re_compile(r"/{0}/torrents/{1}.html\?sort=seeds&page=[0-9]+"
-                                      .format(self.supported_categories[cat], query))
-        list_searches = additional_pages.findall(html)[:-1] # last link is next(i.e. second)
-        for page in map(lambda link: "".join((self.url, link)), list_searches):
-            html = retrieve_url(page)
+        for page in range(1, 5):
+            parser = self.MyHtmlParser(self.url)
+            page_url = f"{self.url}/{category}/torrents/{query}.html?sort=seeds&page={page}"
+            html = retrieve_url(page_url)
             parser.feed(html)
-            counter += 1
-            if counter > 3:
+            parser.close()
+            if parser.page_items < 20:
                 break
-        parser.close()
16 changes: 4 additions & 12 deletions nova3/engines/torrentproject.py
@@ -1,4 +1,4 @@
-#VERSION: 1.4
+#VERSION: 1.5
 #AUTHORS: mauricci

 from helpers import retrieve_url
@@ -102,26 +102,18 @@ def handle_data(self, data):
                 elif curr_key != 'name':
                     self.singleResData[curr_key] += data.strip()

-        def feed(self, html):
-            HTMLParser.feed(self, html)
-            self.pageComplete = False
-            self.insideResults = False
-            self.insideDataDiv = False
-            self.spanCount = -1
-
     def search(self, what, cat='all'):
         # curr_cat = self.supported_categories[cat]
-        parser = self.MyHTMLParser(self.url)
         what = what.replace('%20', '+')
         # analyze first 5 pages of results
         for currPage in range(0, 5):
             url = self.url + '/browse?t={0}&p={1}'.format(what, currPage)
             html = retrieve_url(url)
+            parser = self.MyHTMLParser(self.url)
             parser.feed(html)
-            if len(parser.pageRes) <= 0:
+            parser.close()
+            if len(parser.pageRes) < 20:
                 break
-            del parser.pageRes[:]
-        parser.close()

     def download_torrent(self, info):
         """ Downloader """
8 changes: 4 additions & 4 deletions nova3/engines/versions.txt
@@ -1,8 +1,8 @@
 eztv: 1.16
 jackett: 4.0
-limetorrents: 4.7
+limetorrents: 4.8
 piratebay: 3.3
-solidtorrents: 2.3
-torlock: 2.23
-torrentproject: 1.4
+solidtorrents: 2.4
+torlock: 2.24
+torrentproject: 1.5
 torrentscsv: 1.4
