From 360bb867b22a2bbafd2260ceff1a206ffb643cfe Mon Sep 17 00:00:00 2001 From: Alex Duchesne Date: Thu, 12 Sep 2024 12:45:14 -0400 Subject: [PATCH] Fixed search failure due to unexpected parser state In many plugins the parser's state wasn't reset between pages. This meant that if a page left the parser in a weird state (truncated page, temporary error, or unexpected HTML), all following pages would fail to find results. The torrentproject plugin worked around the issue by overriding feed() to reset some of its state between pages, but creating a new parser for each page is simpler. I have updated all plugins with this issue. --- nova3/engines/limetorrents.py | 19 ++++++++----------- nova3/engines/solidtorrents.py | 30 +++++++----------------------- nova3/engines/torlock.py | 27 ++++++++++----------------- nova3/engines/torrentproject.py | 16 ++++------------ nova3/engines/versions.txt | 8 ++++---- 5 files changed, 33 insertions(+), 67 deletions(-) diff --git a/nova3/engines/limetorrents.py b/nova3/engines/limetorrents.py index 37d8c5a..248aeda 100644 --- a/nova3/engines/limetorrents.py +++ b/nova3/engines/limetorrents.py @@ -1,4 +1,4 @@ -#VERSION: 4.7 +#VERSION: 4.8 # AUTHORS: Lima66 # CONTRIBUTORS: Diego de las Heras (ngosang@hotmail.es) @@ -38,7 +38,7 @@ def __init__(self, url): self.url = url self.current_item = {} # dict for found item self.item_name = None # key's name in current_item dict - self.page_empty = 22000 + self.page_items = 0 self.inside_tr = False self.findTable = False self.parser_class = {"tdnormal": "size", # class @@ -113,14 +113,11 @@ def search(self, query, cat='all'): query = query.replace("%20", "-") category = self.supported_categories[cat] - parser = self.MyHtmlParser(self.url) - page = 1 - while True: - page_url = "{0}/search/{1}/{2}/seeds/{3}/".format(self.url, category, query, page) + for page in range(1, 5): + page_url = f"{self.url}/search/{category}/{query}/seeds/{page}/" html = retrieve_url(page_url) - lunghezza_html = len(html) - if page > 6 or lunghezza_html <= 
parser.page_empty: - return + parser = self.MyHtmlParser(self.url) parser.feed(html) - page += 1 - parser.close() + parser.close() + if parser.page_items < 20: + break diff --git a/nova3/engines/solidtorrents.py b/nova3/engines/solidtorrents.py index 5dfccd6..3a46f6a 100644 --- a/nova3/engines/solidtorrents.py +++ b/nova3/engines/solidtorrents.py @@ -1,4 +1,4 @@ -# VERSION: 2.3 +# VERSION: 2.4 # AUTHORS: nKlido # LICENSING INFORMATION @@ -24,7 +24,6 @@ from novaprinter import prettyPrinter from html.parser import HTMLParser from datetime import datetime -import math class solidtorrents(object): @@ -47,8 +46,6 @@ def __init__(self, url): self.parseDate = False self.column = 0 self.torrentReady = False - self.foundSearchStats = False - self.parseTotalResults = False self.totalResults = 0 self.torrent_info = self.empty_torrent_info() @@ -68,13 +65,6 @@ def empty_torrent_info(self): def handle_starttag(self, tag, attrs): params = dict(attrs) - if 'search-stats' in params.get('class', ''): - self.foundSearchStats = True - - if (self.foundSearchStats and tag == 'b'): - self.parseTotalResults = True - self.foundSearchStats = False - if 'search-result' in params.get('class', ''): self.foundResult = True return @@ -115,13 +105,10 @@ def handle_endtag(self, tag): prettyPrinter(self.torrent_info) self.torrentReady = False self.torrent_info = self.empty_torrent_info() + self.totalResults += 1 def handle_data(self, data): - if (self.parseTotalResults): - self.totalResults = int(data.strip()) - self.parseTotalResults = False - if (self.parseTitle): if (bool(data.strip()) and data != '\n'): self.torrent_info['name'] = data @@ -161,12 +148,9 @@ def request(self, searchTerm, category, page=1): def search(self, what, cat='all'): category = self.supported_categories[cat] - parser = self.TorrentInfoParser(self.url) - parser.feed(self.request(what, category, 1)) - - totalPages = min(math.ceil(parser.totalResults / 20), 5) - - for page in range(2, totalPages + 1): + for page in range(1, 
5): + parser = self.TorrentInfoParser(self.url) parser.feed(self.request(what, category, page)) - - parser.close() + parser.close() + if parser.totalResults < 15: + break diff --git a/nova3/engines/torlock.py b/nova3/engines/torlock.py index 7b60263..6aa6a9d 100644 --- a/nova3/engines/torlock.py +++ b/nova3/engines/torlock.py @@ -1,8 +1,7 @@ -#VERSION: 2.23 +#VERSION: 2.24 # AUTHORS: Douman (custparasite@gmx.se) # CONTRIBUTORS: Diego de las Heras (ngosang@hotmail.es) -from re import compile as re_compile from html.parser import HTMLParser from datetime import datetime, timedelta @@ -35,6 +34,7 @@ def __init__(self, url): self.item_bad = False # set to True for malicious links self.current_item = None # dict for found item self.item_name = None # key's name in current_item dict + self.page_items = 0 self.parser_class = {"td": "pub_date", "ts": "size", "tul": "seeds", @@ -91,26 +91,19 @@ def handle_endtag(self, tag): except Exception: self.current_item["pub_date"] = -1 prettyPrinter(self.current_item) + self.page_items += 1 self.current_item = {} def search(self, query, cat='all'): """ Performs search """ query = query.replace("%20", "-") + category = self.supported_categories[cat] - parser = self.MyHtmlParser(self.url) - page = "".join((self.url, "/", self.supported_categories[cat], - "/torrents/", query, ".html?sort=seeds&page=1")) - html = retrieve_url(page) - parser.feed(html) - - counter = 1 - additional_pages = re_compile(r"/{0}/torrents/{1}.html\?sort=seeds&page=[0-9]+" - .format(self.supported_categories[cat], query)) - list_searches = additional_pages.findall(html)[:-1] # last link is next(i.e. 
second) - for page in map(lambda link: "".join((self.url, link)), list_searches): - html = retrieve_url(page) + for page in range(1, 5): + parser = self.MyHtmlParser(self.url) + page_url = f"{self.url}/{category}/torrents/{query}.html?sort=seeds&page={page}" + html = retrieve_url(page_url) parser.feed(html) - counter += 1 - if counter > 3: + parser.close() + if parser.page_items < 20: break - parser.close() diff --git a/nova3/engines/torrentproject.py b/nova3/engines/torrentproject.py index e736871..2db3b8d 100644 --- a/nova3/engines/torrentproject.py +++ b/nova3/engines/torrentproject.py @@ -1,4 +1,4 @@ -#VERSION: 1.4 +#VERSION: 1.5 #AUTHORS: mauricci from helpers import retrieve_url @@ -102,26 +102,18 @@ def handle_data(self, data): elif curr_key != 'name': self.singleResData[curr_key] += data.strip() - def feed(self, html): - HTMLParser.feed(self, html) - self.pageComplete = False - self.insideResults = False - self.insideDataDiv = False - self.spanCount = -1 - def search(self, what, cat='all'): # curr_cat = self.supported_categories[cat] - parser = self.MyHTMLParser(self.url) what = what.replace('%20', '+') # analyze first 5 pages of results for currPage in range(0, 5): url = self.url + '/browse?t={0}&p={1}'.format(what, currPage) html = retrieve_url(url) + parser = self.MyHTMLParser(self.url) parser.feed(html) - if len(parser.pageRes) <= 0: + parser.close() + if len(parser.pageRes) < 20: break - del parser.pageRes[:] - parser.close() def download_torrent(self, info): """ Downloader """ diff --git a/nova3/engines/versions.txt b/nova3/engines/versions.txt index 672def0..65fc148 100644 --- a/nova3/engines/versions.txt +++ b/nova3/engines/versions.txt @@ -1,8 +1,8 @@ eztv: 1.16 jackett: 4.0 -limetorrents: 4.7 +limetorrents: 4.8 piratebay: 3.3 -solidtorrents: 2.3 -torlock: 2.23 -torrentproject: 1.4 +solidtorrents: 2.4 +torlock: 2.24 +torrentproject: 1.5 torrentscsv: 1.4