Fixed search failure due to unexpected parser state
In many plugins the parser's state wasn't reset between pages.

This meant that if a page left the parser in an unexpected state (truncated content, a temporary error, or malformed HTML), all following pages would fail to find results.

The torrentproject plugin had already noticed the issue and worked around it by overriding feed() to reset some of its state between pages.

But creating a new parser for each page is simpler, so I have updated all plugins affected by this issue.
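
The resulting pattern is the same in every plugin. A minimal sketch (illustrative only — the URL scheme, parser class name, and the page-count/threshold values stand in for each engine's own):

    # Sketch of the per-page parser pattern applied in this commit.
    # retrieve_url comes from the plugins' helpers module; the URL format
    # and the thresholds below are placeholders, not any engine's real values.
    def search(self, what, cat='all'):
        for page in range(1, 5):
            html = retrieve_url(f"{self.url}/search/{what}/{page}/")
            parser = self.MyHtmlParser(self.url)  # fresh parser: no state carried over
            parser.feed(html)
            parser.close()
            if parser.page_items < 20:  # a short page means no further results
                break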
ducalex committed Sep 12, 2024
1 parent 3a88c6f commit 360bb86
Showing 5 changed files with 33 additions and 67 deletions.
19 changes: 8 additions & 11 deletions nova3/engines/limetorrents.py
@@ -1,4 +1,4 @@
-#VERSION: 4.7
+#VERSION: 4.8
 # AUTHORS: Lima66
 # CONTRIBUTORS: Diego de las Heras ([email protected])

@@ -38,7 +38,7 @@ def __init__(self, url):
             self.url = url
             self.current_item = {} # dict for found item
             self.item_name = None # key's name in current_item dict
-            self.page_empty = 22000
+            self.page_items = 0
             self.inside_tr = False
             self.findTable = False
             self.parser_class = {"tdnormal": "size", # class
@@ -113,14 +113,11 @@ def search(self, query, cat='all'):
         query = query.replace("%20", "-")
         category = self.supported_categories[cat]

-        parser = self.MyHtmlParser(self.url)
-        page = 1
-        while True:
-            page_url = "{0}/search/{1}/{2}/seeds/{3}/".format(self.url, category, query, page)
+        for page in range(1, 5):
+            page_url = f"{self.url}/search/{category}/{query}/seeds/{page}/"
             html = retrieve_url(page_url)
-            lunghezza_html = len(html)
-            if page > 6 or lunghezza_html <= parser.page_empty:
-                return
+            parser = self.MyHtmlParser(self.url)
             parser.feed(html)
-            page += 1
-        parser.close()
+            parser.close()
+            if parser.page_items < 20:
+                break
30 changes: 7 additions & 23 deletions nova3/engines/solidtorrents.py
@@ -1,4 +1,4 @@
-# VERSION: 2.3
+# VERSION: 2.4
 # AUTHORS: nKlido

 # LICENSING INFORMATION
@@ -24,7 +24,6 @@
 from novaprinter import prettyPrinter
 from html.parser import HTMLParser
 from datetime import datetime
-import math


 class solidtorrents(object):
@@ -47,8 +46,6 @@ def __init__(self, url):
             self.parseDate = False
             self.column = 0
             self.torrentReady = False
-            self.foundSearchStats = False
-            self.parseTotalResults = False
             self.totalResults = 0

             self.torrent_info = self.empty_torrent_info()
@@ -68,13 +65,6 @@ def empty_torrent_info(self):
         def handle_starttag(self, tag, attrs):
             params = dict(attrs)

-            if 'search-stats' in params.get('class', ''):
-                self.foundSearchStats = True
-
-            if (self.foundSearchStats and tag == 'b'):
-                self.parseTotalResults = True
-                self.foundSearchStats = False
-
             if 'search-result' in params.get('class', ''):
                 self.foundResult = True
                 return
@@ -115,13 +105,10 @@ def handle_endtag(self, tag):
                 prettyPrinter(self.torrent_info)
                 self.torrentReady = False
                 self.torrent_info = self.empty_torrent_info()
+                self.totalResults += 1

         def handle_data(self, data):

-            if (self.parseTotalResults):
-                self.totalResults = int(data.strip())
-                self.parseTotalResults = False
-
             if (self.parseTitle):
                 if (bool(data.strip()) and data != '\n'):
                     self.torrent_info['name'] = data
@@ -161,12 +148,9 @@ def request(self, searchTerm, category, page=1):
     def search(self, what, cat='all'):
         category = self.supported_categories[cat]

-        parser = self.TorrentInfoParser(self.url)
-        parser.feed(self.request(what, category, 1))
-
-        totalPages = min(math.ceil(parser.totalResults / 20), 5)
-
-        for page in range(2, totalPages + 1):
+        for page in range(1, 5):
+            parser = self.TorrentInfoParser(self.url)
             parser.feed(self.request(what, category, page))
-
-        parser.close()
+            parser.close()
+            if parser.totalResults < 15:
+                break
27 changes: 10 additions & 17 deletions nova3/engines/torlock.py
@@ -1,8 +1,7 @@
-#VERSION: 2.23
+#VERSION: 2.24
 # AUTHORS: Douman ([email protected])
 # CONTRIBUTORS: Diego de las Heras ([email protected])

-from re import compile as re_compile
 from html.parser import HTMLParser
 from datetime import datetime, timedelta

@@ -35,6 +34,7 @@ def __init__(self, url):
             self.item_bad = False # set to True for malicious links
             self.current_item = None # dict for found item
             self.item_name = None # key's name in current_item dict
+            self.page_items = 0
             self.parser_class = {"td": "pub_date",
                                  "ts": "size",
                                  "tul": "seeds",
@@ -91,26 +91,19 @@ def handle_endtag(self, tag):
                     except Exception:
                         self.current_item["pub_date"] = -1
                 prettyPrinter(self.current_item)
+                self.page_items += 1
                 self.current_item = {}

     def search(self, query, cat='all'):
         """ Performs search """
         query = query.replace("%20", "-")
+        category = self.supported_categories[cat]

-        parser = self.MyHtmlParser(self.url)
-        page = "".join((self.url, "/", self.supported_categories[cat],
-                        "/torrents/", query, ".html?sort=seeds&page=1"))
-        html = retrieve_url(page)
-        parser.feed(html)
-
-        counter = 1
-        additional_pages = re_compile(r"/{0}/torrents/{1}.html\?sort=seeds&page=[0-9]+"
-                                      .format(self.supported_categories[cat], query))
-        list_searches = additional_pages.findall(html)[:-1] # last link is next(i.e. second)
-        for page in map(lambda link: "".join((self.url, link)), list_searches):
-            html = retrieve_url(page)
+        for page in range(1, 5):
+            parser = self.MyHtmlParser(self.url)
+            page_url = f"{self.url}/{category}/torrents/{query}.html?sort=seeds&page={page}"
+            html = retrieve_url(page_url)
             parser.feed(html)
-            counter += 1
-            if counter > 3:
+            parser.close()
+            if parser.page_items < 20:
                 break
-        parser.close()
16 changes: 4 additions & 12 deletions nova3/engines/torrentproject.py
@@ -1,4 +1,4 @@
-#VERSION: 1.4
+#VERSION: 1.5
 #AUTHORS: mauricci

 from helpers import retrieve_url
@@ -102,26 +102,18 @@ def handle_data(self, data):
                 elif curr_key != 'name':
                     self.singleResData[curr_key] += data.strip()

-        def feed(self, html):
-            HTMLParser.feed(self, html)
-            self.pageComplete = False
-            self.insideResults = False
-            self.insideDataDiv = False
-            self.spanCount = -1
-
     def search(self, what, cat='all'):
         # curr_cat = self.supported_categories[cat]
-        parser = self.MyHTMLParser(self.url)
         what = what.replace('%20', '+')
         # analyze first 5 pages of results
         for currPage in range(0, 5):
             url = self.url + '/browse?t={0}&p={1}'.format(what, currPage)
             html = retrieve_url(url)
+            parser = self.MyHTMLParser(self.url)
             parser.feed(html)
-            if len(parser.pageRes) <= 0:
+            parser.close()
+            if len(parser.pageRes) < 20:
                 break
-            del parser.pageRes[:]
-        parser.close()

     def download_torrent(self, info):
         """ Downloader """
8 changes: 4 additions & 4 deletions nova3/engines/versions.txt
@@ -1,8 +1,8 @@
 eztv: 1.16
 jackett: 4.0
-limetorrents: 4.7
+limetorrents: 4.8
 piratebay: 3.3
-solidtorrents: 2.3
-torlock: 2.23
-torrentproject: 1.4
+solidtorrents: 2.4
+torlock: 2.24
+torrentproject: 1.5
 torrentscsv: 1.4
