Skip to content

Commit

Permalink
Merge pull request #493 from JustAnotherArchivist/ignoracle-threading-bug
Browse files (browse the repository at this point in the history)

Fix ignores sometimes not being applied correctly due to thread-related race conditions
  • Loading branch information
JustAnotherArchivist authored Feb 21, 2021
2 parents 7e57a61 + 4ce74e6 commit 3fbf32b
Showing 1 changed file with 25 additions and 17 deletions.
42 changes: 25 additions & 17 deletions pipeline/archivebot/wpull/ignoracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import re
import sys
import threading

from urllib.parse import urlparse

Expand Down Expand Up @@ -54,28 +55,34 @@ class Ignoracle(object):
An Ignoracle's pattern list starts as the empty list.
'''

# Note that set_patterns() is called from a different thread than ignores().
# The lock prevents race conditions on iteration over self._compiled.

patterns = []

def __init__(self):
self._primary = None
self._compiled = []
self._lock = threading.Lock()

def set_patterns(self, strings):
'''
Given a list of strings, replaces this Ignoracle's pattern state with
that list.
'''

self.patterns = []
patterns = []

for string in strings:
if isinstance(string, bytes):
string = string.decode('utf-8')

self.patterns.append(string)
patterns.append(string)

self._primary = None
self._compiled = []
with self._lock:
self.patterns = patterns
self._primary = None
# Don't replace _compiled here; _primary acts as a trigger for the recompilation.

def ignores(self, url_record: wpull.pipeline.item.URLRecord):
'''
Expand All @@ -88,19 +95,20 @@ def ignores(self, url_record: wpull.pipeline.item.URLRecord):
primaryUrl = params.get('primary_url') or ''
primaryNetloc = params.get('primary_netloc') or ''
if self._primary != (primaryUrl, primaryNetloc):
self._compiled = []
escapedPrimaryUrl = re.escape(primaryUrl)
escapedPrimaryNetloc = re.escape(primaryNetloc)
for pattern in self.patterns:
try:
expanded = pattern.replace('{primary_url}', escapedPrimaryUrl)
expanded = expanded.replace('{primary_netloc}', escapedPrimaryNetloc)
compiledPattern = re.compile(expanded)
except re.error as error:
print('Pattern %s is invalid (error: %s). Ignored.'
% (pattern, str(error)), file=sys.stderr)
self._compiled.append((pattern, compiledPattern))
self._primary = (primaryUrl, primaryNetloc)
with self._lock:
self._compiled = []
escapedPrimaryUrl = re.escape(primaryUrl)
escapedPrimaryNetloc = re.escape(primaryNetloc)
for pattern in self.patterns:
try:
expanded = pattern.replace('{primary_url}', escapedPrimaryUrl)
expanded = expanded.replace('{primary_netloc}', escapedPrimaryNetloc)
compiledPattern = re.compile(expanded)
except re.error as error:
print('Pattern %s is invalid (error: %s). Ignored.'
% (pattern, str(error)), file=sys.stderr)
self._compiled.append((pattern, compiledPattern))
self._primary = (primaryUrl, primaryNetloc)

for pattern, compiled in self._compiled:
match = compiled.search(url_record.url)
Expand Down

0 comments on commit 3fbf32b

Please sign in to comment.