Skip to content

Commit

Permalink
Update adblock_filter_compiler.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ghostnetic authored Nov 18, 2023
1 parent 30ad0df commit 38923a0
Showing 1 changed file with 22 additions and 37 deletions.
59 changes: 22 additions & 37 deletions adblock_filter_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,24 @@
from datetime import datetime
import json

# Compiled once at import time so that every is_valid_domain() call reuses
# the same pattern object instead of rebuilding it per call.
domain_regex = re.compile(
    r"^(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]$"
)


def is_valid_domain(domain):
    """Return True if *domain* is a well-formed, lowercase host name.

    The pattern requires at least one dot. Every label may contain only
    ``a-z``, ``0-9`` and ``-``, must start and end with a letter or digit and
    may be at most 63 characters long; the last label must have at least two
    characters. Upper-case input is rejected because the pattern is
    case-sensitive.

    Args:
        domain: The candidate string.

    Returns:
        bool: Whether the whole string matches the domain pattern.
    """
    # fullmatch() rather than match(): with match(), the trailing "$" also
    # succeeds just before a final newline, so "example.com\n" would pass.
    return domain_regex.fullmatch(domain) is not None

def parse_hosts_file(content):
"""Parses a host file content into AdBlock rules."""
lines = content.split('\n')
adblock_rules = set()

for line in lines:
for line in content.split('\n'):
line = line.strip()

# Ignore comments and empty lines
if line.startswith('#') or line.startswith('!') or line == '':
if not line or line[0] in ('#', '!'):
continue

# Check if line follows AdBlock syntax, else create new rule
Expand All @@ -29,37 +30,35 @@ def parse_hosts_file(content):
parts = line.split()
domain = parts[-1]
if is_valid_domain(domain):
rule = f'||{domain}^'
adblock_rules.add(rule)
adblock_rules.add(f'||{domain}^')

return adblock_rules

def generate_filter(file_contents):
    """Merge several blocklists into one de-duplicated, compressed filter.

    Each entry of *file_contents* is handed to ``parse_hosts_file``. The
    resulting rules are merged, exact duplicates are counted and dropped, and
    then rules made redundant by a broader rule are removed: ``||a.b.com^`` is
    dropped only when a rule for one of its parent domains (``||b.com^``,
    ``||com^``) is also present, since ``||domain^`` already covers every
    subdomain. Rules that are not of the plain ``||domain^`` form are kept
    untouched.

    Earlier logic kept only the first rule seen per "last two labels", which
    silently discarded unrelated siblings (``||b.example.com^`` after
    ``||a.example.com^``, or every ``*.co.uk`` after the first) and depended
    on set iteration order.

    Args:
        file_contents: Iterable of blocklist texts.

    Returns:
        tuple: ``(filter_content, duplicates_removed, redundant_rules_removed)``
        where ``filter_content`` is the header, an empty line, and the
        remaining rules in sorted order, joined by newlines.
    """
    unique_rules = set()
    duplicates_removed = 0

    for content in file_contents:
        for rule in parse_hosts_file(content):
            if rule in unique_rules:
                duplicates_removed += 1
            else:
                unique_rules.add(rule)

    # Map each plain "||domain^" rule to its bare domain; anything else
    # (e.g. rules carrying options) is treated as opaque and always kept.
    domain_by_rule = {
        rule: rule[2:-1]
        for rule in unique_rules
        if rule.startswith('||') and rule.endswith('^')
    }
    blocked_domains = set(domain_by_rule.values())

    kept_rules = []
    redundant_rules_removed = 0
    # Iterating in sorted order makes both the output and the counters
    # reproducible from run to run.
    for rule in sorted(unique_rules):
        domain = domain_by_rule.get(rule)
        if domain is not None:
            labels = domain.split('.')
            # Proper parent domains: drop 1..n-1 leading labels.
            covered = any(
                '.'.join(labels[i:]) in blocked_domains
                for i in range(1, len(labels))
            )
            if covered:
                redundant_rules_removed += 1
                continue
        kept_rules.append(rule)

    header = generate_header(len(kept_rules), duplicates_removed, redundant_rules_removed)
    # An empty element puts a blank line between the header and the rules.
    filter_content = '\n'.join([header, '', *kept_rules])
    return filter_content, duplicates_removed, redundant_rules_removed

def generate_header(domain_count, duplicates_removed, redundant_rules_removed):
"""Generates header with specific domain count, removed duplicates, and compressed domains information."""
Expand All @@ -72,33 +71,19 @@ def generate_header(domain_count, duplicates_removed, redundant_rules_removed):
# Domains Compressed: {redundant_rules_removed}
#=================================================================="""

def get_parent_domains(domain):
    """Return the immediate parent of *domain* as a one-element list.

    The parent is *domain* with its left-most label removed, e.g.
    ``"a.b.example.com"`` -> ``["b.example.com"]``.

    Args:
        domain: Dot-separated domain name.

    Returns:
        list[str]: ``[parent]`` when *domain* has more than two labels,
        otherwise an empty list.
    """
    parts = domain.split('.')
    if len(parts) <= 2:
        return []
    # Only the direct parent is produced, so a plain slice is enough.
    return ['.'.join(parts[1:])]

def main():
    """Fetch every configured blocklist and write the combined filter.

    Reads ``config.json`` from the current directory, downloads each URL
    listed under its ``blocklist_urls`` key, passes the downloaded texts to
    ``generate_filter`` and writes the resulting filter text to
    ``blocklist.txt``.

    Raises:
        requests.RequestException: If a download times out, fails, or returns
            an HTTP error status. Failing loudly avoids publishing a filter
            built from an error page or from an incomplete set of lists.
        KeyError: If ``config.json`` has no ``blocklist_urls`` entry.
    """
    with open('config.json', encoding='utf-8') as f:
        config = json.load(f)

    blocklist_urls = config['blocklist_urls']

    file_contents = []
    for url in blocklist_urls:
        # Without a timeout a stalled server would hang the run forever;
        # the context manager releases the connection once the body is read.
        with requests.get(url, timeout=30) as response:
            response.raise_for_status()
            file_contents.append(response.text)

    # Only the filter text is needed here; the counters are already embedded
    # in the header by generate_filter().
    filter_content, _, _ = generate_filter(file_contents)

    # Write the filter content to a file
    with open('blocklist.txt', 'w', encoding='utf-8') as f:
        f.write(filter_content)


if __name__ == "__main__":
    main()


0 comments on commit 38923a0

Please sign in to comment.