Skip to content

Commit

Permalink
Refactor: split reject and reject_extra
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW committed Jul 21, 2024
1 parent a714263 commit f129152
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 79 deletions.
99 changes: 74 additions & 25 deletions Build/build-reject-domainset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import path from 'path';
import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { createTrie } from './lib/trie';

import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS } from './constants/reject-data-source';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { domainDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick';
Expand All @@ -29,6 +29,9 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
const domainSets = new Set<string>();
const appendArrayToDomainSets = setAddFromArrayCurried(domainSets);

const domainSetsExtra = new Set<string>();
const appendArrayToDomainSetsExtra = setAddFromArrayCurried(domainSetsExtra);

// Parse from AdGuard Filters
const shouldStop = await span
.traceChild('download and process hosts / adblock filter rules')
Expand All @@ -38,7 +41,11 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
await Promise.all([
// Parse from remote hosts & domain lists
HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)),
HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSetsExtra)),

DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)),
DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSetsExtra)),

ADGUARD_FILTERS.map(
input => processFilterRules(childSpan, ...input)
.then(({ white, black, foundDebugDomain }) => {
Expand All @@ -51,6 +58,19 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
setAddFromArray(domainSets, black);
})
),
ADGUARD_FILTERS_EXTRA.map(
input => processFilterRules(childSpan, ...input)
.then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) {
// eslint-disable-next-line sukka/no-single-return -- not single return
shouldStop = true;
// we should not break here, as we want to see full matches from all data sources
}
setAddFromArray(filterRuleWhitelistDomainSets, white);
setAddFromArray(domainSetsExtra, black);
})
),

([
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
Expand All @@ -60,7 +80,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
setAddFromArray(filterRuleWhitelistDomainSets, black);
})
)),
getPhishingDomains(childSpan).then(appendArrayToDomainSets),
getPhishingDomains(childSpan).then(appendArrayToDomainSetsExtra),
getRejectSukkaConfPromise.then(appendArrayToDomainSets)
].flat());
// eslint-disable-next-line sukka/no-single-return -- not single return
Expand All @@ -71,7 +91,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
process.exit(1);
}

console.log(`Import ${domainSets.size} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);
console.log(`Import ${domainSets.size} + ${domainSetsExtra.size} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);

// Dedupe domainSets
const domainKeywordsSet = await span.traceChildAsync('collect black keywords/suffixes', async () => {
Expand All @@ -91,25 +111,38 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
return domainKeywordsSet;
});

const trie = span.traceChildSync('create smol trie while deduping black keywords', () => {
const trie = createTrie(null, true, true);
const [baseTrie, extraTrie] = span.traceChildSync('create smol trie while deduping black keywords', () => {
const baseTrie = createTrie(null, true, true);
const extraTrie = createTrie(null, true, true);

const kwfilter = createKeywordFilter(domainKeywordsSet);

for (const domain of domainSets) {
// exclude keyword when creating trie
if (!kwfilter(domain)) {
trie.add(domain);
baseTrie.add(domain);
}
}

return trie;
for (const domain of domainSetsExtra) {
// exclude keyword when creating trie
if (!kwfilter(domain)) {
extraTrie.add(domain);
}
}

return [baseTrie, extraTrie] as const;
});

span.traceChildSync('dedupe from white suffixes', () => filterRuleWhitelistDomainSets.forEach(trie.whitelist));
span.traceChildSync('dedupe from white suffixes (base)', () => filterRuleWhitelistDomainSets.forEach(baseTrie.whitelist));
span.traceChildSync('dedupe from white suffixes and base (extra)', () => {
domainSets.forEach(extraTrie.whitelist);
filterRuleWhitelistDomainSets.forEach(extraTrie.whitelist);
});

// Dedupe domainSets
const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie));
const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie));
const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie));

console.log(`Final size ${dudupedDominArray.length}`);

Expand All @@ -118,7 +151,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
subdomainMap: domainArraySubdomainMap
} = span.traceChildSync(
'build map for stat and sort',
() => buildParseDomainMap(dudupedDominArray)
() => buildParseDomainMap(dudupedDominArray.concat(dudupedDominArrayExtra))
);

// Create reject stats
Expand All @@ -136,30 +169,46 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
return sort(Array.from(statMap.entries()).filter(a => a[1] > 9), (a, b) => (b[1] - a[1]) || a[0].localeCompare(b[0]));
});

const description = [
...SHARED_DESCRIPTION,
'',
'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining',
'',
'Build from:',
...HOSTS.map(host => ` - ${host[0]}`),
...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`),
...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
' - https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt',
' - https://phishing.army/download/phishing_army_blocklist.txt'
];

return Promise.all([
createRuleset(
span,
'Sukka\'s Ruleset - Reject Base',
description,
[
...SHARED_DESCRIPTION,
'',
'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining',
'',
'Build from:',
...HOSTS.map(host => ` - ${host[0]}`),
...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`),
...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`)
],
new Date(),
span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)),
span.traceChildSync('sort reject domainset (base)', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)),
'domainset',
path.resolve(import.meta.dir, '../List/domainset/reject.conf'),
path.resolve(import.meta.dir, '../Clash/domainset/reject.txt')
),
createRuleset(
span,
'Sukka\'s Ruleset - Reject Extra',
[
...SHARED_DESCRIPTION,
'',
'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining',
'',
'Build from:',
...HOSTS_EXTRA.map(host => ` - ${host[0]}`),
...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`)
],
new Date(),
span.traceChildSync('sort reject domainset (extra)', () => sortDomains(dudupedDominArrayExtra, domainArrayMainDomainMap, domainArraySubdomainMap)),
'domainset',
path.resolve(import.meta.dir, '../List/domainset/reject_extra.conf'),
path.resolve(import.meta.dir, '../Clash/domainset/reject_extra.txt')
),
compareAndWriteFile(
span,
rejectDomainsStats.map(([domain, count]) => `${domain}${' '.repeat(100 - domain.length)}${count}`),
Expand Down
115 changes: 63 additions & 52 deletions Build/constants/reject-data-source.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,28 @@ export const HOSTS: HostsSource[] = [
true,
TTL.THREE_HOURS()
],
// Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],

// no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()],
// has not been updated for more than a year, so we set a 14 days cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()],
// ad-wars is not actively maintained, so we set a 7 days cache ttl
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()]
] as const;

// Hosts-format sources that feed only the "reject_extra" ruleset (split out of HOSTS
// by this commit; consumed via processHosts -> domainSetsExtra in build-reject-domainset.ts).
// Tuple shape mirrors HOSTS: [url, mirrors, includeAllSubdomain?, cache ttl] — TODO confirm field order against HostsSource.
export const HOSTS_EXTRA: HostsSource[] = [
// Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],
// ad-wars is not actively maintained, so we set a 7 days cache ttl
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()]
];

export const DOMAIN_LISTS: HostsSource[] = [
// CoinBlockerList
// Although the hosts file is still actively maintained, the hosts_browser file has not been updated since 2021-07, so we set a 14 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', [], true, TTL.TWO_WEEKS()],
// BarbBlock
// The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()],
// DigitalSide Threat-Intel - OSINT Hub
// Update once per day
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()],

// Curben's PUP Domains Blocklist
// 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
// 'https://pup-filter.pages.dev/pup-filter-agh.txt'
Expand All @@ -52,17 +51,26 @@ export const DOMAIN_LISTS: HostsSource[] = [
'https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-domains.txt'
],
true, TTL.THREE_HOURS()
],
]
] as const;

// Plain domain-list sources that feed only the "reject_extra" ruleset (split out of
// DOMAIN_LISTS by this commit; consumed via processDomainLists -> domainSetsExtra).
export const DOMAIN_LISTS_EXTRA: HostsSource[] = [
// BarbBlock
// The barbblock list has not been updated since 2019-05, so we set a 14 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()],
// DigitalSide Threat-Intel - OSINT Hub
// Update once per day
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()],
// AdGuard CNAME Filter Combined
// Update on a 7 days basis, so we set a 3 days cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', [], true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', [], true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', [], true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()]
];

export const PHISHING_DOMAIN_LISTS: [HostsSource, HostsSource] = [
export const PHISHING_DOMAIN_LISTS_EXTRA: [HostsSource, HostsSource] = [
[
'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt',
[
Expand Down Expand Up @@ -114,6 +122,46 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
],
TTL.TWLVE_HOURS()
],
// AdGuard Base Filter
['https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', null, TTL.THREE_HOURS()],
// AdGuard Mobile AD
['https://filters.adtidy.org/extension/ublock/filters/11_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Tracking Protection
['https://filters.adtidy.org/extension/ublock/filters/3_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Chinese filter (EasyList China + AdGuard Chinese filter)
['https://filters.adtidy.org/extension/ublock/filters/224_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Annoyances filter
['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS()],
// GameConsoleAdblockList
// Update almost once per 1 to 3 months, let's set a 10 days cache ttl
['https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', null, TTL.TEN_DAYS()],
// PiHoleBlocklist
// Update almost once per 3 months, let's set a 10 days cache ttl
[
'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
[
'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
],
TTL.TEN_DAYS()
],
// Spam404
// Not actively maintained, let's use a 10 days cache ttl
['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
// Brave First Party & First Party CNAME
['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()]
] as const;

export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
// EasyList Germany filter
[
'https://easylist.to/easylistgermany/easylistgermany.txt',
[
'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
],
TTL.TWLVE_HOURS()
],
// AdGuard Japanese filter
['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()],
// uBlock Origin Filter List
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
Expand Down Expand Up @@ -152,45 +200,8 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
],
TTL.THREE_HOURS()
],
// AdGuard Base Filter
['https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', null, TTL.THREE_HOURS()],
// AdGuard Mobile AD
['https://filters.adtidy.org/extension/ublock/filters/11_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Tracking Protection
['https://filters.adtidy.org/extension/ublock/filters/3_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Japanese filter
['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Chinese filter (EasyList China + AdGuard Chinese filter)
['https://filters.adtidy.org/extension/ublock/filters/224_optimized.txt', null, TTL.THREE_HOURS()],
// AdGuard Annoyances filter
['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS()],
// EasyList Germany filter
[
'https://easylist.to/easylistgermany/easylistgermany.txt',
[
'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
],
TTL.TWLVE_HOURS()
],
// GameConsoleAdblockList
// Update almost once per 1 to 3 months, let's set a 10 days cache ttl
['https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', null, TTL.TEN_DAYS()],
// PiHoleBlocklist
// Update almost once per 3 months, let's set a 10 days cache ttl
[
'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
[
'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
],
TTL.TEN_DAYS()
],
// Spam404
// Not actively maintained, let's use a 10 days cache ttl
['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
// Brave First Party & First Party CNAME
['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()]
] as const;
]
];

export const PREDEFINED_WHITELIST = [
'.localhost',
Expand Down
4 changes: 2 additions & 2 deletions Build/lib/get-phishing-domains.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import * as tldts from 'tldts-experimental';

import type { Span } from '../trace';
import { appendArrayInPlaceCurried } from './append-array-in-place';
import { PHISHING_DOMAIN_LISTS } from '../constants/reject-data-source';
import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
import picocolors from 'picocolors';
import createKeywordFilter from './aho-corasick';
Expand Down Expand Up @@ -133,7 +133,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
const domainArr: string[] = [];

(await Promise.all(PHISHING_DOMAIN_LISTS.map(entry => processDomainLists(curSpan, ...entry))))
(await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry))))
.forEach(appendArrayInPlaceCurried(domainArr));

return domainArr;
Expand Down

0 comments on commit f129152

Please sign in to comment.