-
Notifications
You must be signed in to change notification settings - Fork 2
/
duplicate_files.py
69 lines (54 loc) · 2.3 KB
/
duplicate_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from utils import plural, time_and_date, whatlinkshere
from wikitools import wiki
from wikitools.page import Page
# When True, main() prints progress messages. Switched on when run as a script.
verbose = False

# Language suffixes that may trail a file title (e.g. 'Theshowdown05 ru').
# main() strips one of these from the end of a title before comparing files.
LANGS = [
    'ar', 'cs', 'da', 'de', 'es', 'fi', 'fr', 'hu',
    'it', 'ja', 'ko', 'nl', 'no', 'pl', 'pt', 'pt-br',
    'ro', 'ru', 'sv', 'tr', 'zh-hans', 'zh-hant',
]
def main(w):
    """Build the wikitext report listing groups of duplicate files.

    Walks every file returned by ``w.get_all_files()``, reads the
    'duplicatefiles' entry from each page's ``raw`` data, and collects each
    file together with its duplicates into one group. Groups are then rendered
    largest-first, each with a thumbnail and one bullet per file showing how
    many pages use it.

    Args:
        w: Wiki client. ``get_all_files()`` is called on it, and it is passed
            through to ``Page(w, title)`` for the usage counts.

    Returns:
        The full report as a single wikitext string.
    """
    seen = set()
    all_duplicates = []
    for page in w.get_all_files():
        # The raw names use underscores and carry no namespace prefix; convert
        # them to the same 'File:Some name.ext' form as page.title.
        duplicates = [
            'File:' + dupe['name'].replace('_', ' ')
            for dupe in page.raw.get('duplicatefiles', [])
        ]

        # Normalize language titles, this means we treat 'Theshowdown05' and
        # 'Theshowdown05 ru' as the same file.
        title = page.title.rpartition('.')[0]
        for lang in LANGS:
            if title.endswith(f' {lang}'):
                title = title[:-len(lang) - 1]
                break

        # Skip files without duplicates, files already listed as a member of
        # an earlier group (full title), and language variants of a file that
        # was already handled (normalized title).
        if not duplicates or page.title in seen or title in seen:
            continue

        duplicates.append(page.title)  # The duplicate list does not include ourselves, obviously
        if verbose:
            print(f'Found duplicate image: {page.title}')

        seen.update(duplicates)
        # The normalized title is only a bookkeeping key. It has no extension
        # and is not a real file, so it must not end up in the rendered group.
        seen.add(title)
        all_duplicates.append(duplicates)

    if verbose:
        print(f'Found {len(all_duplicates)} duplicate images')

    # Put files with the most duplicates first. The sort is stable, so groups
    # of equal size keep their discovery order.
    all_duplicates.sort(key=len, reverse=True)

    header = """\
{{{{DISPLAYTITLE: {count} duplicate files}}}}
List of all duplicate files; <onlyinclude>{unique}</onlyinclude> unique files, {count} duplicated files in total. Data as of {date}.
== List ==\n""".format(
        unique=len(all_duplicates),
        count=sum(len(dupe_list) for dupe_list in all_duplicates),
        date=time_and_date())

    # Collect the pieces and join once instead of growing a string in a loop.
    parts = [header]
    for dupe_list in all_duplicates:
        # Drop titles starting with 'File:User' (presumably user-owned images;
        # confirm against the wiki's naming policy).
        dupe_list = [d for d in dupe_list if not d.startswith('File:User')]
        if len(dupe_list) <= 1:
            continue  # Nothing left to compare against

        counts = [
            (Page(w, duplicate).get_file_link_count(), duplicate)
            for duplicate in dupe_list
        ]
        # Most-used file first; ties broken alphabetically by title.
        counts.sort(key=lambda entry: (-entry[0], entry[1]))

        parts.append(f'[[{dupe_list[0]}|200px]]\n')
        parts.extend(
            f'* [[:{file_title}|]] ([{whatlinkshere(file_title, link_count)} {plural.uses(link_count)}])\n'
            for link_count, file_title in counts
        )
        parts.append('\n')

    return ''.join(parts)
if __name__ == '__main__':
    # Running as a script: turn on the progress messages printed by main().
    verbose = True
    w = wiki.Wiki('https://wiki.teamfortress.com/w/api.php')
    # File titles can contain non-ASCII characters, so write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open('wiki_duplicate_files.txt', 'w', encoding='utf-8') as f:
        f.write(main(w))
    # Report only after the file has been closed (and therefore flushed).
    print(f'Article written to {f.name}')