Two new reports #46

Open
jbzdarkid wants to merge 35 commits into base: python3

Commits (35), all by jbzdarkid:
- c88f8e1 Two new reports (Oct 10, 2022)
- 84e8d80 Touch readme (Oct 10, 2022)
- d10299e Fix scripts (Oct 10, 2022)
- a4bf21b MOAR (Oct 10, 2022)
- 8de2537 Linter bugs (Oct 10, 2022)
- 1224f56 More improvements, maybe (Oct 10, 2022)
- eb7f92e Redlinks exist, ok? (Oct 11, 2022)
- a540da2 More small tweaks (Oct 12, 2022)
- 320f156 Yet more safety (Oct 12, 2022)
- 5f080c7 less stupids (Oct 13, 2022)
- 1ac00ec idk stuff (Dec 29, 2022)
- 4d0acdc Merge commit 'a92108ffe0ee2dab4541e037b7b367a523daef8c' into jbzdarki… (Dec 29, 2022)
- 1aea794 Merge commit '9295d28baa6ddef479017ac0a238934786a7e628' into jbzdarki… (Dec 29, 2022)
- 4b0421f Merge commit '92a25ade7a1525d27a5e12b73a77abd7b61b6218' into jbzdarki… (Dec 29, 2022)
- 9ddb0a7 Merge commit '0128259280dca7c383cade366419cc55045b0a53' into jbzdarki… (Dec 29, 2022)
- d3c93dd Merge commit '9cd49521e13c211fa83389e835cf0a3932206493' into jbzdarki… (Dec 29, 2022)
- 519433e Merge commit '6b50aa2aff65c621a5e92760507d9dc5c7dbb694' into jbzdarki… (Dec 29, 2022)
- a2c6651 Merge commit '0db717264d0a2f7099e05964476a45fea8c4f429' into jbzdarki… (Dec 29, 2022)
- f2d013f Merge commit '91fc44f7d7c4fd216036ea748b7159f61bc83973' into jbzdarki… (Dec 29, 2022)
- 43b885a Merge commit '06715f30aa925c6182708293fd299d3d976f0bb1' into jbzdarki… (Dec 29, 2022)
- 371bc3f Merge commit '428f1556e16ea22662930d872732fcfb2742c2b3' into jbzdarki… (Dec 29, 2022)
- cdb450c Merge commit 'd33fa46a625e34249e4527f37a9b14966b23a473' into jbzdarki… (Dec 29, 2022)
- 415659d Fixes (Dec 29, 2022)
- 57db204 Fixy (Dec 30, 2022)
- b38fb5b Merge branch 'python3' into jbzdarkid/more-reports (Dec 30, 2022)
- 4f58df9 Oops (Dec 30, 2022)
- e1df1ca Fixy (Dec 31, 2022)
- 0f20aa5 Unlimited (Dec 31, 2022)
- ef885bd WIP (May 1, 2023)
- 491a7bb Merge remote-tracking branch 'origin/python3' into jbzdarkid/more-rep… (Feb 1, 2024)
- 07586fa maybe this works better (Feb 7, 2024)
- a4225ff fixy (Feb 7, 2024)
- d72e66e fixy (Feb 7, 2024)
- 67f08e9 Okay all good (Feb 8, 2024)
- 285b80d okay maybe now (Mar 13, 2024)
2 changes: 1 addition & 1 deletion .github/workflows/tfwiki_stats.yml
@@ -29,7 +29,7 @@ jobs:
         PULL_REQUEST_ID: ${{ github.event.pull_request.number }}
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         API_KEY: ${{ secrets.API_KEY }}
-    - uses: actions/upload-artifact@v3
+    - uses: actions/upload-artifact@v4
       with:
         name: Failed uploads
         path: wiki_*.txt
2 changes: 1 addition & 1 deletion README.md
@@ -18,10 +18,10 @@ This version of the scripts has been updated to python3, which unfortunately bro…
- `mismatched_weekly.py`: Weekly copy of the monthly report which only runs on the past week of recent changes.

## Monthly reports
- `all_external_links.py`: A slightly different take on scanning external links which just groups by domain name without making network calls.
- `displaytitles.py`: Searches for pages with duplicate displaytitles, which show a gross-looking error message.
- `duplicate_files.py`: Finds all identical files, and sorts them by usage count.
- `edit_stats.py`: Provides some statistics about user editing habits on the wiki, along with a list of the top 100 editors by edit count
- `external_links2.py`: Searches all articles for links outside the tf2 wiki, and checks to see if those links are still valid (HTTP 200)
- `mismatched.py`: Searches all articles for incorrect pairs of parenthesis, to help catch broken links, tags, and templates.
- `missing_categories.py`: Searches for non-translated categories. Categories which are only in english should generally be marked as {{non-article category}}.
- `overtranslated.py`: Searches all articles for language pages which don't exist in english. This is usually indicative of duplicate translations.
76 changes: 76 additions & 0 deletions all_external_links.py
@@ -0,0 +1,76 @@
from re import compile, VERBOSE
from utils import pagescraper_queue, plural, time_and_date
from wikitools import wiki

verbose = False

# Within the HTML source code, all links look like href="...". Internal links start with /wiki/foo, so matching an explicit http(s) scheme finds only the external links.
LINK_REGEX = compile('''
  href="(       # Within the HTML source code, all links start with href=
    https?://   # Match http/https scheme (internal wiki links start with /wiki)
    (           # Start inner capture group (for just the domain name)
      [^/"]+    # The domain
    )
    [^"]*       # The rest of the URL
  )"
''', VERBOSE)

# Domains which cannot be malware or phishing or broken links. Hopefully.

def pagescraper(page, all_links):
  text = page.get_raw_html()

  # Only scan the article body, not the surrounding page chrome
  start = text.find('id="content"')
  end = text.find('class="printfooter"')
  text = text[start:end]

  for m in LINK_REGEX.finditer(text):
    # Reduce the hostname to its last two labels, e.g. forums.example.com -> example.com
    hostname = '.'.join(m[2].split('.')[-2:]).lower()
    if hostname not in all_links:
      all_links[hostname] = set()
    all_links[hostname].add(page)

def main(w):
  all_links = {} # Map of {domain: set of pages which link to it}
  with pagescraper_queue(pagescraper, all_links) as pages:
    for page in w.get_all_pages():
      pages.put(page)

  output = """\
{{{{DISPLAYTITLE: {domain_count} external domains}}}}
There are external links to <onlyinclude>{domain_count}</onlyinclude> different domains from the wiki. Data as of {date}.

{{{{TOC limit|2}}}}
""".format(
    domain_count=len(all_links),
    date=time_and_date())

  # Sort domains by count (high -> low), then title (a -> z)
  domains = list(all_links.keys())
  domains.sort(key = lambda domain: (-len(all_links[domain]), domain))

  last_header = 100_000
  for domain in domains:
    # Sort pages by title, then language (english first)
    pages = list(all_links[domain])
    pages.sort(key = lambda page: (page.title, page.lang != 'en', page.lang))

    if len(pages) < last_header:
      # Step down to the decade which actually contains this domain's page count,
      # so the heading range always matches the domains listed under it.
      next_header = last_header // 10
      while len(pages) < next_header:
        next_header //= 10
      output += f'== Domains with {next_header}-{next_header*10-1} pages ==\n'
      last_header = next_header

    output += f'=== {domain} ({plural.pages(len(pages))}) ===\n'
    for page in pages[:10]:
      output += f'* [[{page.title}]]\n'
    if len(pages) > 10:
      output += f'... and {len(pages)-10} more\n'

  return output

if __name__ == '__main__':
  verbose = True
  w = wiki.Wiki('https://wiki.teamfortress.com/w/api.php')
  with open('wiki_all_external_links.txt', 'w', encoding='utf-8') as f:
    f.write(main(w))
  print(f'Article written to {f.name}')
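For illustration only (not part of the diff): a minimal, self-contained sketch of how LINK_REGEX plus the hostname reduction in pagescraper() group links by registrable domain. The sample HTML and domain are invented.

from re import compile, VERBOSE

LINK_REGEX = compile('''
  href="(       # all links start with href=
    https?://   # external links carry a scheme; internal wiki links start with /wiki
    ([^/"]+)    # the domain
    [^"]*       # the rest of the URL
  )"
''', VERBOSE)

sample = '<a href="https://forums.example.com/thread/123">a</a> <a href="/wiki/Scout">b</a>'
for m in LINK_REGEX.finditer(sample):
  hostname = '.'.join(m[2].split('.')[-2:]).lower()
  print(m[1], '->', hostname)  # prints 'https://forums.example.com/thread/123 -> example.com'

Running it shows the internal /wiki/Scout link is ignored, because it has no http(s) scheme.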
75 changes: 75 additions & 0 deletions bad_fragments.py
@@ -0,0 +1,75 @@
from re import compile, VERBOSE
from utils import pagescraper_queue, time_and_date
from wikitools import wiki

verbose = False

# Within the HTML source code, internal wiki links look like href="/wiki/Title#Fragment". This finds every internal link which targets a specific section.
LINK_REGEX = compile('''
  href="/wiki/   # Within the HTML source code, all wiki links are href="/wiki/..."
  ([^?#"]*)      # 1: Title
  \\#([^"]*)     # 2: Fragment
  "
''', VERBOSE)

# Section headings render as <span class="mw-headline" id="...">, so these ids are the valid fragments for a page.
ANCHOR_REGEX = compile('<span class="mw-headline" id="([^"]*)">')

def pagescraper(page, links, sections):
  text = page.get_raw_html()

  page_links = []
  page_sections = []

  for m in LINK_REGEX.finditer(text):
    page_links.append((m[1], m[2]))

  for m in ANCHOR_REGEX.finditer(text):
    anchor = m[1]
    page_sections.append(anchor)

  links[page.title] = page_links
  sections[page.title] = page_sections


def main(w):
  # First, get all of the page contents to find links and section headers
  links = {}
  sections = {}
  with pagescraper_queue(pagescraper, links, sections) as pages:
    for page in w.get_all_pages():
      pages.put(page)

  total_bad_links = 0
  bad_links = {}
  for page, page_links in links.items():
    bad_page_links = []
    for target_page, target_section in page_links:
      if target_page in sections and target_section not in sections[target_page]:
        bad_page_links.append((target_page, target_section))
        total_bad_links += 1
    if len(bad_page_links) > 0:
      bad_links[page] = bad_page_links

  output = """\
{{{{DISPLAYTITLE: {total_bad_links} links to nonexistent section headings}}}}
There are <onlyinclude>{total_bad_links}</onlyinclude> links from {bad_pages} pages which do not point to valid subsections. Data as of {date}.

{{{{TOC limit|3}}}}
""".format(
    total_bad_links=total_bad_links,
    bad_pages=len(bad_links),
    date=time_and_date())

  for page in sorted(bad_links.keys()):
    output += f'== [[{page}]] ==\n'
    for target_page, target_section in sorted(bad_links[page]):
      output += f'* [[{target_page}#{target_section}]]\n'

  return output

if __name__ == '__main__':
  verbose = True
  w = wiki.Wiki('https://wiki.teamfortress.com/w/api.php')
  with open('wiki_bad_fragments.txt', 'w', encoding='utf-8') as f:
    f.write(main(w))
  print(f'Article written to {f.name}')
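For illustration only (not part of the diff): how the two regexes above combine to flag a section link whose fragment has no matching mw-headline anchor on the target page. The sample HTML is invented.

from re import compile

LINK_REGEX = compile(r'href="/wiki/([^?#"]*)#([^"]*)"')
ANCHOR_REGEX = compile(r'<span class="mw-headline" id="([^"]*)">')

target_html = '<span class="mw-headline" id="Strategy">Strategy</span>'
source_html = '<a href="/wiki/Scout#Strategy">ok</a> <a href="/wiki/Scout#Taunts">broken</a>'

sections = {'Scout': ANCHOR_REGEX.findall(target_html)}  # {'Scout': ['Strategy']}
for title, fragment in LINK_REGEX.findall(source_html):
  if title in sections and fragment not in sections[title]:
    print(f'[[{title}#{fragment}]] points to a section that does not exist')  # flags Scout#Taunts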
4 changes: 1 addition & 3 deletions master.py
@@ -10,12 +10,10 @@
 import open_pr_comment
 
 # Reports I want:
-# Now that I have wikitext caching, many things are faster. Write a report for Redirects which link to non-existant subsections
 # images without licensing?
 # Quotations which use quote characters
 # Using {{lang}} and {{if lang}} on non-template pages
 # Direct links to disambig pages
-# Just... a summary of every single external link. Maybe just 'count per domain' and then list the top 10 pages? I'm finding a LOT of sus links, and it's only the ones that are *broken*.
 # Lang template mis-ordering and lang-template duplicate keys
 # Templates sorted by usage and protect status

@@ -75,7 +73,7 @@ def publish_report(w, module, report_name, root, summary):
   'unused_files': 'Unused files',
   'undocumented_templates': 'Undocumented templates',
   'edit_stats': 'Users by edit count',
-  'external_links2': 'External links',
+  'all_external_links': 'All external links',
   'mismatched': 'Mismatched parenthesis',
   'displaytitles': 'Duplicate displaytitles',
 }
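For orientation, a hedged sketch of the contract the table above relies on: each report module defines main(w) which returns the report body as wiki text (the new files in this PR follow that shape). The importlib glue below is hypothetical; the real publish_report() is outside the visible hunk.

import importlib

def run_report(w, module_name):
  # Hypothetical loader, for illustration only
  module = importlib.import_module(module_name)
  return module.main(w)  # each report module returns wiki text from its main(w)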
5 changes: 5 additions & 0 deletions utils.py
@@ -33,6 +33,7 @@ def __enter__(self):
     self.q = Queue()
     self.done = Event()
     self.threads = []
+    self.count = 0
     self.failures = 0
     for _ in range(self.num_threads):
       thread = Thread(target=self.meta_thread_func)
@@ -42,6 +43,10 @@
 
   def put(self, obj):
     self.q.put(obj)
+    self.count += 1
+
+  def __len__(self):
+    return self.count
 
   def __exit__(self, exc_type, exc_val, traceback):
     self.done.set()
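A minimal, self-contained sketch of the counting pattern these additions introduce: put() counts every item ever queued rather than how many are still waiting, so len() stays meaningful after workers drain the queue. The class below is a stand-in; the real pagescraper_queue also manages its worker threads.

from queue import Queue

class counting_queue:
  def __init__(self):
    self.q = Queue()
    self.count = 0

  def put(self, obj):
    self.q.put(obj)
    self.count += 1

  def __len__(self):
    return self.count

pages = counting_queue()
for title in ['Scout', 'Scout/fr', 'Scout/de']:
  pages.put(title)
print(len(pages))  # 3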
4 changes: 3 additions & 1 deletion wikitools/page.py
@@ -23,7 +23,9 @@ def __repr__(self):
     return f'Page(w, {self.title})'
 
   def __le__(self, other):
-    return self.url_title < other.url_title
+    if self.lang == other.lang:
+      return self.url_title <= other.url_title
+    return self.lang == 'en' or (other.lang != 'en' and self.lang < other.lang)
 
   def get_wiki_text(self):
     cached_text = self.wiki.page_text_cache.get(self.title, None)
62 changes: 62 additions & 0 deletions wikitools/tests.py
@@ -0,0 +1,62 @@
# A very light smattering of tests
import inspect
import sys

from page import Page

class MockWiki:
  def __init__(self):
    pass

class Tests:
  # Class setup
  wiki = MockWiki()

  # Utilities
  def sort_titles(self, titles):
    pages = [Page(self.wiki, title) for title in titles]
    pages.sort()
    return [page.title for page in pages]

  #############
  #!# Tests #!#
  #############
  def test_sort_pages(self):
    actual = self.sort_titles(['Scout/zh-hans', 'Scout/ru', 'Scout/pt-br', 'Scout/es', 'Scout', 'Scout/fr', 'Scout/it'])
    expected = ['Scout', 'Scout/es', 'Scout/fr', 'Scout/it', 'Scout/pt-br', 'Scout/ru', 'Scout/zh-hans']
    assert expected == actual, f'{expected}\n{actual}'
    actual = self.sort_titles(['Scout', 'Soldier', 'Pyro', 'Demoman', 'Heavy', 'Engineer', 'Medic', 'Sniper', 'Spy'])
    expected = ['Demoman', 'Engineer', 'Heavy', 'Medic', 'Pyro', 'Scout', 'Sniper', 'Soldier', 'Spy']
    assert expected == actual, f'{expected}\n{actual}'
    actual = self.sort_titles(['Scout/ko', 'Soldier/ja', 'Pyro/it', 'Demoman/hu', 'Heavy/fr', 'Engineer/de', 'Medic/cs', 'Sniper/ar', 'Spy'])
    expected = ['Spy', 'Sniper/ar', 'Medic/cs', 'Engineer/de', 'Heavy/fr', 'Demoman/hu', 'Pyro/it', 'Soldier/ja', 'Scout/ko']
    assert expected == actual, f'{expected}\n{actual}'

if __name__ == '__main__':
  tests = Tests()

  def is_test(method):
    return inspect.ismethod(method) and method.__name__.startswith('test')
  tests = list(inspect.getmembers(tests, is_test))
  tests.sort(key=lambda func: func[1].__code__.co_firstlineno)

  for test in tests:
    if len(sys.argv) > 1: # Requested specific test(s)
      if test[0] not in sys.argv[1:]:
        continue

    # Test setup (nothing yet)

    # Run test
    print('---', test[0], 'started')
    try:
      test[1]()
    except Exception:
      print('!!!', test[0], 'failed:')
      import traceback
      traceback.print_exc()
      sys.exit(-1)

    print('===', test[0], 'passed')

  print('\nAll tests passed')
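To run these locally: `python tests.py` runs every test_* method in definition order, and `python tests.py test_sort_pages` runs just that test, since extra arguments are treated as test names. This assumes the working directory is wikitools/, because the file imports `page` directly.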