#!/usr/bin/env python3
'''
Copyright (C) 2017 Povilas Kanapickas <[email protected]>

This file is part of cppreference-doc

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
'''

import argparse
import urllib.parse
import urllib.request
import json
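
# This script enumerates the titles of every page in the requested namespaces
# through the wiki's api.php endpoint and then posts that title list to
# Special:Export, saving the current revision of each page as one XML dump.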


def retrieve_page_names(root, ns_index):
    # Collects the titles of all pages in the given namespace, following the
    # 'apcontinue' continuation parameter until the API reports no more pages.
    begin = None
    pages = []
    while True:
        params = {
            'action': 'query',
            'list': 'allpages',
            'apnamespace': ns_index,
            'aplimit': 500,
            'format': 'json'
        }
        if begin is not None:
            params['apcontinue'] = begin

        url = "{0}/api.php?{1}".format(root, urllib.parse.urlencode(params))
        with urllib.request.urlopen(url) as f:
            data = json.loads(f.read().decode('utf-8'))

        pages += [p['title'] for p in data['query']['allpages']]

        # Keep paginating while the response carries a continuation point;
        # otherwise the full list has been collected.
        if ('query-continue' in data and
                'allpages' in data['query-continue'] and
                'apcontinue' in data['query-continue']['allpages']):
            begin = data['query-continue']['allpages']['apcontinue']
        else:
            return pages


def export_pages(root, pages, output_path):
    # Submits the page titles to Special:Export and saves the resulting XML
    # dump (current revisions only, due to 'curonly') to output_path.
    params = {
        'wpDownload': '',
        'curonly': 1,
        'pages': '\n'.join(pages)
    }
    data = urllib.parse.urlencode(params)
    data = data.encode('ascii')

    url = "{0}/index.php?title=Special:Export&action=submit".format(root)
    urllib.request.urlretrieve(url, output_path, data=data)


def main():
    parser = argparse.ArgumentParser(prog='export.py')
    parser.add_argument('--url', type=str,
                        help='The URL to the root of the MediaWiki installation')
    parser.add_argument('output_path', type=str,
                        help='The path to the XML file to save output to')
    parser.add_argument('ns_index', type=str, nargs='+',
                        help='The indices of the namespaces to retrieve')
    args = parser.parse_args()

    # Gather titles from every requested namespace, then export them all in a
    # single Special:Export request.
    pages = []
    for ns_index in args.ns_index:
        new_pages = retrieve_page_names(args.url, ns_index)
        print("Retrieved {0} pages for namespace {1}".format(len(new_pages),
                                                             ns_index))
        pages += new_pages

    pages = sorted(pages)
    export_pages(args.url, pages, args.output_path)


if __name__ == "__main__":
    main()
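
# A hypothetical invocation, for reference only; the URL and namespace indices
# below are assumptions about a typical MediaWiki setup, not values taken from
# this repository:
#
#   ./export.py --url https://en.cppreference.com/mwiki pages.xml 0 10
#
# In a default MediaWiki configuration, namespace 0 is the main namespace and
# 10 is the Template namespace; the exported pages are written to pages.xml.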