backlog.py
# -*- coding: utf-8 -*-
### marks lines that need to be changed for different categories
import sys
reload(sys)
sys.setdefaultencoding("utf-8") # to handle UnicodeDecode errors
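# note: reload(sys) and sys.setdefaultencoding only exist in Python 2, so this
# script is assumed to run under a Python 2 interpreter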
from math import ceil # top 20% of rankings
from traceback import format_exc # to handle errors
import pickle # to store article rankings
import json # for parsing the json response
from urllib2 import urlopen # to load urls
from os import path, listdir
from operator import itemgetter # to rank articles in the order of decreasing pageviews in a list
# from collections import OrderedDict # to store articles in the order of decreasing pageviews in a dict
from pageviews import format_date, article_views # to get pageviews
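# note: article_views(title) from the local pageviews module is assumed to return
# a single numeric pageview count for the given article title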
# cmlimit to specify number of articles to extract, max can be 500 (5000 for bots)
# cmtitle for name of Category to look in
# cmstartsortkeyprefix for starting the article listing from a particular letter or prefix,
# e.g. 'b' for PA outdated
category_api_url = 'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmlimit=500&format=json&cmstartsortkeyprefix=m' ###
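# e.g. with '&cmtitle=Category:All_NPOV_disputes' appended below, the full request becomes
# https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmlimit=500&format=json&cmstartsortkeyprefix=m&cmtitle=Category:All_NPOV_disputes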
recdir = 'TL_records' + path.sep ###
def nextrecord():
    try:
        records = listdir(recdir)
        record = 1 + int(max(records)[:9])
        ### todo: check for improperly named files
        return format(record, '09')
    except:
        return format(1, '09')
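# e.g. with no TL_records directory (or an empty one) nextrecord() returns '000000001';
# if the highest existing record file is '000000041q' it returns '000000042'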
if __name__ == '__main__':
    #category_list = ['Category:All_Wikipedia_articles_in_need_of_updating',
    #                 'Category:All_NPOV_disputes']
    try:
        category_url = '&cmtitle='.join([category_api_url, 'Category:All_NPOV_disputes'])  ###
        json_obj = urlopen(category_url).read()
    except:
        print "Error while obtaining articles from Category API"
        print format_exc()
        exit()  # json_obj is undefined at this point, so stop instead of failing later
    readable_json = json.loads(json_obj)
    cnt = 0
    d = []  # list of lists of rankings to be stored in a pickle file
    for ele in readable_json['query']['categorymembers']:
        title = ele['title']
        link = '/'.join(['https://en.wikipedia.org/wiki', title.replace(' ', '_')])
        categ = 'Category:All_NPOV_disputes'  ###
        pageviews = article_views(title)
        print cnt+1, title, pageviews
        d.append([title, link, pageviews, categ])
        cnt = cnt + 1
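    # each entry of d has the form [title, link, pageviews, categ], e.g. (hypothetical values)
    # ['Some article', 'https://en.wikipedia.org/wiki/Some_article', 1234, 'Category:All_NPOV_disputes']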
    # od = OrderedDict(sorted(d.items(), key=lambda t: t[1][1], reverse=True))  # ordered dict in descending order of final score
    od = sorted(d, key=itemgetter(2), reverse=True)  # ordered list in descending order of pageviews
    print '\n\nArticle rankings based on pageviews:\n'
    for item in od:
        print item

    #with open('npov_b_ranking.pkl', 'wb') as f:
    with open('TL_pickles/npov_m_ranking.pkl', 'wb') as f:  ###
        pickle.dump(od, f)
# if __name__ == '__main__':
#     with open('PA_pickles/npov_m_ranking.pkl', 'rb') as f:  ### use when od has already been created; comment out the fetching/ranking block above
#         od = pickle.load(f)
    cnt = 0
    counter = int(ceil(0.2 * len(od)))  # top 20% of rankings
    #url = 'http://127.0.0.1:5000/ask'  # url for POSTing to ask. Replace with Labs/PythonAnywhere instance if needed
    for i in od:
        # POSTing to ask (would also need `import requests` if uncommented)
        # data = {'question': 'The article ' + i[1] + ' is in https://en.wikipedia.org/wiki/' + i[3] + '.\nHow would you resolve it?\n' + i[3],
        #         'iframeurl': i[1]}
        # r = requests.post(url, data=data)
        fn = recdir + nextrecord() + 'q'
        print fn
        if path.exists(fn):
            print('A billion questions reached! Start answering!')
            exit()
        f = open(fn, 'w')
        # use 'How would you resolve it?' for NPOV and 'How would you update it?' for outdated
        f.write('The article <a target="_blank" href="' + i[1] + '">' + i[0] +
                '</a> is in <a target="_blank" href="https://en.wikipedia.org/wiki/' + i[3] + '">' + i[3] +
                '</a>. How would you resolve it?<br/><a style="float:right;" href="' +
                i[1] + '">' + i[1] + '</a><iframe src="' + i[1] +
                '" style="height: 40%; width: 100%;">[Cannot display <a target="_blank" href="' + i[1] + '">'
                + i[1] + '</a> inline as an iframe here.]</iframe>')  ###
        f.close()
        cnt += 1
        if cnt == counter:
            exit()
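    # each iteration writes one question record such as TL_records/000000001q containing the
    # HTML prompt above; only the top 20% of the ranked articles (counter) get records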