-
Notifications
You must be signed in to change notification settings - Fork 0
/
inlinks.py
176 lines (146 loc) · 6.58 KB
/
inlinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import pandas as pd
import json
import urllib.parse
def wiki_homepages(pagename, json_file, titles_file):
    '''
    Generates a list of inlinks (pages that link to pagename) of a page.

    Inputs:
        pagename - name of the page of interest
        json_file - file of the json containing the line numbers of
                    pages and their inlinks
        titles_file - file containing pagenames of each number in json_file
                      (one title per line; line N holds the title of page N)

    Output:
        list of inlinks (page titles as strings); an empty list when the
        page has no entry in json_file; None (after printing a message)
        when pagename is not found in titles_file
    '''
    # Generates a dataframe containing pagenames for each line number.
    # dtype=str / keep_default_na=False keep every title as the literal text
    # in the file, so titles such as "NA" or "123" are not turned into NaN
    # or numbers by pandas' type inference.
    titles = pd.read_csv(titles_file, delimiter=' ', names=['page title'],
                         dtype=str, keep_default_na=False)
    page_titles = titles['page title']

    # Finds the (0-based) row positions whose title equals pagename
    matches = (page_titles == pagename).to_numpy().nonzero()[0]
    if matches.size == 0:
        print('pagename does not exist')
        return None

    # Line numbers in json_file are 1-based, row positions are 0-based
    page_line_num = int(matches[0]) + 1

    # Extracts the list of inlink line numbers of pagename; the context
    # manager closes json_file even if parsing fails
    with open(json_file, 'r') as f:
        homepage_line_nums = json.load(f).get(str(page_line_num), [])

    # Converts each inlink line number back to its page title
    return [page_titles.iloc[int(line_num) - 1]
            for line_num in homepage_line_nums]
def one_to_five_inlinks_dump(json_file):
    '''
    Filters the inlinks mapping down to the pages that have between one and
    five inlinks (inclusive) and writes the result as json to a file named
    'one_to_five_inlinks' in the current working directory.

    Inputs:
        json_file - file of the json containing the line numbers of
                    pages and their inlinks
    '''
    with open(json_file, 'r') as source:
        all_inlinks = json.load(source)

    # keep only the entries whose inlink list has one to five elements
    selected = {
        page: inlinks
        for page, inlinks in all_inlinks.items()
        if 1 <= len(inlinks) <= 5
    }

    with open('one_to_five_inlinks', 'w') as sink:
        json.dump(selected, sink)
def one_to_five_inlinks_sample_dump(json_file, titles_file, num_of_inlinks,
                                    num_of_pages):
    '''
    Generates a dict of up to num_of_pages pages that each have exactly
    num_of_inlinks inlinks and dumps it as json to
    'samples/sample_<num_of_inlinks>_<num_of_pages>'. Keys are page titles
    and values are lists of the titles of the pages linking to them. The
    dumped file is the one loaded by one_to_five_inlinks_sample.

    If fewer than num_of_pages suitable pages exist, every suitable page is
    dumped and a message with the number found is printed.

    Inputs:
        json_file - file of the json containing the line numbers of
                    pages and their inlinks
        titles_file - file containing pagenames of each number in json_file
        num_of_inlinks - parameter determining the number of inlinks for each
                         page
        num_of_pages - parameter determining the number of pages needed

    Output:
        None (the sample is written to disk)
    '''
    import os

    # dtype=str / keep_default_na=False keep every title as the literal text
    # in the file (no "NA" -> NaN or "123" -> 123 conversions), so every key
    # and value written to the sample is a plain string.
    titles = pd.read_csv(titles_file, delimiter=' ', names=['page title'],
                         dtype=str, keep_default_na=False)
    page_titles = titles['page title']

    with open(json_file, 'r') as f:
        inlinks_by_line = json.load(f)

    inlinks_sample = {}
    count = 0
    for key, val in inlinks_by_line.items():
        if count >= num_of_pages:
            break
        if len(val) != num_of_inlinks:
            continue
        # convert key and values from 1-based line numbers to pagenames.
        # NOTE(review): the previous version also computed a percent-decoded
        # copy of each title but never used it; names are stored exactly as
        # they appear in titles_file, as before.
        key_name = page_titles.iloc[int(key) - 1]
        val_names = [page_titles.iloc[int(v) - 1] for v in val]
        # setdefault/extend merges inlinks if two lines share a title
        inlinks_sample.setdefault(key_name, []).extend(val_names)
        count += 1

    # if num_of_pages exceeds the number of pages that have num_of_inlinks
    # inlinks, all the pages with num_of_inlinks are dumped anyway
    if count < num_of_pages:
        print('there are only {} pages'.format(count))

    # create the output directory up front so the dump cannot fail on a
    # missing 'samples' folder after all the work has been done
    os.makedirs('samples', exist_ok=True)
    with open('samples/sample_' + str(num_of_inlinks) + '_' +
              str(num_of_pages), 'w') as f:
        json.dump(inlinks_sample, f)
    return None
def one_to_five_inlinks_sample(num_of_inlinks, num_of_pages):
    '''
    Reads back the json sample stored under
    'samples/sample_<num_of_inlinks>_<num_of_pages>' and returns its
    contents.

    Inputs:
        num_of_inlinks - parameter determining the number of inlinks for each
                         page
        num_of_pages - parameter determining the number of pages needed

    Output:
        the dict loaded from the sample file for num_of_inlinks and
        num_of_pages
    '''
    # the file name is built from the two parameters, joined by '_'
    suffix = '_'.join(map(str, (num_of_inlinks, num_of_pages)))
    sample_path = 'samples/sample_' + suffix
    with open(sample_path, 'r') as sample_file:
        return json.load(sample_file)
def two_inlinks_sample(json_file_two):
    '''
    Picks a fixed set of five pages out of a json sample of pages with two
    inlinks each.

    Input:
        json_file_two - a json file with a dictionary sample of pages
                        with two inlinks each

    Output:
        a dict mapping each of the five selected pagenames to the value
        stored for it in json_file_two (a KeyError is raised if one of
        them is absent from the file)
    '''
    # pagenames to extract, in the order they are inserted into the result
    selected_pages = (
        'Wrestling_Slang',
        'Concordia_University,_St._Paul',
        'A_Spaceman_Came_Travelling_(Christmas_Remix)',
        'Transcendentals',
        'Platinum_Card',
    )
    with open(json_file_two, 'r') as handle:
        inlinks_dict = json.load(handle)

    picked = {}
    for page in selected_pages:
        picked[page] = inlinks_dict[page]
    return picked