makeindex.py
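"""Build the per-chapter keyword appendix for the thesis.

For every chapter wrapper file matching *_*.tex, follow the \\subimport path
to the chapter's source, strip LaTeX markup, score the remaining words with
TF-IDF, and write the top terms per chapter to appendices/keywords.tex.
"""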
import glob
import re

import pandas as pd
from nltk.corpus import stopwords  # requires a one-off nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words, extended with LaTeX commands and other markup tokens.
stop = list(stopwords.words('english'))
stop.extend('section subsection subsubsection figure table item paragraph begin end caption autoref includegraph png pdf textwidth endsubfigure subfigure the is this centering label textbf textit emph fig cite citep width height includegraphics sec use equation eqn pp conscat ch also nc used using pt within align https com ac whereupon github'.split())
# NOTE: most of these are stripped by the regex substitutions below anyway.
files = sorted(glob.glob('*_*.tex'))  # one wrapper file per chapter
corpus = []
title = []
for f in files:
    try:
        with open(f) as fh:
            wrapper = fh.read()
        # The wrapper points at the chapter's source directory via \subimport.
        loc = re.findall(r'\\subimport\{([^\}]*)', wrapper)[0]
        with open('%scombigned.tex' % loc) as fh:
            text = fh.read().lower()
        title.append(re.findall(r'\\chapter\{(.*)\}', wrapper)[0])
        text = re.sub(r'\s*%.*\n', '', text)  # strip LaTeX comments
        text = re.sub(r'\\\w+ *', '', text)   # strip LaTeX commands
        # Drop hyphens so hyphenated words merge, then collapse remaining
        # punctuation, digits and whitespace into single spaces.
        text = re.sub(r'[\._\W\s\d]+', ' ', re.sub('-', '', text))
        corpus.append(text)
    except (OSError, IndexError):
        pass  # skip wrappers with no \subimport/\chapter or a missing source
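# A wrapper file is assumed to look roughly like this (inferred from the
# regexes above; the names are illustrative):
#
#   \chapter{Introduction}
#   \subimport{1_introduction/}{combigned}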
vectorizer = TfidfVectorizer(analyzer='word', stop_words=stop)
vectors = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
names = vectorizer.get_feature_names_out()
data = vectors.toarray()
# One row per chapter, one column per term; cells hold TF-IDF weights.
df = pd.DataFrame(data, columns=names, index=title)
tfidf = '''
\\chapter{Chapter Keywords}
This appendix uses the term frequency-inverse document frequency (TF-IDF) statistic to determine the keywords of each chapter, a technique described in \\autoref{ch3}. Text size corresponds to the importance of each word.
'''
N = 100  # number of keywords kept per chapter
for chapter, row in df.iterrows():
    # The greedy \chapter regex can capture past the closing brace; trim it.
    chapter = chapter.split('}')[0]
    print('\n\n', chapter)
    tfidf += '\n\n \\section{%s}\n {\\fontfamily{cmtt}\\selectfont \\parbox{\\textwidth}{\n' % chapter
    rankings = row.sort_values(ascending=False)[:N]
    print(rankings)
    # Series.iteritems() was removed in pandas 2.0; use items() instead.
    for word, score in rankings.items():
        tfidf += '\\size{%d}{ %s }' % (1 + score * 100, word.upper())
    tfidf += '} }'
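# Each chapter section therefore renders roughly as (illustrative values):
#   \section{Introduction}
#   {\fontfamily{cmtt}\selectfont \parbox{\textwidth}{
#   \size{12}{ KEYWORD }\size{9}{ ANOTHER } ... } }
# \size is assumed to be a custom macro in the thesis preamble that maps the
# TF-IDF weight to a font size; it is not a standard LaTeX command.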
with open('appendices/keywords.tex','w') as f:
f.write(tfidf)
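# The generated file can then be pulled into the main document, e.g. with
# \input{appendices/keywords} (assumed; depends on the thesis's structure).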