-
Notifications
You must be signed in to change notification settings - Fork 17
/
demo_catvec.py
73 lines (67 loc) · 3 KB
/
demo_catvec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
import sys, os
import urllib
from cat2vec import Category2Vec
from sent2vec import Sentence2Vec
from sentences import SampledWikiSentence as WikiSentence
import utils
import logging
# Local file names and download URLs of the two Wikipedia plain-text dumps
# (jawiki / enwiki) this demo knows about.
jawiki_name = "jawiki.tsv.gz"
jawiki_url = "https://s3-ap-northeast-1.amazonaws.com/category2vec/jawiki-20141122-pages_plaintext.tsv.gz"
enwiki_name = "enwiki.tsv.gz"
enwiki_url = "https://s3-ap-northeast-1.amazonaws.com/category2vec/enwiki-20141106-pages_plaintext.tsv.gz"
# Dataset actually used by the script below; point both names at the
# jawiki_* pair to run the demo on the other dump instead.
wiki_name = enwiki_name
wiki_url = enwiki_url
# Name of the sub-directory (created next to this script) that holds the
# saved model file.
model_dir = "models"
# Logger used for the download progress messages of the demo.
logger = logging.getLogger("demo")
def _print_neighbours(header, neighbours):
    """Print a two-column table of (name, similarity) entries.

    ``header`` is printed verbatim, followed by a dashed rule, one row per
    entry and a trailing blank line.  Each entry is indexed as
    ``entry[0]`` (name) and ``entry[1]`` (similarity).  The name column is
    padded to 30 characters, measured on ``utils.to_utf8(name)``.
    """
    # Single-argument print(...) behaves exactly like the print statement on
    # Python 2 and keeps this block parseable by Python 3 tooling.
    print(header)
    print("-" * 45)
    for entry in neighbours:
        padding = " " * max(30 - len(utils.to_utf8(entry[0])), 0)
        print("%s %s %s" % (entry[0], padding, entry[1]))
    print("")


if __name__ == "__main__":
    # Interactive demo: fetch the Wikipedia dump if it is missing, train (or
    # load) a Category2Vec model, then answer similarity queries read from
    # stdin until EOF or the literal line "EXIT".
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    wikip_data = os.path.join(current_dir, wiki_name)
    model_path = os.path.join(current_dir, model_dir)
    c2v_model_name = os.path.join(model_path, wiki_name + "_cat.model")
    if not os.path.exists(model_path):
        os.mkdir(model_path)

    if not os.path.isfile(wikip_data):
        logger.info("downloading Wikipedia data")
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for the complete dump.
        partial_data = wikip_data + ".part"
        try:
            urllib.urlretrieve(wiki_url, partial_data)
            os.rename(partial_data, wikip_data)
        finally:
            # After a successful rename the partial file no longer exists,
            # so this only cleans up after a failure or an interrupt.
            if os.path.isfile(partial_data):
                os.remove(partial_data)
        logger.info("downloaded in %s", wikip_data)

    sentences = WikiSentence(wikip_data)
    if not os.path.isfile(c2v_model_name):
        # No saved model yet: train one and save it for subsequent runs.
        model = Category2Vec(sentences, iteration=20, model="cb", hs=1, negative=0, size=300)
        model.save(c2v_model_name)
    else:
        model = Category2Vec.load(c2v_model_name)

    while True:
        print("Input a category name or an article title (type EXIT to exit)")
        sys.stdout.write("Name: ")
        # The prompt has no trailing newline; flush so it is visible before
        # the script blocks on stdin.
        sys.stdout.flush()
        line = sys.stdin.readline()
        if not line:
            # readline() returns an empty string only at EOF.
            break
        line = utils.to_unicode(line.rstrip())
        if line == "EXIT":
            break
        try:
            is_category = line in model.cat_no_hash
            is_article = line in model.sent_no_hash
            if is_category:
                cat_vec = model.cats[model.cat_no_hash[line]]
                # 11 results are requested and the first one is dropped
                # (presumably the query itself -- confirm against
                # Category2Vec.most_similar_category).
                ncats = model.most_similar_category(cat_vec, 11)
                _print_neighbours("Similar categories similarity", ncats[1:])
            if is_article:
                sent_vec = model.sents[model.sent_no_hash[line]]
                nsents = model.most_similar_sentence(sent_vec, 11)
                _print_neighbours("Similar articles similarity", nsents[1:])
            if not is_category and not is_article:
                print("we couldn't find the specified category/article")
                print("")
        except Exception:
            # Keep the session alive, but record the traceback instead of
            # silently discarding it.
            logger.exception("lookup failed for %r", line)
            print("something wrong is happened")