-
Notifications
You must be signed in to change notification settings - Fork 8
/
word2vec.py
executable file
·100 lines (89 loc) · 3.78 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import logging
import os
import sys
from gensim.models import Word2Vec
# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
class SentencesIterator(object):
    """Restartable iterable of whitespace-tokenized sentences read from stdin.

    Each input line becomes one sentence: the line is stripped and split on
    runs of whitespace, so a blank line yields an empty list.

    ``sys.stdin`` can be consumed only once, yet a corpus handed to
    ``Word2Vec`` is iterated several times (one scan to build the vocabulary,
    then one scan per training epoch).  To make every pass see the same data,
    the first iteration copies all of stdin into an anonymous temporary file
    and every pass -- including the first -- replays the lines from that
    file.  The copy lives on disk rather than in memory and disappears when
    the file object is closed or garbage-collected.

    Consequences worth knowing:

    * the first sentence is produced only after stdin has been read to EOF;
    * all passes share one file position, so passes must run one after
      another, never interleaved.
    """

    # Text lines are stored as UTF-8.  On Python 3 "surrogatepass" keeps the
    # round-trip lossless even for lone surrogates that the "surrogateescape"
    # handler on stdin may have produced; Python 2 has no such handler and
    # its UTF-8 codec already accepts surrogates.
    _ENCODING = "utf-8"
    _ERRORS = "surrogatepass" if sys.version_info[0] >= 3 else "strict"

    def __init__(self):
        # Temporary file holding the copy of stdin; created on first use.
        self._spool = None
        # True when stdin yielded text (unicode) lines that must be decoded
        # again on replay; False when it yielded byte strings (Python 2).
        self._is_text = False

    def _spool_stdin(self):
        """Copy every line of stdin into a binary temp file and return it."""
        import tempfile  # local import: only this helper needs it

        spool = tempfile.TemporaryFile(mode="w+b")
        try:
            for line in sys.stdin:
                if not isinstance(line, bytes):
                    self._is_text = True
                    line = line.encode(self._ENCODING, self._ERRORS)
                spool.write(line)
        except Exception:
            # Do not keep a half-written spool around.
            spool.close()
            raise
        return spool

    def __iter__(self):
        """Yield one list of tokens per input line, from the first line on."""
        if self._spool is None:
            self._spool = self._spool_stdin()
        # Rewind so that every pass starts at the first sentence.
        self._spool.seek(0)
        for raw in self._spool:
            if self._is_text:
                line = raw.decode(self._ENCODING, self._ERRORS)
            else:
                line = raw
            yield line.strip().split()
def _build_parser():
    """Return the argument parser describing the script's command line."""
    parser = argparse.ArgumentParser(description="Word2Vec algorithm")
    parser.add_argument("output",
                        type=str,
                        metavar="OUTPUT",
                        help="File to store the word vectors.")
    parser.add_argument("vocab",
                        type=str,
                        metavar="VOCAB_OUTPUT",
                        help="File to store the vocabulary")
    parser.add_argument("--size",
                        type=int,
                        metavar="SIZE",
                        help="Set size of word vectors.",
                        default=300)
    parser.add_argument("--window",
                        type=int,
                        metavar="WINDOW",
                        help="Max skip length between words.",
                        default=5)
    parser.add_argument("--threads",
                        type=int,
                        metavar="THREADS",
                        help="Set the number of threads for parallelizing.",
                        default=12)
    parser.add_argument("--min_count",
                        type=int,
                        metavar="MIN_COUNT",
                        help="Set the minimum number of occurrences for a word",
                        default=5)
    parser.add_argument("--sample",
                        type=float,
                        metavar="SAMPLE",
                        help="Threshold for configuring which higher-frequency words are randomly downsampled",
                        default=0.001)
    parser.add_argument("--alpha",
                        type=float,
                        metavar="ALPHA",
                        help="Set the starting learning rate",
                        default=0.001)
    parser.add_argument("--iter",
                        type=int,
                        metavar="ITERATIONS",
                        help="number of iterations (epochs) over the corpus.",
                        default=5)
    parser.add_argument("--cbow",
                        action="store_true",
                        help="Train using CBOW instead of SkipGram.")
    parser.add_argument("--cbow-mean",
                        action="store_true",
                        help="Use mean instead of sum when using CBOW.")
    parser.add_argument("--negs",
                        type=int,
                        metavar="NEGATIVE_SAMPLING_COUNT",
                        help="If > 0, negative sampling will be used, the int for negative specifies how many " +
                             "\"noise words\" should be drawn (usually between 5-20).",
                        default=10)
    return parser


def _model_config(args):
    """Map parsed command-line arguments to ``Word2Vec`` keyword arguments."""
    # NOTE(review): "size" and "iter" are the pre-4.0 gensim parameter names
    # (later renamed "vector_size" and "epochs") -- confirm the gensim
    # version this script is meant to run against.
    return {
        "size": args.size,
        "window": args.window,
        "workers": args.threads,
        "min_count": args.min_count,
        "sample": args.sample,
        "alpha": args.alpha,
        "iter": args.iter,
        "negative": args.negs,
        # Skip-gram is the default; --cbow switches to CBOW.
        "sg": 0 if args.cbow else 1,
        # Hierarchical softmax is enabled only when negative sampling is off.
        "hs": 0 if args.negs > 0 else 1,
        "cbow_mean": 1 if args.cbow_mean else 0
    }


def main():
    """Train a Word2Vec model on sentences from stdin and save its vectors.

    The vectors are written to the OUTPUT path in the binary word2vec format
    and the vocabulary to the VOCAB_OUTPUT path.  Progress messages go to
    stderr.
    """
    args = _build_parser().parse_args()
    model_config = _model_config(args)
    sentences = SentencesIterator()
    print("Creating and Training Word2Vec model.", file=sys.stderr)
    model = Word2Vec(sentences, **model_config)
    print("Saving the model in Word2Vec binary format.", file=sys.stderr)
    model.wv.save_word2vec_format(args.output, fvocab=args.vocab, binary=True)


if __name__ == "__main__":
    main()