From f6c558a4d21ebc3411d85794eb5c38d0bb20ecb6 Mon Sep 17 00:00:00 2001 From: ynqa Date: Tue, 8 Dec 2020 02:14:50 +0900 Subject: [PATCH] uPdate --- README.md | 81 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 3d11192..ff3c8b5 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ Inspired by [Data Science in Go](https://speakerdeck.com/chewxy/data-science-in- ## Installation +Use the `go` command to get this package. + ``` $ go get -u github.com/ynqa/wego $ bin/wego -h @@ -59,45 +61,64 @@ Available Commands: word2vec Word2Vec: Continuous Bag-of-Words and Skip-gram model ``` -### Go SDK +`word2vec`, `glove` and `lexvec` execute the workflow to generate word vectors: +1. Build a dictionary for vocabularies and count word frequencies by scanning a given corpus. +2. Start training. The execution time depends on the size of the corpus, the hyperparameters (flags), and so on. +3. Save the words and their vectors as a text file. -```go -package main +`query` and `console` are the commands which are related to nearest neighbor searching for the trained word vectors. + +`query` outputs similar words against a given word using the word vectors which are generated by the above models. + +e.g. `wego query -i word_vector.txt microsoft`: +``` + RANK | WORD | SIMILARITY +-------+-----------+------------- + 1 | hypercard | 0.791492 + 2 | xp | 0.768939 + 3 | software | 0.763369 + 4 | freebsd | 0.761084 + 5 | unix | 0.749563 + 6 | linux | 0.747327 + 7 | ibm | 0.742115 + 8 | windows | 0.731136 + 9 | desktop | 0.715790 + 10 | linspire | 0.711171 +``` + +*wego* does not reproduce the same word vectors across trials because it adopts the HogWild! algorithm, which updates the parameters (in this case the word vectors) asynchronously. -import ( - "os" +`console` starts a REPL mode for calculating the basic arithmetic operations (`+` and `-`) on word vectors.
- "github.com/ynqa/wego/pkg/model/modelutil/vector" - "github.com/ynqa/wego/pkg/model/word2vec" +### Go SDK + +The hyperparameters for the models can be set through functional options. + +```go +model, err := word2vec.New( + word2vec.Window(5), + word2vec.Model(word2vec.Cbow), + word2vec.Optimizer(word2vec.NegativeSampling), + word2vec.NegativeSampleSize(5), + word2vec.Verbose(), ) +``` + +The models have some methods: -func main() { - model, err := word2vec.New( - word2vec.Window(5), - word2vec.Model(word2vec.Cbow), - word2vec.Optimizer(word2vec.NegativeSampling), - word2vec.NegativeSampleSize(5), - word2vec.Verbose(), - ) - if err != nil { - // failed to create word2vec. - } - - input, _ := os.Open("text8") - if err = model.Train(input); err != nil { - // failed to train. - } - - // write word vector. - model.Save(os.Stdin, vector.Agg) +```go +type Model interface { + Train(io.ReadSeeker) error + Save(io.Writer, vector.Type) error + WordVector(vector.Type) *matrix.Matrix } ``` -## Formats +### Formats -As training word vectors *wego* requires file format for inputs/outputs. +To train word vectors, *wego* requires the following file formats for inputs/outputs. -### Input +#### Input Input corpus must be subject to the formats to be divided by space between words like [text8](http://mattmahoney.net/dc/textdata.html). @@ -105,7 +126,7 @@ Input corpus must be subject to the formats to be divided by space between words word1 word2 word3 ... ``` -### Output +#### Output After training *wego* save the word vectors into a txt file with the following format (`N` is the dimension for word vectors you given):