-
Notifications
You must be signed in to change notification settings - Fork 2
/
tfidf.go
105 lines (86 loc) · 2.29 KB
/
tfidf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// Package tfidf is a lingo-friendly TF-IDF library.
package tfidf
import (
"math"
"sync"
"github.com/xtgo/set"
)
// ScoreFn is the signature of a document-scoring function: it receives the
// TFIDF state and a document, and returns a slice of scores for that document.
type ScoreFn func(tf *TFIDF, doc Document) []float64
// TFIDF is a structure holding the relevant state information about TF/IDF.
// It embeds a sync.Mutex, which Add and CalculateIDF lock while they modify
// the maps and counters below.
type TFIDF struct {
// TF maps a term ID to a running count. Add increments the count once per
// added document that contains the term (Add deduplicates the document's
// IDs first), so it counts documents rather than raw occurrences.
TF map[int]float64 `json:"tf"`
// IDF maps a term ID to its inverse document frequency, as filled in by
// CalculateIDF: log(Docs / TF[term]).
IDF map[int]float64 `json:"idf"`
// Docs is the count of documents added so far.
Docs int `json:"docs"`
// Len is the total length of docs; each added document contributes its
// number of unique term IDs.
Len int `json:"len"`
// Mutex guards the fields above during Add and CalculateIDF.
sync.Mutex
}
// Document is a representation of a document: anything that can present
// itself as a sequence of integer term IDs.
type Document interface {
// IDs returns the term IDs that make up the document. TF and Score index
// into the returned slice by position and count repeated IDs; BOW copies
// the slice before deduplicating it.
IDs() []int
}
// New returns a pointer to an empty TFIDF whose TF and IDF maps are already
// allocated, so it can be passed straight to Add. Docs and Len start at zero.
func New() *TFIDF {
	state := new(TFIDF)
	state.TF = map[int]float64{}
	state.IDF = map[int]float64{}
	return state
}
// Add adds a document to the state. The document is first reduced to its
// unique word IDs with BOW, before the mutex is taken. Then, with the mutex
// held, the TF counter of every unique ID is incremented by one, Docs grows
// by one and Len grows by the number of unique IDs.
func (tf *TFIDF) Add(doc Document) {
	unique := BOW(doc)

	tf.Lock()
	for i := range unique {
		tf.TF[unique[i]]++
	}
	tf.Docs++
	// Only unique words are counted towards the length, not the raw document size.
	tf.Len += len(unique)
	tf.Unlock()
}
// CalculateIDF calculates the inverse document frequency of every term
// recorded in TF, storing IDF[t] = log(Docs / TF[t]) (natural logarithm).
//
// Both the document count and the TF map are read while the mutex is held.
// Add mutates Docs and TF under the same mutex, so reading Docs before
// locking would be a data race and could pair a stale document count with
// newer TF counts.
func (tf *TFIDF) CalculateIDF() {
	tf.Lock()
	defer tf.Unlock()

	docs := float64(tf.Docs) // must be read under the lock, see above
	for t, f := range tf.TF {
		tf.IDF[t] = math.Log(docs / f)
	}
}
// TF calculates the term frequencies of a document. For every position in
// doc.IDs() it reports how many times the ID at that position occurs in the
// whole document, so the result has the same length and order as doc.IDs().
// Repeated IDs are kept (the document is not turned into a unique bag of
// words); each occurrence simply carries the same count. This is useful for
// scoring functions.
func TF(doc Document) []float64 {
	terms := doc.IDs()

	// First pass: tally the occurrences of each term ID.
	counts := make(map[int]float64)
	for i := range terms {
		counts[terms[i]]++
	}

	// Second pass: project the tallies back onto the original positions.
	freqs := make([]float64, len(terms))
	for i := range freqs {
		freqs[i] = counts[terms[i]]
	}
	return freqs
}
// BOW turns a document into a bag of words. The document's IDs are copied
// first, so the slice returned by doc.IDs() is left untouched, and the copy
// is handed to set.Ints. The deduplicated list of word IDs that set.Ints
// returns is the result.
func BOW(doc Document) []int {
	src := doc.IDs()
	words := append(make([]int, 0, len(src)), src...)
	return set.Ints(words)
}
// Score calculates the TFIDF score (TF * IDF) for the document without adding
// the document to the tracked document count.
//
// The result has one entry per position in doc.IDs(): the count of that ID in
// the document (from TF), divided by the document length, multiplied by the
// stored IDF of the ID. IDs with no entry in IDF score 0. Score reads IDF
// without taking the mutex.
//
// This function is only useful for a handful of cases. It's recommended you
// write your own scoring functions.
func (tf *TFIDF) Score(doc Document) []float64 {
	terms := doc.IDs()
	scores := TF(doc) // raw per-position counts, overwritten in place below
	total := float64(len(terms))
	for i := range scores {
		scores[i] = (scores[i] / total) * tf.IDF[terms[i]]
	}
	return scores
}