forked from alainrk/quick-match
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
184 lines (156 loc) · 5.62 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
'use strict'
const { distance } = require('fastest-levenshtein')
const dice = require('fast-dice-coefficient')
const Ajv = require('ajv')
const { Result } = require('./result')
const Stemming = require('./stemming')
const candidatesSchema = require('./schema/candidates.json')
const optionsValidator = require('./schema/options.json')
// Single-character word separators: any whitespace, or one of , . ; ! : ' ? | - ( )
const WORD_SPLITTER_REGEX = /[\s,.;!:'?|\-()]/

// Maps the value of the `algorithm` option to the scoring function it selects.
const DISTANCE_ALGORITHMS = {
  dice,
  levenshtein: distance
}
/**
 * Intersection between two collections (arrays and/or sets).
 *
 * The elements of `first` that are also contained in `second` are returned,
 * in the iteration order of `first`. Duplicates in an array `first` are kept.
 *
 * @param {Set<string>|string[]} first - Collection whose elements are filtered.
 * @param {Set<string>|string[]} second - Collection used for membership checks.
 * @returns {string[]} Elements of `first` that `second` also contains.
 */
const intersect = (first, second) => {
  const items = first instanceof Set ? Array.from(first) : first
  // Membership is always tested against a Set, built once when needed.
  const lookup = second instanceof Set ? second : new Set(second)
  return items.filter(item => lookup.has(item))
}
/**
 * Matches a free-text input against a list of candidates.
 *
 * A run first tries to interpret the text as a reference to a candidate by
 * position (digit, ordinal word or cardinal word). If that does not match,
 * every candidate is scored with the configured string algorithm and,
 * optionally, stem intersections between text and candidates are recorded.
 */
class QuickMatch {
  /**
   * @param {object} [options] - Configuration, validated against
   *   `./schema/options.json`. The validator runs with `useDefaults`, so
   *   schema defaults are written into this very object.
   * @throws {Error} If `options` does not satisfy the options schema.
   */
  constructor (options = {}) {
    const ajv = new Ajv()
    this.candidatesValidator = ajv.compile(candidatesSchema)
    this.options = this.initOptions(options)
    this.algorithm = DISTANCE_ALGORITHMS[this.options.algorithm]

    const { maxDigit, cardinals, ordinals } = this.options.numbers
    // Digit strings '1' .. String(maxDigit), i.e. 1-based candidate positions.
    this.digits = Array.from({ length: maxDigit }, (_, i) => String(i + 1))
    this.digitsSet = new Set(this.digits)
    this.cardinalsSet = new Set(cardinals)
    this.ordinalsSet = new Set(ordinals)
    this.stemming = new Stemming(this.options.stemming.language)
  }

  /**
   * Validate `options` against the options schema and apply schema defaults.
   *
   * Note: the module-level `optionsValidator` is the JSON schema itself; the
   * compiled validation function is stored on `this.optionsValidator`.
   *
   * @param {object} options - Options object; modified in place with defaults.
   * @returns {object} The same `options` object that was passed in.
   * @throws {Error} If validation fails.
   */
  initOptions (options) {
    const ajv = new Ajv({ useDefaults: true }) // Apply defaults in schema
    ajv.addSchema(optionsValidator)
    this.optionsValidator = ajv.compile(optionsValidator)
    if (!this.optionsValidator(options)) throw new Error('Options is not in a valid format.')
    return options
  }

  /**
   * Bring every candidate into the `{ text, keywords }` shape.
   *
   * @param {Array<string|{text: string, keywords?: string[]}>} candidates
   * @returns {Array<{text: string, keywords: string[]}>} New array of new
   *   objects; a plain string becomes `{ text, keywords: [] }`.
   */
  normalizeCandidates (candidates) {
    return candidates.map(candidate => {
      if (typeof candidate === 'string') {
        return { text: candidate, keywords: [] }
      }
      return { text: candidate.text, keywords: candidate.keywords || [] }
    })
  }

  /**
   * Score `text` against every candidate with the configured algorithm and
   * report each score through `result.setCandidateScore(index, score)`.
   *
   * When `options.enableAlgorithmOnKeywords` is set, each keyword is scored
   * and reported as well, under the index of its candidate (presumably the
   * result keeps the best score per candidate - verify against `./result`).
   *
   * @param {string} text - Normalized input text.
   * @param {Array<{text: string, keywords: string[]}>} candidates
   * @param {Result} result - Collector receiving the scores.
   */
  applyAlgorithm (text, candidates, result) {
    candidates.forEach((candidate, index) => {
      // Main candidate text
      result.setCandidateScore(index, this.algorithm(text, candidate.text))

      if (!this.options.enableAlgorithmOnKeywords) return
      for (const keyword of candidate.keywords) {
        result.setCandidateScore(index, this.algorithm(text, keyword))
      }
    })
  }

  /**
   * Try to resolve `text` as a positional reference to a candidate.
   *
   * Checks, in order: the whole text being a known digit string, any word
   * being a configured ordinal, any word being a configured cardinal. The
   * first kind that yields a hit decides the outcome; a hit pointing past the
   * end of `candidates` ends the search with `false`.
   *
   * @param {string} text - Normalized input text.
   * @param {Array} candidates - Normalized candidates (only the length is used).
   * @param {Result} result - Receives `setNumberMatch(kind, index)` on a match,
   *   where `kind` is 'digit', 'ordinal' or 'cardinal' and `index` is 0-based.
   * @returns {boolean} `true` if a number match was recorded, else `false`.
   */
  applyMatchNumber (text, candidates, result) {
    const { numbers } = this.options

    // split() emits empty strings for trailing or consecutive separators
    // ("second!" -> ['second', '']); those are not words and must not count
    // against the word limit.
    const words = text.split(WORD_SPLITTER_REGEX).filter(word => word.length > 0)
    if (words.length > numbers.maxWordsEnablingNumbers) return false

    if (numbers.enableDigits && this.digitsSet.has(text)) {
      const idx = Number.parseInt(text, 10) - 1
      if (idx >= candidates.length) return false
      result.setNumberMatch('digit', idx)
      return true
    }

    const wordKinds = [
      { kind: 'ordinal', enabled: numbers.enableOrdinals, set: this.ordinalsSet, list: numbers.ordinals },
      { kind: 'cardinal', enabled: numbers.enableCardinals, set: this.cardinalsSet, list: numbers.cardinals }
    ]
    for (const { kind, enabled, set, list } of wordKinds) {
      if (!enabled) continue
      const hits = intersect(words, set)
      if (hits.length === 0) continue
      // Position of the word in the configured list is the candidate index.
      const idx = list.indexOf(hits[0])
      if (idx < 0 || idx >= candidates.length) return false
      result.setNumberMatch(kind, idx)
      return true
    }

    return false
  }

  /**
   * Split a phrase into words and stem them.
   *
   * @param {string} phrase
   * @returns {string[]} Stemmed words, see `arrayToStemmedArray`.
   */
  phraseToStemmedArray (phrase) {
    return this.arrayToStemmedArray(phrase.split(WORD_SPLITTER_REGEX))
  }

  /**
   * Stem a list of words, dropping those that are too short before stemming
   * (`options.stemming.minPreStemmingLength`) or after stemming
   * (`options.stemming.minPostStemmingLength`).
   *
   * @param {string[]} arr - Words to stem.
   * @returns {string[]} Stemmed words that passed both length filters.
   */
  arrayToStemmedArray (arr) {
    const { minPreStemmingLength, minPostStemmingLength } = this.options.stemming
    const longEnough = arr.filter(word => word.length >= minPreStemmingLength)
    return this.stemming.stemArray(longEnough)
      .filter(stem => stem.length >= minPostStemmingLength)
  }

  /**
   * Record the stems of `text`, the stems of every candidate (its text
   * followed by its keywords) and the stems they have in common.
   *
   * @param {string} text - Normalized input text.
   * @param {Array<{text: string, keywords: string[]}>} candidates
   * @param {Result} result - Receives `setStemmedText`, `setStemmedCandidate`
   *   and `setCandidateStemIntersections`.
   */
  applyStemming (text, candidates, result) {
    const stemmedTextArr = this.phraseToStemmedArray(text)
    result.setStemmedText(stemmedTextArr)

    candidates.forEach((candidate, index) => {
      const stemmedCandArr = this.phraseToStemmedArray(candidate.text).concat(
        this.arrayToStemmedArray(candidate.keywords)
      )
      const intersections = intersect(stemmedTextArr, stemmedCandArr)
      result.setStemmedCandidate(index, stemmedCandArr)
      result.setCandidateStemIntersections(index, intersections)
    })
  }

  /**
   * @param {string} text
   * @returns {string} `text` lower-cased with surrounding whitespace removed.
   */
  normalizeText (text) {
    return text.toLowerCase().trim()
  }

  /**
   * Match `text` against `candidates`.
   *
   * @param {string} text - Raw input text; the original is passed to the
   *   result, a normalized copy is used for matching.
   * @param {Array<string|{text: string, keywords?: string[]}>} candidates -
   *   Validated against `./schema/candidates.json`.
   * @returns {*} Whatever `Result#build()` produces.
   * @throws {Error} If `candidates` does not satisfy the candidates schema.
   */
  run (text, candidates) {
    const originalText = text
    text = this.normalizeText(text)

    if (!this.candidatesValidator(candidates)) throw new Error('Candidates has not a valid format!')
    candidates = this.normalizeCandidates(candidates)

    const result = new Result(this.options.algorithm, originalText, candidates)

    // A positional reference ("2", "second", "two") wins over fuzzy matching.
    if (this.applyMatchNumber(text, candidates, result)) {
      return result.build()
    }

    this.applyAlgorithm(text, candidates, result)
    if (this.options.enableStemming) {
      this.applyStemming(text, candidates, result)
    }
    return result.build()
  }
}
module.exports = { QuickMatch }