-
Notifications
You must be signed in to change notification settings - Fork 0
/
translate-en-mn.py
37 lines (26 loc) · 1.17 KB
/
translate-en-mn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import ctranslate2
from mosestokenizer import MosesSentenceSplitter, MosesTokenizer
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import codecs
import sentencepiece as sp
# en -> mn translation demo script.
# Pipeline: split the input paragraph into sentences, segment each sentence
# into SentencePiece pieces, translate the batch with CTranslate2, then turn
# the translated pieces back into plain text and clean it up.

## sentencepiece processor
sp_enmn_nits = sp.SentencePieceProcessor(model_file='train.model')

## Translator
translator_enmn_nits = ctranslate2.Translator(
    "model_deploy",
    # compute_type="int8",
    inter_threads=4,
    intra_threads=1,
)

paras = "This is a test sentence. Another sentence."

# Split Sentences
# NOTE(review): the input is English but the splitter is invoked with the
# language code "hi" -- confirm this is intentional (MosesSentenceSplitter is
# imported at the top of the file but never used).
inp_lines = sentence_tokenize.sentence_split(paras, "hi")

# Apply sentencepiece (one list of pieces per sentence)
inp_lines = sp_enmn_nits.encode_as_pieces(inp_lines)

# Translate
out_lines = translator_enmn_nits.translate_batch(inp_lines, beam_size=5, max_batch_size=16)
# Keep only the first hypothesis returned for each input sentence.
out_lines = [result.hypotheses[0] for result in out_lines]

# Remove sentencepiece
# chr(9601) is U+2581, the SentencePiece word-boundary marker; any marker
# still present after decoding is turned into a plain space.
out_lines = [sp_enmn_nits.decode(out_line).replace(chr(9601), " ") for out_line in out_lines]

# Post Processing
# Drop double quotes and zero-width joiners (U+200D), then trim whitespace.
# The escape must be "\u200d": the bare text "u200d" would only match those
# five literal characters and leave the invisible joiner in the output.
out_lines = [line.replace('"', '').replace("\u200d", "").strip() for line in out_lines]

print(out_lines)