-
Notifications
You must be signed in to change notification settings - Fork 1
/
extraction.py
61 lines (51 loc) · 1.37 KB
/
extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
# Input and output locations. The extraction loop further down reads
# `<org_path>/<language code>/` and writes `<des_path>/<language>.txt`.
org_path = './europarl/txt'
des_path = './dataset'
print('Default writting directory:', des_path)

# Probe the corpus directory up front so that a missing download yields a
# short hint instead of a traceback later in the run.
try:
    lst = os.listdir(org_path)
except OSError:
    # Only filesystem failures (missing path, not a directory, no
    # permission) are treated as "dataset not found". A bare `except:`
    # would also swallow KeyboardInterrupt and genuine programming errors.
    print("Dataset Not Found")
    print('Kindly Download Dataset First')
    print('http://www.statmt.org/europarl/')
    # `exit()` is a convenience injected by the `site` module for the
    # interactive shell and may be absent (e.g. `python -S`). Raising
    # SystemExit always works, and the non-zero status tells calling
    # scripts that nothing was extracted.
    raise SystemExit(1)
# Two-letter corpus directory codes paired with the language name that is
# used as the output file stem. Insertion order is the processing order.
_LANGUAGE_PAIRS = (
    ('fi', 'finnish'),
    ('bg', 'bulgarian'),
    ('da', 'danish'),
    ('sk', 'slovak'),
    ('lt', 'lithuanian'),
    ('en', 'english'),
    ('sl', 'slovenian'),
    ('pl', 'polish'),
    ('lv', 'latvian'),
    ('pt', 'portuguese'),
    ('ro', 'romanian'),
    ('de', 'german'),
    ('es', 'spanish'),
    ('nl', 'dutch'),
    ('cs', 'czech'),
    ('sv', 'swedish'),
    ('et', 'estonian'),
    ('el', 'greek'),
    ('hu', 'hungarian'),
    ('it', 'italian'),
    ('fr', 'french'),
)
language = dict(_LANGUAGE_PAIRS)
# Soft cap, in characters, on the text collected per language. Files are
# appended whole, so the written sample may overshoot this value.
default_char_length = 100_000
print('Warning: Default Char Length set on', default_char_length)

# Nothing earlier creates the destination folder; without this the first
# write fails with FileNotFoundError on a fresh checkout.
os.makedirs(des_path, exist_ok=True)

for code, lang in language.items():
    from_path = os.path.join(org_path, code)
    to_path = os.path.join(des_path, lang + '.txt')

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # selected sample reproducible across runs and machines.
    dirs = sorted(os.listdir(from_path))

    # Collect pieces in a list and join once instead of repeatedly
    # re-allocating a growing string.
    pieces = []
    collected = 0
    for file in dirs:
        path = os.path.join(from_path, file)
        # Be explicit about the encoding: the platform default (e.g. a
        # legacy code page on Windows) would mis-decode or reject
        # non-ASCII text.
        # NOTE(review): assumes the corpus files are UTF-8 - confirm for
        # the Europarl release in use.
        with open(path, encoding='utf-8') as handle:
            output = handle.read()
        pieces.append(output)
        collected += len(output)
        if collected >= default_char_length:
            break

    template = ''.join(pieces)
    # Write with the same explicit encoding so the output does not depend
    # on the machine's locale either.
    with open(to_path, 'w', encoding='utf-8') as handle:
        handle.write(template)
    print('Written for', lang,':' ,len(template))