-
Notifications
You must be signed in to change notification settings - Fork 0
/
intertext2tsv.py
79 lines (67 loc) · 2.54 KB
/
intertext2tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import shutil
import argparse
from xml.etree.ElementTree import parse
def main():
    """Convert every Intertext alignment in the input directory to TSV.

    Command-line options:
        -i/--input   Directory holding the alignment XML files together with
                     the source and target documents they refer to.
        -o/--output  Directory that receives one TSV file per alignment.
                     An existing directory is removed and recreated first.

    Exits through ``parser.error`` when both options resolve to the same
    directory, because recreating the output directory would delete the
    input files before they are read.
    """
    parser = argparse.ArgumentParser(description='Convert Intertext to TSV')
    parser.add_argument('-i', '--input', type=str, required=True, help='Input directory for Intertext alignments.')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output directory for TSV files.')
    args = parser.parse_args()

    # make_dir() deletes an existing output directory, so refuse to run when
    # that directory is the input itself.
    if os.path.realpath(args.input) == os.path.realpath(args.output):
        parser.error('input and output must be different directories')

    make_dir(args.output)

    input_files = get_input_files(args.input)
    for src_file, tgt_file, align_file in input_files:
        # Swap only the final extension; str.replace('xml', 'tsv') would also
        # rewrite an "xml" occurring earlier in the file name.
        out_file = os.path.splitext(align_file)[0] + '.tsv'
        print("Converting {} to {} ...".format(align_file, out_file))
        src_sents = get_sents(os.path.join(args.input, src_file))
        tgt_sents = get_sents(os.path.join(args.input, tgt_file))
        alignments = get_alignments(os.path.join(args.input, align_file))
        write_tsv(src_sents, tgt_sents, alignments, os.path.join(args.output, out_file))
def write_tsv(src_sents, tgt_sents, alignments, out_file):
    """Write aligned sentence pairs to a tab-separated file.

    Args:
        src_sents: List of source sentences.
        tgt_sents: List of target sentences.
        alignments: Iterable of ``(src_idx, tgt_idx)`` pairs, each element
            being a list of sentence indices (possibly empty).
        out_file: Path of the UTF-8 file to create or overwrite.

    Each alignment becomes one ``source<TAB>target`` line; lines are joined
    with a newline and no trailing newline is written.
    """
    rows = [
        "\t".join((
            find_sent_by_id(src_idx, src_sents),
            find_sent_by_id(tgt_idx, tgt_sents),
        ))
        for src_idx, tgt_idx in alignments
    ]
    # All rows are built before the file is opened, so a lookup failure
    # does not leave a partially written file behind.
    with open(out_file, 'wt', encoding="utf-8") as handle:
        handle.write("\n".join(rows))
def find_sent_by_id(idx, sents):
    """Return the sentences covered by an index list as one string.

    Args:
        idx: List of sentence indices; only the first and last entries are
            used, as the bounds of an inclusive range.
        sents: List of sentence strings.

    Returns:
        The sentences from ``idx[0]`` through ``idx[-1]`` joined with single
        spaces, or ``''`` when ``idx`` is empty.
    """
    if len(idx) == 0:
        return ''
    first, last = idx[0], idx[-1]
    covered = sents[first:last + 1]
    return ' '.join(covered)
def get_alignments(file):
    """Read the alignment links from an alignment XML file.

    Args:
        file: Path (or file object) of the XML document to parse.

    Returns:
        A list of ``(src_bead, tgt_bead)`` tuples, one per ``link`` element
        directly under the root, each bead being the list produced by
        ``parse_link``.

    Raises:
        ValueError: If an ``xtargets`` value does not contain exactly one
            ``;`` separator.
    """
    tree = parse(file)
    beads = []
    for element in tree.iterfind('link'):
        # The part before ';' is treated as the target side and the part
        # after it as the source side.
        tgt_part, src_part = element.get('xtargets').split(';')
        src_bead = parse_link(src_part)
        tgt_bead = parse_link(tgt_part)
        beads.append((src_bead, tgt_bead))
    return beads
def parse_link(link):
    """Turn one side of an ``xtargets`` value into zero-based indices.

    Args:
        link: Whitespace-separated ids of the form ``"<prefix>:<number>"``,
            e.g. ``"1:3 1:4"``; may be empty.

    Returns:
        A list holding ``number - 1`` for every id, empty when there are
        no ids.
    """
    # NOTE(review): the part before ':' is ignored, which presumably assumes
    # the number after ':' is unique across the document - confirm.
    # split() without an argument tolerates leading, trailing and repeated
    # blanks; split(' ') would yield empty items that have no ':' part.
    return [int(item.split(':')[1]) - 1 for item in link.split()]
def get_sents(file):
    """Read the sentences of an XML document.

    Args:
        file: Path (or file object) of an XML document whose root contains
            ``<p>`` elements with ``<s>`` children.

    Returns:
        A list with the text of every ``p/s`` element in document order.
        Text nested in inline child elements is included, and an empty
        sentence yields ``''`` rather than ``None``.
    """
    doc = parse(file)
    # Element.text is None for an empty element and stops at the first child
    # element; itertext() gathers the complete textual content instead.
    return [''.join(sent.itertext()) for sent in doc.iterfind('p/s')]
def get_input_files(dir):
    """List the file triples to convert from a directory.

    A directory entry counts as an alignment file when its name splits into
    exactly four dot-separated parts ``prj.src.tgt.suffix``. The matching
    document names are derived as ``prj.src.suffix`` and ``prj.tgt.suffix``;
    their existence is not checked.

    Args:
        dir: Directory to scan.

    Returns:
        A list of ``[src_file, tgt_file, align_file]`` lists (bare file
        names), in the order ``os.listdir`` reports the entries.
    """
    triples = []
    for entry in os.listdir(dir):
        parts = entry.split('.')
        if len(parts) != 4:
            continue
        project, src_tag, tgt_tag, ext = parts
        triples.append([
            '.'.join((project, src_tag, ext)),
            '.'.join((project, tgt_tag, ext)),
            entry,
        ])
    return triples
def make_dir(dir):
    """Create ``dir`` as an empty directory.

    An existing directory at that path is deleted together with all of its
    contents before being created again.

    Args:
        dir: Path of the directory to (re)create.
    """
    already_present = os.path.isdir(dir)
    if already_present:
        shutil.rmtree(dir)
    os.makedirs(dir, exist_ok=True)
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()