-
Notifications
You must be signed in to change notification settings - Fork 7
/
Makefile
187 lines (155 loc) · 6.29 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# base URL used when linking individual corpus releases in the markdown overviews
URLBASE := https://github.com/Helsinki-NLP/OPUS/blob/main/corpus

# generated release tables (one master table plus several sorted views)
TSV_FILES := $(addprefix info/,RELEASES.tsv RELEASE_LICENSES.tsv \
	RELEASE_HISTORY.tsv RELEASE_SIZE.tsv RELEASE_NR_OF_LANGUAGES.tsv)

# generated markdown overviews
MD_FILES := $(addprefix info/,RELEASES.md RELEASES-without-ELRC.md)
## all:  regenerate every overview file, run the commit target and store the
##       output of the cleanup dry run in untracked-files.txt
## info: only regenerate the overview files
##
## Both are commands rather than files; `info` in particular shares its name
## with the info/ directory that holds the generated files, so declare them
## phony to keep make from ever treating them as on-disk targets.
.PHONY: all info

all: ${MD_FILES} ${TSV_FILES}
	${MAKE} commit
	${MAKE} cleanup-dry-run > untracked-files.txt

info: ${MD_FILES} ${TSV_FILES}
## remove untracked files
## (all released data that we don't store in the repository)
##
## cleanup-dry-run: print the commit command that `cleanup` would run and
##                  list what `git clean` would delete (-n = no changes made)
## cleanup:         run the commit target first, then really delete all
##                  untracked files and directories (-d -f) -- destructive!
.PHONY: cleanup-dry-run cleanup

cleanup-dry-run:
	echo "${MAKE} commit"
	git clean -d -n

cleanup:
	${MAKE} commit
	git clean -d -f
# create a TSV file with essential release information
#
# For each of seven info.yaml fields (release date, license, alignments,
# sentences, tokens, number of languages, number of language pairs) the
# recipe
#   1. greps the field from every info.yaml found at least three levels
#      below corpus/,
#   2. keeps path components 2 and 3 of the matching file name as the row
#      key (split later at the first '/' into the name and release columns),
#   3. keeps the text after the field name as the value.
# Release dates are normalised by turning every value into a
# `date +%F --date="..."` command, running the generated script, and mapping
# the placeholder 1900-01-01 back to 'unknown'.
# License values get leading blanks stripped, tabs replaced by spaces and
# <...> markup removed.
# The per-field tables are then combined with full outer joins
# (-a1 -a2, missing values filled with 'unknown' via -e), a header line is
# written, and all temporary files matching $@.* are removed.
#
# NOTE(review): every `[email protected]` token below looks like a mangled
# `$@.<suffix>` temporary file name (compare the final `rm -f $@.*`); the
# original suffixes cannot be recovered from this text -- restore them from
# the repository before running this rule.
# NOTE(review): the first join already expects a three-column left input,
# so the steps that pair keys with values appear to be missing here -- TODO
# confirm against the original file.
info/RELEASES.tsv: corpus
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'release date:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3- -d: [email protected] | \
sed 's/unknown/1900-01-01/' |\
sed 's/^ /date +%F --date="/;s/$$/"/' > [email protected]
chmod +x [email protected]
./[email protected] | sed 's/1900-01-01/unknown/' > [email protected]
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'license:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3- -d: [email protected] | sed 's/^ *//' | tr "\t" ' ' | sed 's/<[^>]*>//g' > [email protected]
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'alignments:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3 -d: [email protected] | sed 's/^ *//' > [email protected]
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'sentences:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3 -d: [email protected] | sed 's/^ *//' > [email protected]
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'tokens:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3 -d: [email protected] | sed 's/^ *//' > [email protected]
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'number of languages:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3 -d: [email protected] | sed 's/^ *//' > [email protected]
find corpus/ -mindepth 3 -name info.yaml | xargs grep 'number of language pairs:' > [email protected]
cut -f1 -d: [email protected] | cut -f2,3 -d/ > [email protected]
cut -f3 -d: [email protected] | sed 's/^ *//' > [email protected]
join -t' ' -e unknown -a1 -a2 -o 0,1.2,1.3,2.2 [email protected] [email protected] > [email protected]
join -t' ' -e unknown -a1 -a2 -o 0,1.2,1.3,1.4,2.2 [email protected] [email protected] > [email protected]
join -t' ' -e unknown -a1 -a2 -o 0,1.2,1.3,1.4,1.5,2.2 [email protected] [email protected] > [email protected]
join -t' ' -e unknown -a1 -a2 -o 0,1.2,1.3,1.4,1.5,1.6,2.2 [email protected] [email protected] > [email protected]
join -t' ' -e unknown -a1 -a2 -o 0,1.2,1.3,1.4,1.5,1.6,1.7,2.2 [email protected] [email protected] > [email protected]
echo 'name release release date alignments sentences tokens languages language pairs license' > $@
sed 's#\([^/]*\)/#\1 #' < [email protected] >> $@
rm -f $@.*
# view of info/RELEASES.tsv ordered by column 4: the header line is copied
# unchanged, the data rows are sorted.
# The TAB field separator is generated with printf instead of being embedded
# as an invisible literal that is easily turned into a space by editors.
# NOTE(review): the header written by the info/RELEASES.tsv rule lists
# 'license' as the last column, not the 4th -- confirm the actual column
# order of the data rows and adjust the key if necessary.
info/RELEASE_LICENSES.tsv: info/RELEASES.tsv
	head -n 1 $< > $@
	tail -n +2 $< | sort -t "$$(printf '\t')" -k4,4 >> $@
# view of info/RELEASES.tsv ordered by column 3 in reverse order
# ('release date' according to the header line, i.e. newest first):
# the header line is copied unchanged, the data rows are sorted.
# The TAB field separator is generated with printf instead of being embedded
# as an invisible literal that is easily turned into a space by editors.
info/RELEASE_HISTORY.tsv: info/RELEASES.tsv
	head -n 1 $< > $@
	tail -n +2 $< | sort -t "$$(printf '\t')" -k3,3r >> $@
# view of info/RELEASES.tsv ordered numerically (largest first) by columns
# 5, 6 and 7: the header line is copied unchanged, the data rows are sorted.
# The TAB field separator is generated with printf instead of being embedded
# as an invisible literal that is easily turned into a space by editors.
# NOTE(review): according to the header written by the info/RELEASES.tsv
# rule, columns 5-7 are sentences, tokens and languages; if the intended
# keys are alignments, sentences and tokens they are off by one -- confirm
# the actual column order of the data rows.
info/RELEASE_SIZE.tsv: info/RELEASES.tsv
	head -n 1 $< > $@
	tail -n +2 $< | sort -t "$$(printf '\t')" -k5,5nr -k6,6nr -k7,7nr >> $@
# view of info/RELEASES.tsv ordered numerically (largest first) by columns
# 8 and 9: the header line is copied unchanged, the data rows are sorted.
# The TAB field separator is generated with printf instead of being embedded
# as an invisible literal that is easily turned into a space by editors.
# NOTE(review): according to the header written by the info/RELEASES.tsv
# rule, columns 8 and 9 are 'language pairs' and 'license'; if the intended
# keys are languages and language pairs they are off by one -- confirm the
# actual column order of the data rows.
info/RELEASE_NR_OF_LANGUAGES.tsv: info/RELEASES.tsv
	head -n 1 $< > $@
	tail -n +2 $< | sort -t "$$(printf '\t')" -k8,8nr -k9,9nr >> $@
# markdown overview of all corpora and their releases
#
# One table row per directory directly below corpus/ (sorted by name).
# The corpus name links to the 'website:' entry of corpus/<name>/info.yaml
# when that file exists and provides one; otherwise the plain name is
# printed (previously a missing entry produced a broken "[name]()" link).
# Every sub-directory of the corpus is listed as a release linking to
# ${URLBASE}/<name>/<release>.
# printf is used instead of `echo -n`, whose behaviour differs between shells.
info/RELEASES.md: corpus
	{ \
	  echo "# List of corpus releases"; \
	  echo ""; \
	  echo "* [list of releases without ELRC](RELEASES-without-ELRC.md)"; \
	  echo ""; \
	  echo "| corpus | releases | "; \
	  echo "|--------|----------| "; \
	  for c in `find corpus -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | sort`; do \
	    w=''; \
	    if [ -e corpus/$$c/info.yaml ]; then \
	      w=`grep '^website:' corpus/$$c/info.yaml | head -n 1 | cut -f2 -d' '`; \
	    fi; \
	    if [ -n "$$w" ]; then \
	      printf '| [%s](%s) | ' "$$c" "$$w"; \
	    else \
	      printf '| %s | ' "$$c"; \
	    fi; \
	    for v in `find corpus/$$c -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | sort`; do \
	      printf '[%s](%s/%s/%s) ' "$$v" "${URLBASE}" "$$c" "$$v"; \
	    done; \
	    echo " |"; \
	  done; \
	} > $@
# markdown overview of corpora and their releases, skipping every corpus
# whose directory name contains ELRC or ELRA
#
# One table row per remaining directory directly below corpus/ (sorted by
# name). The corpus name links to the 'website:' entry of
# corpus/<name>/info.yaml when that file exists and provides one; otherwise
# the plain name is printed (previously a missing entry produced a broken
# "[name]()" link). Every sub-directory of the corpus is listed as a release
# linking to ${URLBASE}/<name>/<release>.
# printf is used instead of `echo -n`, whose behaviour differs between shells.
info/RELEASES-without-ELRC.md: corpus
	{ \
	  echo "# List of corpus releases (without ELRC)"; \
	  echo ""; \
	  echo "* [complete list of releases](RELEASES.md)"; \
	  echo ""; \
	  echo "| corpus | releases | "; \
	  echo "|--------|----------| "; \
	  for c in `find corpus -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | grep -v ELRC | grep -v ELRA | sort`; do \
	    w=''; \
	    if [ -e corpus/$$c/info.yaml ]; then \
	      w=`grep '^website:' corpus/$$c/info.yaml | head -n 1 | cut -f2 -d' '`; \
	    fi; \
	    if [ -n "$$w" ]; then \
	      printf '| [%s](%s) | ' "$$c" "$$w"; \
	    else \
	      printf '| %s | ' "$$c"; \
	    fi; \
	    for v in `find corpus/$$c -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | sort`; do \
	      printf '[%s](%s/%s/%s) ' "$$v" "${URLBASE}" "$$c" "$$v"; \
	    done; \
	    echo " |"; \
	  done; \
	} > $@
## commit all data in the current directory
## (adding files that we want to keep etc)
##
## Stages the metadata files found below corpus/ (README, LICENSE, INFO,
## CONTACT*, CITATION*, *.tsv, *.yaml and, directly inside each corpus
## directory, *.txt, *.info and the .released*/.uploaded* flag files) plus
## the top-level markdown files, then commits everything that is tracked.
##
## Fixed: the CITATION* line used to run `git git add`, which always fails;
## because find ignores the exit status of `-exec ... \;` the error went
## unnoticed and CITATION files were never staged.
.PHONY: commit

commit:
	find corpus -name 'README' | xargs git add
	find corpus -name 'LICENSE' | xargs git add
	find corpus -name 'INFO' -exec git add {} \;
	find corpus -name 'CONTACT*' -exec git add {} \;
	find corpus -name 'CITATION*' -exec git add {} \;
	find corpus -name '*.tsv' | xargs -n 1000 git add
	find corpus -name '*.yaml' | xargs -n 1000 git add
	find corpus -mindepth 2 -maxdepth 2 -name '*.txt' | xargs -n 1000 git add
	find corpus -mindepth 2 -maxdepth 2 -name '*.info' | xargs -n 1000 git add
	find corpus -mindepth 2 -maxdepth 2 -name '.released*' | xargs -n 1000 git add
	find corpus -mindepth 2 -maxdepth 2 -name '.uploaded*' | xargs -n 1000 git add
	git add *.md
	git commit -am 'corpus update'
# every directory exactly two levels below corpus/ (corpus/<name>/<release>).
# Kept as a recursively expanded (=) variable: the find command only runs
# when a recipe that references the variable is actually expanded.
RELEASED_VERSIONS = $(shell find corpus -maxdepth 2 -mindepth 2 -type d)

# report every release directory corpus/<name>/<release> for which the flag
# file corpus/<name>/.released-<release> does not exist
.PHONY: check-release-flags

check-release-flags:
	@for r in ${RELEASED_VERSIONS}; do \
	  d=`dirname $$r`; \
	  v=`basename $$r`; \
	  if [ ! -e "$$d/.released-$$v" ]; then \
	    echo "no release flag for $$r ($$d/.released-$$v)"; \
	  fi; \
	done
# report every release directory corpus/<name>/<release> for which the flag
# file corpus/<name>/.uploaded-<release> does not exist
.PHONY: check-upload-flags

check-upload-flags:
	@for r in ${RELEASED_VERSIONS}; do \
	  d=`dirname $$r`; \
	  v=`basename $$r`; \
	  if [ ! -e "$$d/.uploaded-$$v" ]; then \
	    echo "no upload flag for $$r ($$d/.uploaded-$$v)"; \
	  fi; \
	done
# report every release directory corpus/<name>/<release> that has neither a
# .released-<release> nor an .uploaded-<release> flag file in corpus/<name>/
# (message typo "no release" corrected to "nor release")
.PHONY: check-both-flags

check-both-flags:
	@for r in ${RELEASED_VERSIONS}; do \
	  d=`dirname $$r`; \
	  v=`basename $$r`; \
	  if [ ! -e "$$d/.released-$$v" ] && [ ! -e "$$d/.uploaded-$$v" ]; then \
	    echo "neither upload nor release flag for $$r"; \
	  fi; \
	done