import re
from pathlib import Path
from regex_patterns import ref_string, hchars, gchars
from datetime import datetime
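
# NB: regex_patterns is a local helper module; based on its use below, ref_string is
# assumed to be a compiled pattern matching verse-reference header lines like 'Exod 35:19'.
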
def patch_morpho(data_dir='source', output_dir='source/patched', silent=False, debug=False):
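    """Corrects known errors in the CATSS morphology (.mlxx) files."""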
    log = ''
    log += datetime.now().__str__() + '\n'
    n_edits = 0

    def report(msg):
        # give feedback
        nonlocal log  # see https://stackoverflow.com/a/8178808/8351428
        log += msg + '\n'
        if not silent:
            print(msg)

    data = Path(data_dir)
    file2lines = {}
    for file in data.glob('*.mlxx'):
        file2lines[file.name] = file.read_text().split('\n')

    # apply select changes
    edits = [
        ('01.Gen.1.mlxx', 12540, 'ADI2P', "KAQI/SATE VA AAD2P I(/ZW KATA"),
        ('05.Num.mlxx', 24859, 'SONTAIVC', "SUGKATAKLHRONOMHQH/SONTAI VC APS2S KLHRONOME/W SUN KATA"),
    ]

    report('\napplying bulk manual edits...\n')
    file = ''
    for edit in edits:
        # unpack data
        file = edit[0] or file
        ln, re_confirm, redaction = edit[1:]
        old_line = file2lines[file][ln]
        # confirm and apply changes, give reports throughout
        if re.findall(re_confirm, old_line):
            file2lines[file][ln] = redaction
            report(f'correction for {file} line {ln}:')
            report(f'\tOLD: {old_line}')
            report(f'\tNEW: {redaction}')
            n_edits += 1
        else:
            if debug:
                raise Exception(f'FOLLOWING EDIT UNCONFIRMED: {edit} at {old_line}')
            report(f'**WARNING: THE FOLLOWING EDIT WAS NOT CONFIRMED**:')
            report(f'\tTARGET: {old_line}')
            report(f'\tEDIT: {edit}')

    # export the corrected files
    report(f'\nwriting patched data to {output_dir}')
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    for file, lines in file2lines.items():
        text = '\n'.join(lines)
        file_path = output_dir.joinpath(file)
        file_path.write_text(text)

    # write changes to a log file
    log_path = output_dir.joinpath('log.txt')
    log_path.write_text(log)

    report('\nDONE with all patches!')
    report(f'\ttotal edits: {n_edits}')


def patch_parallel(data_dir='source', output_dir='source/patched', silent=False, debug=False):
    """Corrects known errors in the CATSS database."""
    log = ''
    log += datetime.now().__str__() + '\n'
    n_edits = 0

    def report(msg):
        # give feedback
        nonlocal log  # see https://stackoverflow.com/a/8178808/8351428
        log += msg + '\n'
        if not silent:
            print(msg)

    data = Path(data_dir)
    file2lines = {}
    for file in data.glob('*.par'):
        file2lines[file.name] = file.read_text().split('\n')

    # -- Manual Edits --
    # manual corrections are loaded into tuples consisting of:
    #     (file, line_number, regex condition, new line)
    # where line numbers refer to the original line numbers in the docs, and
    # the regex condition is a pattern searched for in the line to confirm the
    # change (a safeguard against erroneous changes or against the underlying
    # data having changed). All of the changes are enacted in a large loop.
    # If the filename is left empty, the previous filename is used.
    # NB: line numbers are given as 0-indexed
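    # for example (illustration only), the first tuple below says: in 06.JoshB.par,
    # if re.findall('MRY KAI', lines[983]) finds a match, replace line 983 with the
    # given replacement text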
    edits = [
        ('06.JoshB.par', 983, 'MRY KAI', 'W/)T H/GRG$Y ^ =W/)T W/H/)MRY\t KAI\\ TO\\N AMORRAI=ON '),
        ('', 1366, '\.kb # KAI', 'W/H/KHNYM =W/H/)BNYM .m .kb #\t KAI\\ OI( LI/QOI '),
        ('', 3737, '12 E', 'W/YC+YRW =;W/YC+YDW .rd <9.12>\t E)PESITI/SANTO {d} KAI\\ H(TOIMA/SANTO'),
        ('', 9517, '<19.49> E', "--+ '' =;L/GBWLWT/YHM <19.49>\t E)N TOI=S O(RI/OIS AU)TW=N "),
        ('', 2006, '\t<6.20>\t', '-+ =;H/(YR/H <6.20>\tEI)S TH\\N PO/LIN '),
        ('', 9515, '\t<19\.49>\t', "--+ '' =;M/XLQ <19.49>\tDIAMERI/SAS "),
        ('', 7104, 'RNA.*\t', 'W/DNH =:W/RNH .dr\tKAI\ RENNA'),
        ('', 1659, '----', "M/MCRYM\t--- ''"),
        ('', 4673, '{=51}', "W/YMYT/M\t--- <=51>"),  # Normalize this to a note
        ('', 10235, '{TOU', "--+\tSALAMIN {d} {...TOU= SWTHRI/OU}"),
        ('', 11304, 'A\)PO\|', "M/CPWN\tA)PO\ BORRA= [31] "),
        ('07.JoshA.par', 645, ' \)PO', "M/&M)L\tA)PO\ A)RISTERW=N"),
        ('01.Genesis.par', 9550, "--\+ ' ", "--+ '' =;W/BH <24.14>\tKAI\ E)N TOU/TW|"),
        ('', 9552, "--\+ ' ", "--+ '' =;KY <24.14>\tO(/TI"),
        ('', 9557, '=:ABRHM', "--+ =:)BRHM\tABRAAM"),
        ('', 2316, '--= ', "--+ '' =H/BHMH\tTW=N KTHNW=N"),
        ('', 12939, '\.a', "B/GLL/K =?B/RGL/YK .s <^30.30\tTH=| SH=| ^ EI)SO/DW|"),  # typo: .a for .s
        ('', 10822, '}}', "NG(NW/K\t{...H(MEI=S} {...SE} ^ E)BDELUCA/MEQA"),
        ('17.1Esdras.par', 477, 'CC35\.24', 'W/Y(BYR/HW\tKAI\\ {..^A)PE/STHSAN AU)TO\\N} [cc35.24]'),
        ('', 6514, 'LI.*\t', ")L(ZR =:)LYW(NY\tE)LIWNA=S [e10.31]"),
        ('', 2857, '\[e2 10', "$$ M)WT )RB(YM W/$NYM =+\tE(CAKO/SIOI TESSARA/KONTA O)KTW/ [e2.10]"),
        ('', 772, 'SAS 3', ")$R H$BY(/W\t{...O(RKISQEI\S}{d} E)PIORKH/SAS #"),
        ('', 4525, 'O.I\(', "BNY GLWT/)\tOI( E)K TH=S AI)XMALWSI/AS [e6.16]"),  # remove unknown char
        ('27.Sirach.par', 4843, '{\.\.}', '[..]\tA)PO\\'),
        ('', 3697, '\s\s\s\s\s', "#\tA(MARTWLOU=} [7]}"),
        ('', 16898, ' no id\.', "NSH[..] 4\t--- ''<c - no id.>"),  # put weird note in brackets
        ('', 14099, '{\.\.\.\)', "<<KY>> 12\t{...}"),
        ('11.1Sam.par', 2096, 'O\t', "--+ '' =KPWT\tOI( KARPOI\\"),
        ('', 2097, 'T\t', "--+ '' =;YD/YW\tTW=N XEIRW=N AU)TOU="),
        ('12.2Sam.par', 8592, 'EI\)S\)', "H/&DH =;H/Y(R\t{..pEI)S} TO\\N DRUMO\\N"),
        ('13.1Kings.par', 15936, 'EI\)S}\t', "W/YBW)\tKAI\ EI)SH=LQEN {...EI)S}"),
        ('', 2987, 'GY', "MCRYM\tAI)GU/PTOU [2.46k,10.26a]"),
        ('14.2Kings.par', 4735, '{c}\? ', "YNHG\tE)GE/NETO {c?H)=GEN}"),
        ('40.Isaiah.par', 1855, 'E\t', "B/$LKT =;M$LKT <q1a>\tE)KPE/SH|"),
        ('', 11657, '_', "B/M(LWT\t--- ?"),
        ('', 18586, '\.\.\.TO', "W/L/QDW$\tTO\ A(/GION {d} {..^KAI\ DIA\}{..^TO|N"),
        ('', 11769, '=XWHa,XYY', "YXYW =@XWHa =@XYY\tA)NHGGE/LH {d} {...KAI\ E)CHGEIRA/S}"),
        ('26.Job.par', 2245, 'OU\)}\t', "W/L)\t{..^OU)}DE\\"),
        ('', 2063, '=a', "$DY =@$/DYa\tO( TA\ PA/NTA POIH/SAS"),
        ('', 7441, '{#}', "YMYN\tDECIW=N {---%}"),
        ('', 7927, 'S\.\.\^', "W/T$Q\tEI) DE\ KAI\ {..^EPIQEI\S}{..^E)FI/LHSA}"),
        ('', 7615, '{c\?}', "XMH =?@XSM,@ZMMa [[30:11]]\tFIMOU= {c?QUMOU=}"),
        ('', 7535, 'KRATAI', "B/(CM\t{..^KRATAIA=|}"),
        ('44.Ezekiel.par', 471, 'OU=} MDBR', "MDBR =v\t{...?AU)TOU=} LALOU=NTOS"),
        ('', 18162, '<42\.9\)', "--+ =;L/HNH <42.9>\tDI' AU)TW=N"),
        ('', 20424, '\s\s\s\s\s', "NTNW #\tDE/DONTAI #"),
        ('', 16686, r'XEIR\\', "^^^ ^ =W/B/YD/W\tKAI\ E)N TH=| XEI\R AU)TOU="),
        ('', 8218, '\+RAUS\+', "L/MWG =%vap\tQRAUSQH=|"),
        ('16.2Chron.par', 10095, '\t---$', "MLK\t--- ''"),
        ('', 10096, '\t---$', "B/YRW$LM\t--- ''"),
        ('', 1522, 'W:', "L/YHWH\tTW=| KURI/W|"),
        ('', 3575, '-\.-', ''),  # erase redundant line
        ('', 4093, '{TOU', "W/B/BNYMN\tKAI\ {cTOU=} BENIAMIN"),
        ('02.Exodus.par', 18838, '<40\.9}', "--+ '' {x} =;B/W <40.9>\tAU)TH=S"),
        ('', 3197, '\s\s\s\s\s', "--+ =HW) <sp>\tAU)TO\S"),
        ('04.Num.par', 7479, '<de1\.39\)', "--+ '' =;)$R <de1.39>\tO(/SOI"),
        ('20.Psalms.par', 21382, '{\.1\.d', "W/M/PZ\tKAI\ {..dU(PE\R} TOPA/ZION [118.127]"),
        ('', 8991, '\*YCPYNW\*', "**YCPYNW *YCPWNW\tKAI\ KATAKRU/YOUSIN [55.7]"),
        ('', 21484, 'Y\*', "CR/Y\tOI( E)XQROI/ MOU [118.139]"),
        ('', 7997, 'PROS/', "W/)L\tKAI\ {..dPRO/S} [49.4]"),
        ('', 8968, r'TOUS\\', "DBR/W\tTOU\S LO/GOUS MOU [55.5]"),
        ('23.Prov.par', 89, 'c18\.7\s', 'W/(NQYM <ju8.26 ge41.42 c18.7>\tKAI\ KLOIO\\N XRU/SEON'),
        ('', 3274, 'ER\t', "{...}\tW(/SPER"),
        ('', 3317, '{c} ', "YQB/HW =?@$BQa\tU(POLI/POITO {cU(POLH/NION} AU)TO\\N"),
        ('', 3482, '\^EN\)', "MCWD =MCWR .dr\t{..^E)N} O)XURW/MASIN}"),
        ('', 7090, r'G\\AR', "KY\tGA\R"),
        ('', 8517, r'A\|\(', "$)WL\tA(/|DHS"),
        ('03.Lev.par', 6866, '<sp\^\s', "--+ '' =;B/W <nu19.13> <sp^> #\tE)N AU)TW=|"),
        ('', 12382, '{\.\.\.L\)\t', "W/PSL {...L)}\tOU)DE\ GLUPTA\\"),
        ('41.Jer.par', 4751, '--\t', "H(D {!}-\t--- ''"),
        ('', 4752, '--\t', "H(DTY {!}-\t--- ''"),
        ('05.Deut.par', 11173, 'KI.*\t', "--+ '' =;KY <24.22>\tO(/TI"),
        ('', 13270, 'Deut 28:65', 'Deut 28:64'),
        ('', 13293, '\s\*', "^ W/)BN\t^^^\n\nDeut 28:65"),
        ('', 2297, 'Deut 4:26', 'Deut 4:25'),
        ('', 2316, '\(YD', "\nDeut 4:26\nH(YDTY\tDIAMARTU/ROMAI"),
        ('08.JudgesB.par', 8041, r'N\.\.\.\)T', 'W/TY$N/HW =W/TY$N {...)T $M$WN}\tKAI\ E)KOI/MISEN {...TO\\N SAMYWN}'),
        ('', 7568, '=@a\+', "=@+R)a\tE)KRERIMME/NHN"),
        ('', 8151, ' %vpa', "W/YCXQ =%vpa {d}\tKAI\ E)/PAIZEN {d} {...KAI\ E)RRA/PIZON}"),
        ('30.Amos.par', 603, '\[c', "B/)RC\tTH=S --- {cGH=S}"),
        ('', 751, '\[c', ")$H\tGUMNAI\ {cGUNAI=KES}"),
        ('18.Esther.par', 4779, 'TH=!', "--+ ''\tTH=| TESSARESKAIDEKA/TH|"),
        ('19.Neh.par', 1663, 'MEneN', "K/H/YWM\tW(S SH/MERON"),
        ('', 3198, '{c\?}', "$(R =?(YR\tTH=S PO/LEWS {c?PU/LHS}"),
        ('', 166, '{\*\*\t', "*W/HBW)TY/M **W/HBY)WTY/M {**}\tKAI\ EI)SA/CW AU)TOU\S"),
        ('45.DanielOG.par', 7333, '{\?}', "YMYM\t--- <?>"),
        ('', 2883, 'Q/Q', "(L M$KB/Y ,,a\tE)KA/QEUDON [10]"),
        ('43.Lam.par', 1587, 'A \)', "+M)\tA)KAQA/RTWN"),
    ]

    report('\napplying bulk manual edits...\n')
    file = ''
    for edit in edits:
        # unpack data
        file = edit[0] or file
        ln, re_confirm, redaction = edit[1:]
        old_line = file2lines[file][ln]
        # confirm and apply changes, give reports throughout
        if re.findall(re_confirm, old_line):
            file2lines[file][ln] = redaction
            report(f'correction for {file} line {ln}:')
            report(f'\tOLD: {old_line}')
            report(f'\tNEW: {redaction}')
            n_edits += 1
        else:
            if debug:
                raise Exception(f'FOLLOWING EDIT UNCONFIRMED: {edit} at {old_line}')
            report(f'**WARNING: THE FOLLOWING EDIT WAS NOT CONFIRMED**:')
            report(f'\tTARGET: {old_line}')
            report(f'\tEDIT: {edit}')

    # -- Other Edits --
    report('\nApplying corrections to orphaned / corrupt lines...\n')

    # there is a corruption in the lines for Exod 35:19:
    #
    #     16283 ^ ^^^ =L/$RT {...?H/&RD} # {+} E)N AI(=S LEITOURGH/SOUSIN
    #     16284
    #     16285 Exod 1:10
    #     16286 #
    #     16287
    #     16288 Exod 35:19
    #     16289 --+ E)N AU)TAI=S
    #
    # the interposition of blank lines and the "Exod 1:10" string are not
    # supposed to be there, and they interrupt the data-lines for Exod 35:19
    # these incorrect lines will be removed; the extra Exod 35:19 heading will
    # likewise become unnecessary
    # NB that line numbers below will be 1 less due to zero-indexing of Python
    # first check that the edit still applies to current file
    exod = file2lines['02.Exodus.par']
    if exod[16284] == 'Exod 1:10':
        report('patching corrupt lines 16283-16289 in 02.Exodus.par...')
        fixed_lines = exod[:16283] + [exod[16285]] + exod[16288:]
        file2lines['02.Exodus.par'] = fixed_lines
        report('\tdone')
        n_edits += 1
    else:
        if debug:
            raise Exception('EXODUS CORRUPTION REPAIR SKIPPED!')
        report('**WARNING: SKIPPING EXODUS CORRUPTION REPAIR DUE TO CHANGED LINE NUMBERS; see code')

    # orphaned lines are cases where parts of a line are inexplicably broken off;
    # these are handled in bulk in a loop further below, but Ps 68:31 contains a
    # special case with 2 orphaned lines in a row
    # to prevent the need for a recursive algorithm, we just fix it manually;
    # we do it here to avoid needing to adjust indices after the correction
    # listed subsequent to this one
    pss = file2lines['20.Psalms.par']
    if pss[10848] == 'MTR':
        report('patching double-orphaned lines in lines 10849-10851 of 20.Psalms.par (Ps 68:31)')
        ps68_31_patch = [pss[10848] + pss[10849] + pss[10850]]
        file2lines['20.Psalms.par'] = pss[:10848] + ps68_31_patch + pss[10851:]
        report('\tdone')
        n_edits += 1
    else:
        if debug:
            raise Exception('PSALMS ORPHAN REPAIR SKIPPED!')
        report('**WARNING: SKIPPING PSALMS ORPHAN REPAIR DUE TO CHANGED LINE NUMBERS; see code')

    # an identical corruption to the one discussed above for Exod 35:19
    # occurs likewise in 20.Psalms.par lines 2455-2459
    pss = file2lines['20.Psalms.par']  # rename to resume corrected data
    if pss[2459] == 'Ps 18:40':
        report('patching corrupt lines 2457-2461 in 20.Psalms.par...')
        fixed_lines = pss[:2456] + [pss[2457]] + pss[2460:]
        file2lines['20.Psalms.par'] = fixed_lines
        report('\tdone')
        n_edits += 1
    else:
        if debug:
            raise Exception('PSALMS ORPHAN REPAIR 2 SKIPPED!')
        report('**WARNING: SKIPPING PSALMS ORPHAN REPAIR #2 DUE TO CHANGED LINE NUMBERS; see code')

    # There is repeated material in Ezek, lines 20600-20607 (Ezek 47:20)
    # We repair that here
    ezek = file2lines['44.Ezekiel.par']
    if ' ' in ezek[20599]:
        ezek[20599] = "--+ =:XMT\tHMAQ"
        fix = ezek[:20600] + ezek[20607:]
        file2lines['44.Ezekiel.par'] = fix
        n_edits += 1
    else:
        if debug:
            raise Exception('EZEKIEL DUPLICATE CONTENT REPAIR SKIPPED!')
        report('**WARNING: EZEKIEL DUPLICATE CONTENT REPAIR SKIPPED; see code')

    # -- Repair Orphaned Lines --
    # a search for lines without \t reveals that numerous lines are
    # orphaned from their original line; for instance, see DanTh 6:17:
    #     >>> 4132 L/DNY)L ,,a TO\N
    #     >>> 4133 DANIHL
    # here DANIHL should be a part of the previous line
    # this problem is found in Sirach, Psalms, Daniel, Chronicles, Ezekiel, Neh,
    # etc. and is correlated with the book names. For instance, in the Psalms,
    # the Hebrew column is affected anywhere the characters "PS" appear (פס);
    # in Deuteronomy, the Greek column is affected wherever DEUT appears in the text.
    # This was probably caused by a bad export and a regex pattern that inserted a
    # newline everywhere a book reference was found in the database, with the ill effect
    # that text containing the first characters of the book names was also cleft by the newline.
    # Since most book abbreviations contain vowels, the Greek column is primarily affected,
    # meaning that orphaned lines need to be shifted up and appended to the Greek column.
    # The one exception is Psalms, where the "PS" string matches anywhere a
    # פס appears in the text. These cases need to be merged down to the BEGINNING of the
    # subsequent line, in the Hebrew column.
    # This script will provide a detailed report in the log about which passages are affected,
    # as well as how these effects are corrected (either shift up or shift down).
    # TODO: This could be better patched by doing a simple search/replace in the text
    # for all text beginning with book names and preceded by a newline;
    # this will need a regex pattern that can differentiate genuine book names from text
    report('patching orphaned lines (see code for description)...')
    current_verse = None
    for file, lines in file2lines.items():
        filtered_lines = []
        i = 0
        while i < len(lines):
            line = lines[i]
            # track references and keep them
            if ref_string.match(line):
                current_verse = line
                filtered_lines.append(line)
            # apply corrections to relevant lines
            elif line and '\t' not in line:
                # append to log and report which lines are involved
                show = f'\n\t\t{lines[i-1]}\n\t--> {line}\n\t\t{lines[i+1]}'
                report(f'\tpatching {file} at line {i}, {current_verse}:{show}')
                n_edits += 1
                # shift line down to HB col if it's in Psalms
                if current_verse.startswith('Ps'):
                    filtered_lines.append(line + lines[i+1])
                    i += 1  # shift forward 1 extra to skip already-covered line
                # otherwise shift it up to GK col
                else:
                    filtered_lines[-1] = filtered_lines[-1] + line
            # keep everything else unchanged
            else:
                filtered_lines.append(line)
            # advance the position
            i += 1
        # reassign to new lines
        file2lines[file] = filtered_lines
    report('\tdone')

    # -- Bulk Normalizations --
    # changes which need to be effected systematically are loaded into tuples:
    #     (regex, replace)
    # the changes are enacted with regex substitutions
    # not all of these are strictly errors (though they may be); there
    # are numerous cases of normalizations applied to bring idiosyncratic
    # patterns in line with the majority
    # NB that the order of some changes matters, since some patterns are
    # dependent on other idiosyncrasies being fixed already
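    # for example (illustration only), the first pair below, ('~', '^'), is compiled
    # and applied with re.sub to every line of every file, in the order listed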
    normalizations = [
        ('~', '^'),
        ('----\+---', "--- ''"),  # see 2 Chr 27:8
        ("---\+", "--+"),
        ("<([^\s>]*)(\s)(?!.*[>#])", '<\g<1>>\g<2>'),  # numerous unclosed brackets
        # NB: on below, cases of `{..`; some cases may be ambiguous whether they should be
        # {... or {..^ However, it is the stated preference of the docs that
        # during encoding {... is to be preferred (1986:7.6)
        # and it also seems that several of the examples have a majority
        # preference of {... over {..^; thus we go with the former
        ('{\.\.(?![.^a-z])', '{...'),
        ('\.\.\.\.', '...'),
        ('\(!\)', '{!}'),  # (!) to {!}, inf. abs.
        ('(?<![-*])\-\+', '--+'),  # -+ to --+
        ('A(?=.*\t)', ''),  # vowels in the Hebrew column, replace with nothing
        ('=&p', '=%p'),  # =&p typo for =%p, preposition differences
        ('(?<!-)--(?![-+])', '---'),  # -- to ---
        ('=a', '=@a'),
        # NB order of this block matters, to ensure space to left of =
        ('=%p=', '=%p-'),
        ('([:;])=', '=\g<1>'),  # e.g. := to =:
        ('([^A-Z\/()\s|{}])=', '\g<1> ='),  # ensure space to left of = (col.B marker)
        # this is a case of elision with interruption;
        # it would be more consistent to code it as a separate {...} remark,
        # so we close the previous brace and add a second
        ('(?<![{\[])\.\.\.(?![}\]])', '}{...'),
        ('=%pa', '=%vpa'),
        ('-%vap', '=%vap'),
        ('{\.\.\.r', '{..r'),
        ('=p(?=[\s-])', '=%p'),
        ('<Sp>', '<sp>'),
        ('=vpa', '=%vpa'),
        ('{d}%p(\+?)', '%p\g<1> {d}'),
        ('\+;', '=;'),
        ('=\?:', '=:?'),
        ('={d};', '=;{d}'),
        ('=p%([-+\s])', '=%p\g<1>'),
        ('{d\t', '{d}\t'),
        ('{15{', '{15}'),
        ('\(\?5\)', '{?5}'),
        ('\[\.\.\.\]', '[..]'),
        ('{(\d+)(\s)', '{\g<1>}\g<2>'),
        ('(\s)(\d+)}', '\g<1>{\g<2>}'),
        ('=%\?p(-?)', '=%p\g<1>?'),
        (' ([a-z][a-z]) (?=.*\t)', ' .\g<1> '),
        ('\(\.\.', '{..'),
        (r'\\(?=.*\t)', '/'),
        # order of block matters here
        ('\[([a-zA-Z])}', '{\g<1>}'),
        ('\[([\d.a-z]+)(?!.*\])', '[\g<1>]'),
        ('\s\s\s\s+', ' '),
        ('{\.\.\.\^', '{..^'),
        ('{\.\.\^\.', '{..^'),
        ('{\.\.\.([a-z]+)', '{..\g<1>'),
        ('{t\.}', '{t}'),
        ('<t\?>', '{t?}'),
        ('(\s)\?--\+(\s)', '\g<1>--+?\g<2>'),
        # move question marks contained in brackets
        # to the end of the brackets; this normalizes the `?`
        # and allows us to treat them as external decorators
        # rather than allowing them to interrupt a symbol
        (r"{([^}]*)(\?\??)(.*?)}", "{\g<1>\g<3>}\g<2>"),
        # normalize verse cross references in Hebrew portion
        #('\[\[(.*[a-zA-Z]+.*\d\..*)\]\](?=.*\t)', '<\g<1>>'),
        (r"\[\[(.+?)\]\](?=.*\t)", "<\g<1>>"),
        (r"{dt}", "{d}{t}"),
        # move `?` to end of etymological exegesis symbol
        (r"=@\?(\S*)a", "=@\g<1>a?"),
        # close up unclosed curly brackets
        (r"{([^\[}#]+)( +|$)(?!.*[}#])", "{\g<1>}\g<2>"),
        (r"\^\^\^ \^ ''", "^^^ ^"),
        (r"=([A-Z()/&$+]+)a", "=@\g<1>a"),
        # change brackets of cross references in Hebrew portion to <>
        # where <...> represents a 'note'
        ("\[([^\]]*?\d[\]]*?)\](?=.*\t)", "<\g<1>>"),
        # patch misplaced accents
        (r"(\t.*)(\s)([()])(.)", "\g<1>\g<2>\g<4>\g<3>"),
        (r"(\t.*)=\)", "\g<1>)="),
        (r"\|=", "=|"),
        (r"\|\)", ")|"),
        (r"TO\|N", r"TO\\N"),
        (r"KAI\|", r"KAI\\"),
        (r"I\(MAT/TIA", "I(MA/TIA"),
        (r"ZN=\|", "ZH=|"),
        (r"H\)R=TAI", "H)=RTAI"),
        (r"OY\)K", "OU)K"),
        (r"EC/NOIS", "CE/NOIS"),
        (r"TH=/S", "TH=S"),
    ]

    report('\nMaking various bulk regex normalizations...\n')
    for search, replace in normalizations:
        report(f'---- applying pattern `{search}` with replace `{replace}` ----')
        search = re.compile(search)  # compile for efficiency
        pattern_successful = False
        for file, lines in file2lines.items():
            new_lines = []
            curr_verse = ''
            for i, line in enumerate(lines):
                # track passages for reporting since line numbers have already changed
                if ref_string.match(line):
                    curr_verse = line
                # apply substitutions
                if search.findall(line):
                    redaction = search.sub(replace, line)
                    new_lines.append(redaction)
                    report(f' in {file} in {curr_verse}:')
                    report(f'\tOLD: {line}')
                    report(f'\tNEW: {redaction}')
                    pattern_successful = True
                    n_edits += 1
                # else keep line the same
                else:
                    new_lines.append(line)
            file2lines[file] = new_lines
        if not pattern_successful:
            if debug:
                raise Exception(f'PATTERN NOT FOUND: {search}')
            else:
                report(f'WARNING, PATTERN NOT FOUND: {search}')

    # export the corrected files
    report(f'\nwriting patched data to {output_dir}')
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    for file, lines in file2lines.items():
        text = '\n'.join(lines)
        file_path = output_dir.joinpath(file)
        file_path.write_text(text)

    # write changes to a log file
    log_path = output_dir.joinpath('log.txt')
    log_path.write_text(log)

    report('\nDONE with all patches!')
    report(f'\ttotal edits: {n_edits}')
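

# Minimal usage sketch (an assumption, not part of the original module): running this
# file directly applies both patch routines with their default source/output paths.
if __name__ == '__main__':
    patch_morpho()
    patch_parallel()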