-
Notifications
You must be signed in to change notification settings - Fork 2
/
check-nonos-2.sh
executable file
·159 lines (124 loc) · 6.83 KB
/
check-nonos-2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Load the shared helpers.
# NOTE(review): `check`, $PREFIX, ${UL} and ${NL} are used throughout this
# script but are not defined in it — presumably they come from this file; confirm.
source check-nonos-utils.sh
# Commented-out sample `check` invocations (labelled as quoting / escaping tests):
# check 9 f.html "test for quoting" "" "" "ack --color ' id=\"bonuses\"|'\' __"
# check 9 f.html "test for escaping" "" "" "echo '\n\\n\\\n\\\\n \\x7F'"
# dont() uses the extended-glob pattern `!(*.o)`. Bash only accepts that syntax
# when extglob is already enabled at the moment the function definition is
# parsed; otherwise the definition itself is a syntax error. Enable it
# explicitly here instead of relying on a sourced file (no-op if already on).
shopt -s extglob

# dont: checks over the generated HTML, the answers tables and the Markdown.
#
# Globals (read):
#   PREFIX  - prepended to the file globs of the direct pcregrep calls. It is
#             left unquoted exactly as before, so word splitting and globbing
#             of its value still apply.
#   UL, NL  - expanded inside some check titles/advice strings
#             (NOTE(review): presumably highlight on/off sequences — confirm).
#
# Every `check` call passes six arguments: a number, a file suffix, a title,
# a label (Check / Remedy / empty), an advice string (may be empty) and a
# command line containing the placeholder `__`.
# NOTE(review): `check` is not defined in this file; the meaning of its
# arguments (number = priority?, `__` = matched files?) is inferred from the
# call sites — confirm against its definition.
dont() {
  # don't
  # Multi-line match: an empty <p><br /></p> paragraph followed by two more
  # paragraphs whose first characters fall outside the listed sets.
  pcregrep --color=always -M '<p><br /></p>\n<p>[^TB].*?</p>\n<p>[^A[]' $PREFIX!(*.o).html
  #pcregrep --color=always -M '(?<!<p>)<br />' $PREFIX!(*.o).html
  #pcregrep --color=always -M '<br />(?!</p>)' $PREFIX!(*.o).html
  # Paragraphs that start with a space.
  pcregrep --color=always -M '<p> ' $PREFIX!(*.o).html

  # Lines ending in <br> or <br />, shown with one line of context.
  check 1 p.o.html "Soft line breaks (or <br> in HTML)" \
    Remedy "Replace with paragraphs in source document" \
    "rg --heading --color=always -C1 '<br(?: /)?>$' __"

  # grep -c reports the number of matching lines per file.
  check 1 p.f.html "Count of ${UL}tu\"${NL}, ${UL}bonus\"${NL}, and ${UL}ANSWER:${NL}" \
    Check "All counts should equal 120 (20 tu + 20 bonus + 80 answer)" \
    "grep --color=always -E -c 'tu\"|bonus\"|ANSWER: ' __"
  # TODO: count up <author> tags here
  # count up total tags i.e. <

  # Frequency table of tab-separated field 4, most frequent first.
  check 2 answers "Count of unique categories" "" "" \
    "cat __ | cut -f4 | sort | uniq -c | sort -nrk1"

  # Frequency table of tab-separated field 3, most frequent first.
  check 2 answers "Count of unique authors" "" "" \
    "cat __ | cut -f3 | sort | uniq -c | sort -nrk1"

  # Any field from the 5th onward that is 30 or more characters long; gensub
  # reduces FILENAME to the part between the last '/' and the last '.'.
  check 2 answers "Long answers" "" "" \
    "gawk 'BEGIN { FS=OFS=\"\t\" } { for (i=5; i<=NF; i++) if (length(\$i) >= 30) print \$1,gensub(/^.+\\/(.+)\\..+$/,\"\\\\1\",\"g\",FILENAME),\$2,length(\$i),\$i}' __"

  # Backslash, parentheses, square brackets, #, _, *, ^ or ~ anywhere in a line.
  check 2 answers "Special characters in parsed tags or answers" \
    Check "If should be removed" \
    "rg --heading --color=always '[\\\\()\\[\\]#_*^~]' __"

  # Tally of the distinct line-initial "[..." markers (optionally preceded by
  # a backslash) across all files.
  check 2 md.nowrap "Bonus marker count" \
    Check "Should have the same number of each marker" \
    "rg -I -o '^\\\\?\[(?:10)?[emhEMH\S]*' __ | sort | uniq -c"

  check 2 md.nowrap "Line count" \
    Check "All files should have the same number of lines (false positives: shorter tiebreaker packets)" \
    "wc -l __"

  check 2 p.o.html "Line count" \
    Check "Use to investigate: pr -t -m -w144 -n4 __" \
    "wc -l __"

  # A comma-separated item followed by and/or without a comma before the
  # conjunction; matches directly after "10 points" are excluded.
  check 2 md "Serial comma" "" "" \
    "grep --color=always -PHn '(?<!10 points),\s+\S+[^,]\s+(and|or)\s(?!\S+\\>)' __"

  # Case-insensitive search for the word "moderator".
  check 3 md "Unnecessary moderator notes (${UL}moderator${NL})" \
    Check "If the question can be improved another way" \
    "grep --color=always -iHn 'moderator' __"
  # TODO: add note to player

  # " which" that is not preceded by a comma (optionally inside a closing
  # quote) or by one of the listed prepositions/conjunctions. The alternatives
  # in the lookbehind are padded with '.' so that each has the same length.
  check 3 md "${UL}which${NL} vs. ${UL}that${NL}" \
    Check "If ${UL}which${NL} should be replaced by ${UL}that${NL} (false positives: quotations)" \
    "rg --heading --color=always -P -i '(?<!(?:....(?:.,|,[\"”])|... (?:in|of|to|at|on|by)|.. (?:for|and|but)|. (?:from|with|upon|into|onto|over)| after| under| among| about| above| below|around|during|hrough|ithout|toward)) which' __"
  # TODO: add past|across
  # TODO: not followed by (states|says|etc.) that
  # TODO: which should be always lowercase
  # TODO: use nowrap
  # tregex.sh '@SBAR <<: S' -- $PREFIX*.parsed
  # Tregex: SBAR !$,, /,/ & < (WHNP <<: which)
  # TODO: add phrasal -ing -ed (-designed)
  # TODO: add non-
  # TODO: add languagetool
  # TODO: add [] - see docx-to-unparsed.sh
}
# Run the checks defined in dont() above.
dont
# informative checks only
# nbsp:
# ack -ho --mn '(?:\S* )+\S+' $PREFIX | sort
# ruby:
# ack -oh '\S+\s*(?<!class="text">)<ruby>.*?</ruby>\s*\S+' $PREFIX*.r.html | sed -E 's@(<ruby><rb>|</rb><rp>|</rp><rt>|</rt><rp>|</rp></ruby>)@'$'\t@g' | column -ts$'\t'
# TODO: unparsed (malformatted) PGs
# ack '\[“\w' tournaments/hft19/packets/*.r.html
# num of ruby should match num of (" ")
# TODO: check caverphone - PGs that may be wrong (number of words too short/long)
# see instrucs in htmlparser
# check 1
# Direct scan (not routed through `check`): print every line of the
# $PREFIX*.o.html files where a '<' is preceded by a character other than '>'.
rg --color=always '[^>]<' $PREFIX*.o.html
# A numbered paragraph in which a closing </b>/</strong> is later followed by
# another opening <b>/<strong> that is itself followed by text.
check 3 p.o.html "Bold tag interrupted" \
Check "If should be removed" \
"rg --heading --color=always '<p( class=\"p1[^\"]*\")?>\d.*</(b|strong)>.*<(b|strong)>[^<]' __"
# TODO: count tags (valid) - need commas
# Per-file count of lines containing a backslash-escaped tag, i.e. \<...\>.
check 4 md "Count tags" "" "" \
"rg --color=always -c '\\\\<(.*)\\\\>' __"
# exit
# dont1: three prose-style checks (hyphen vs. en dash, a/an agreement, short
# sentences opening with "That"). Each search command is built into a local
# first and then handed to `check` together with its number, file suffix,
# title, label and advice text.
dont1() {
  # TODO: start after the packet header
  # TODO: should be nowrap
  # Two capitalized words joined by a hyphen, except the whitelisted names.
  local hyphen_rg="rg --heading --color=always -P '(?!Naveh-Benjamin|Co-Head)\p{Lu}\p{Ll}+-\p{Lu}\p{Ll}+' __"
  check 3 md "Hyphen between two capitalized words" \
    Check "If an en dash should be used for doubly eponymous" \
    "$hyphen_rg"

  # TODO: should be nowrap
  # TODO: pipe out to http://jwilk.net/software/anorack
  # anorack | while IFS=: read -r f l e; do echo -e "\n$f:$l\n$e"; sed -n "${l}p" $f; done
  # "a" before a vowel letter or "an" before a non-vowel, with the exceptions
  # encoded in the lookaheads.
  local article_rg="rg --heading --color=always -P '\b[Aa] (?![\\\\[(]*emphasize)[()\[\]\\\\* ]*+(?!(?i)(uni(?:vers|form)|union|uniqu|utopi|Euro|USA?|[U][A-Z]*\b))[AEIOUaeiou]| an [()\[\]\\\\* ]*+(?!1[18]|8|[HS][A-Z]*\b)[^aeiouAEIOUéÉ<“]' __"
  check 3 md "Wrong usage of a/an" \
    Check "If ${UL}a${NL} + vowel or ${UL}an${NL} + consonant usage is correct (false positives: initialisms, silent H, consonantal U, medial A)" \
    "$article_rg"

  # TODO: fix for Packet by Columbia A and Texas A
  # Sentences that begin with "That" and end within 70 characters.
  local that_rg="rg --heading --color=always -o '\. That.{0,70}\.\S*' __"
  check 4 md.nowrap "Short sentences starting with “That”" \
    Check "If a semicolon should connect the previous sentence instead (false positives: initials)" \
    "$that_rg"
}
# Run the checks defined in dont1() above.
dont1
# rg rewrites each match as "<prefix>►<pronoun phrase>" (groups a and b); for
# lines matching /[0-9]. / the awk stage then prints the length of the text
# before the ► marker (computed after blanking the first field) followed by
# the unmodified line, and passes every other line through unchanged.
check 4 p.txt "First full pronoun is too far into tossup (70 chars, ignoring PGs)" \
Check "If the pronoun can be moved earlier instead" \
"rg --heading --color=always -P -n -ior '\$a►\$b' '^(?P<a>\d+\. (?>(?> [([](?!this|these).*?[)\]])*+(?!(?&r)).){70}.*?)(?P<b>(?P<r>this|these) \S+(?!.*points each))' __ | awk '/[0-9]. /{o=\$0;\$1=\"\";match(\$0,\"^[^►]*\");printf(\"%4s %s\n\",RLENGTH-1,o);next}{print}'"
# reimplement ack's --range-end='Bonuses' for rg
# "ack -i --range-end='Bonuses' '^\d+\. .*?\K(?:this|these)(*PRUNE)(?<=.{70}) \S+' __"
# "sed '/^Bonuses/Iq' __ | rg --color=always -P -n -io '^\d+\. (?>(?> [([](?!this|these).*?[)\]])*+(?!(?1)).){70}.*?(this|these) \S+'"
# TODO: he him his [this
# TODO: ignore *Note to ... .*
# TODO awk length should not take into account PGs
# Every distinct character outside \x00-\x7F, de-duplicated and printed on a
# single line.
check 5 md.nowrap "List of all non-ASCII characters" "" "" \
"grep -Poh '[^\\x00-\\x7F]' __ | sort | uniq | tr -d '\n'; echo"
# Lines containing a character outside ASCII, outside \xC0-\xFF (minus \xD7
# and \xF7) and outside the punctuation listed at the end of the class.
check 5 md "List instances of most non-ASCII characters" "" "" \
"rg --color=always '[^\\x00-\\x7F\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\xFF –—‘’“”…]' __"
# Same character class as the previous check, applied to the answers files.
check 5 answers "List instances of most non-ASCII characters in answerlines" Check "If lunr-unicode-normalizer translates them correctly" \
"rg --color=always '[^\\x00-\\x7F\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\xFF –—‘’“”…]' __"
# check 5 log "Obsolete LaTeX checks" "" "" \
# "grep --color -EHnA4 'Missing|erfull' __"
# TODO: add test document (tests/checks/)
# TODO: can space ( ) be changed to [ \n] to work with md? or md.nowrap
# TODO: if glob resolves to no files then skip? or show error message?
# TODO: filter by priority
### TODO: ack -c <m w.html
# TODO: check numbering goes in order
# TODO: show frequency=1 authors and categories in context
# TODO: check if frequency<4 tags are typo
# TODO: grep " >"
# TODO: grep "[(this|etc)"
# TODO: grep "(?)" power marks
# TODO: questions or bonus parts that don't end in punctuation mark
# words.sh: add report with histogram; number of PGs (per packet)
# rg --color=always 'Answer ' $PREFIX*.md.nowrap