-
Notifications
You must be signed in to change notification settings - Fork 47
/
case_parse.py
242 lines (200 loc) · 8.15 KB
/
case_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import re
import sys
# Major version number of the running interpreter.
PYTHON = sys.version_info[0]

# Python 3 has no xrange(); alias it to range so code written against the
# Python 2 name keeps working on both major versions.
if PYTHON == 3:
    xrange = range
"""
Parses a variable into a list of words.
Also returns the case type, which can be one of the following:
- upper: All words are upper-case.
- lower: All words are lower-case.
- pascal: All words are title-case or upper-case. Note that the variable may still have separators.
- camel: First word is lower-case, the rest are title-case or upper-case. Variable may still have separators.
- mixed: Any other mixing of word casing. Uncommon when there are no separators, but still possible when digits break title-casing (e.g. 'fooA1b').
- unknown: Variable contains no words.
Also returns the first separator character, or False if there isn't one.
"""
def parseVariable(var, detectAcronyms=True, acronyms=None, preserveCase=False):
    """Split a variable name into words and classify its casing.

    Args:
        var: The variable name to parse (a string).
        detectAcronyms: When true, runs of single upper-case letters produced
            by the splitter ("H", "T", "T", "P") are merged back into
            acronyms.
        acronyms: Optional iterable of known acronyms. When non-empty, only
            these acronyms are grouped inside a letter run and any leftover
            letters stay as one-letter words. Entries that are not purely
            ASCII alphanumeric are discarded with a printed notice; the rest
            are compared upper-cased. When empty or None, every letter run is
            merged into a single word. The caller's object is never modified.
        preserveCase: When true, words keep the casing they had in ``var``.
            When false, each word is capitalized, except that known acronyms
            (or, without an acronym list, fully upper-case words) are kept
            upper-case while ``detectAcronyms`` is on.

    Returns:
        A ``(words, caseType, hasSep)`` tuple:
        - words: list of word strings, separators removed.
        - caseType: 'upper', 'lower', 'pascal', 'camel', 'mixed', or
          'unknown' (no words were found).
        - hasSep: the first separator character found, or False if the
          variable contains none.
    """
    # TODO: include unicode characters.

    # Treat an all-caps variable as lower-case, so that every letter isn't
    # counted as a boundary.
    wasUpper = var.isupper()
    if wasUpper:
        var = var.lower()

    tokens, hasSep = _splitTokens(var)

    if detectAcronyms:
        # The choice between list-based and simple detection is made on the
        # list as supplied; if every entry turns out to be invalid the
        # list-based detector still runs, with nothing to match.
        known = None
        if acronyms:
            acronyms = _sanitizeAcronyms(acronyms)
            known = acronyms
        tokens = _mergeLetterRuns(tokens, known)

    # Separators are no longer needed, so they can be removed. They *should*
    # be removed, since it's supposed to be a *word* list.
    words = [t for t in tokens if t is not None]

    caseType = _detectCaseType(var, words, wasUpper)

    if preserveCase:
        if wasUpper:
            # var was lower-cased above; restore the original casing.
            words = [w.upper() for w in words]
    elif not detectAcronyms:
        words = [w.capitalize() for w in words]
    elif acronyms:
        # Normalize to PascalCase, keeping known acronyms upper-case.
        words = [w.upper() if w.upper() in acronyms else w.capitalize()
                 for w in words]
    else:
        # Fallback behavior: preserve case on upper-case words.
        words = [w if w.isupper() else w.capitalize() for w in words]

    return words, caseType, hasSep


# Single-character classifiers used by the helpers below. Compiled once at
# import time instead of on every call.
_UPPER_CHAR = re.compile(r'^[A-Z]$')
_SEP_CHAR = re.compile(r'^[^a-zA-Z0-9]$')
_NOTSEP_CHAR = re.compile(r'^[a-zA-Z0-9]$')
_VALID_ACRONYM = re.compile(r'^[a-zA-Z0-9]+$')


def _splitTokens(var):
    """Split var at upper-case letters and separator boundaries.

    Returns (tokens, firstSep). tokens holds word strings, with None standing
    in for each run of separator characters; firstSep is the first separator
    character, or False if there is none.
    """
    tokens = []
    firstSep = False
    # Index of the first character of the current sequence.
    start = 0
    # Previous character.
    prev = var[0:1]
    # Start at 1 because the 0th character is never a boundary; go one past
    # the end so the text after the last boundary is flushed too.
    for i in range(1, len(var) + 1):
        cur = var[i:i + 1]
        if i < len(var):
            split = bool(
                _UPPER_CHAR.match(cur)
                or (_NOTSEP_CHAR.match(cur) and _SEP_CHAR.match(prev))
                or (_SEP_CHAR.match(cur) and _NOTSEP_CHAR.match(prev))
            )
        else:
            split = True

        if split:
            if _NOTSEP_CHAR.match(prev):
                tokens.append(var[start:i])
            else:
                # Use the first separator as the variable's primary one.
                if not firstSep:
                    firstSep = var[start:start + 1]
                # Keep a None placeholder so that letter runs on either side
                # of a separator ("AAA_BBB") are not merged into one run.
                tokens.append(None)
            start = i
        prev = cur
    return tokens, firstSep


def _sanitizeAcronyms(acronyms):
    """Return the valid acronyms upper-cased, reporting the invalid ones."""
    valid = []
    for a in acronyms:
        if _VALID_ACRONYM.match(a):
            valid.append(a.upper())
        else:
            print("Case Conversion: acronym '%s' was discarded for being invalid" % a)
    return valid


def _groupKnownAcronyms(letters, acronyms):
    """Cut a string of upper-case letters into known acronyms and leftovers.

    Acronyms are tried in list order; a match that overlaps an earlier one is
    skipped. Letters not covered by any acronym become one-letter words.
    """
    claimed = []
    unclaimed = set(range(len(letters)))
    for acronym in acronyms:
        if not acronym:
            # An empty needle would match everywhere and never advance.
            continue
        pos = 0
        # Loop so that all instances of the acronym are found, not just the
        # first.
        while True:
            start = letters.find(acronym, pos)
            if start < 0:
                break
            end = start + len(acronym)
            pos = end
            if all(end <= lo or start >= hi for lo, hi in claimed):
                claimed.append((start, end))
                unclaimed.difference_update(range(start, end))
    claimed.extend((k, k + 1) for k in unclaimed)
    # No ranges overlap, so sorting by lower bound restores source order.
    claimed.sort()
    return [letters[lo:hi] for lo, hi in claimed]


def _mergeLetterRuns(tokens, known):
    """Regroup runs of consecutive single upper-case letter tokens.

    With known=None each run is joined into one word; otherwise the run is
    cut up by _groupKnownAcronyms using the given acronym list. All other
    tokens, including None separators, pass through unchanged.
    """
    merged = []
    run = []

    def flush():
        if not run:
            return
        letters = ''.join(run)
        if known is None:
            merged.append(letters)
        else:
            merged.extend(_groupKnownAcronyms(letters, known))
        del run[:]

    for token in tokens:
        if token is not None and _UPPER_CHAR.match(token):
            run.append(token)
        else:
            flush()
            merged.append(token)
    flush()
    return merged


def _detectCaseType(var, words, wasUpper):
    """Classify casing as upper, lower, camel, pascal, mixed or unknown."""
    if wasUpper:
        return 'upper'
    if var.islower():
        return 'lower'
    if not words:
        return 'unknown'
    first = words[0]
    restCapitalized = all(w.istitle() or w.isupper() for w in words[1:])
    if first.islower() and restCapitalized:
        return 'camel'
    if (first.istitle() or first.isupper()) and restCapitalized:
        return 'pascal'
    return 'mixed'