-
Notifications
You must be signed in to change notification settings - Fork 20
/
ci.py
415 lines (357 loc) · 11.7 KB
/
ci.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# -*- coding: utf-8 -*-
import random
import os
import time
# GLOBAL
_cc = 3
# 四声
yun = {"平":[],"上":[],"去":[],"入":[]}
# 平水韵
lines = list(open("db/psy.txt"))
# minimum frequency requirement for characters
poplimit = 40
resultlimit = 30
maxtrial = 200
poems = []
CT = ""
# 一首词的数据类型
class Poem():
def __init__(self,title,author,intro,content,prop=1):
self.title = title
self.author = author
self.intro = intro
self.content = content
self.prop = prop
# 整理平水韵
def makeyun():
for i in range (0, len(lines)):
if "声" in lines[i] and ":" in lines[i]:
nl = lines[i+1].replace("[","@").replace("]","@").replace("\n","")
nl = "".join([nl.split("@")[k] for k in range (0, len(nl.split("@")),2)])
nt = []
for k in range(0,len(nl),_cc):
nt.append(nl[k:k+_cc])
yun[lines[i][0:_cc]].append(nt)
# 在平水韵里找到一个字的平仄
def lookup(zi):
for i in yun.keys():
for j in yun[i]:
if zi in j:
return [i,j]
return None
# 分析数据库每一行信息类型
def linetype(sc,i):
if "," in sc[i] and not("。" in sc[i]):
return "intro"
if (len(sc[i])<=_cc*10 and "全宋词" not in sc[i] and "," not in sc[i] and "。" not in sc[i]) or ("(" in sc[i] and len(sc[i])<=_cc*20):
if sc[i-1] != "\n":
if (len(sc[i-1])>_cc*10 or len(sc[i-1])<=_cc*3):
return "title"
else:
return "intro"
else:
return "author"
else:
if sc[i] != "\n":
return "content"
return False
# 整理全宋词
def classify():
sc = list(open("db/qsc.txt"))
currauthor = ""
for i in range(1, len(sc)):
if linetype(sc,i) == "author":
currauthor = sc[i]
if linetype(sc,i) == "title":
sci = sc[i].split("(")[0].replace("\n","")
con = ""
intro = ""
j = i+1
while linetype(sc,j) == "content" or linetype(sc,j) == "intro":
if linetype(sc,j) == "intro":
intro += sc[j]
if linetype(sc,j) == "content":
con += sc[j]
j += 1
poems.append(Poem(sci,currauthor,intro,con.replace("\n","")))
# 一个字符是否是标点符号?
def ispunc(zi):
if zi !="," and zi !="。" and zi !="、" and zi !=";" and zi !=")":
return False
return True
# sorting dictionary by value
def sortdict(mydict):
return sorted(mydict.iteritems(), key=lambda (k,v): (v,k), reverse = True)
# 根据一个字猜下一个字
def guessnext(zi):
D = {}
for p in poems:
c = p.content
for i in range(0,len(c),_cc):
if c[i:i+_cc] == zi:
if not ispunc(c[i+_cc:i+_cc*2]):
x = c[i+_cc:i+_cc*2]
if x not in D:
D[x] = 0
D[x] += p.prop
return D
# 根据一个字猜上一个字
def guesslast(zi,foo):
D = {}
for p in poems:
c = p.content
for i in range(0,len(c),_cc):
if c[i:i+_cc] == zi and i > 0:
if not ispunc(c[i-_cc:i]):
x = c[i-_cc:i]
if x not in D:
D[x] = 0
D[x] += p.prop
return D
def guesswithpos(zi,pos,dir):
D = {}
for p in poems:
c = p.content
for i in range(0,len(c),_cc):
if c[i:i+_cc] == zi and i > 0:
if not ispunc(c[i+dir*_cc:i+_cc+dir*_cc]):
pis = posinsent(c,i+_cc*dir)
if midsent((pos[0]+(dir+1)/2)%pos[1],pos[1]) == midsent((pis[0]+(-dir+1)/2)%pis[1],pis[1]):
x = c[i+dir*_cc:i+_cc+dir*_cc]
if x not in D:
D[x] = 0
D[x] += p.prop
return D
# 从现成词(字符串)中确定某个字在句子中的位置。
def posinsent(c,ind):
start = 0
end = len(c)
for i in range(ind,0,-_cc):
if i == 0 or ispunc(c[i-_cc:i]):
start = i
break
for i in range(ind,len(c),_cc):
if i >= len(c)-1 or ispunc(c[i:i+_cc]):
end = i
break
if end-start <= 0:
pass
return [(ind-start)/_cc,(end-start)/_cc]
# 根据字在句子中的位置判断他是否处于上下句承接处
def midsent(x,l):
if x == 0:
return True
# 0123-456
if l == 7 and x == 4:
return True
# 01-23-45
if l == 6 and (x == 2 or x == 4):
return True
# 01-234
if l == 5 and x == 2:
return True
# 01-23
if l == 4 and x == 2:
return True
return False
# 字在数据库中出现频率
def popularity(zi):
po = 0
for p in poems:
c = p.content
for i in range(0,len(c),_cc):
if c[i:i+_cc] == zi:
po += 1
return po
# 检查字是否符合格律
def isOK(pai,z,i,yg,usedyg):
while lookup(z) == None:
return False
lxj0 = lookup(z)[0]
if (lxj0 == "平" and (pai[i] == "1" or pai[i] == "0")) or\
(lxj0 != "平" and (pai[i] == "2" or pai[i] == "0")) and popularity(z) > poplimit/2:
return True
if yg == [] and int(pai[i]) >= 3 and(
(lxj0 == "平" and int(pai[i])%2 == 1 ) or\
(lxj0 != "平" and int(pai[i])%2 == 0 )\
) and popularity(z) > poplimit:
usedyg.append(z)
return True
if yg != [] and int(pai[i]) >= 3 and(\
(lxj0 == "平" and int(pai[i])%2 == 1 ) or\
(lxj0 != "平" and int(pai[i])%2 == 0 )\
) and(\
z in yg\
) and z not in usedyg and popularity(z) > poplimit:
#print z
#print "".join(usedyg)
usedyg.append(z)
return True
return False
# 重复字出现位置是否合适
def repeatOK(zi,result,output,pai):
if zi in output:
for i in range(0,len(output),_cc):
if output[i:i+_cc] == zi:
if i != 0 :
return False
l = poswithstruct(getstruct(pai),len(result)/_cc)[1]
if midsent(l-len(output)/_cc,l):
return False
if zi in result:
for i in range(0,len(result),_cc):
if result[i:i+_cc] == zi:
l = poswithstruct(getstruct(pai),len(result)/_cc)[1]
if (not (i == len(result)-_cc and l-len(output)/_cc == 0)) or (len(result)-i)/_cc>20:
return False
return True
# 返回词牌中每句字数
def getstruct(pai):
a = pai.replace(",","$").replace(".","$").replace("`","$").replace("|","").replace("*","")
b = a.split("$")[0:-1]
return [len(b[i]) for i in range(0,len(b))]
# 剔除词牌中标点符号
def unmark(pai):
return pai.replace(",","").replace(".","").replace("`","").replace("|","").replace("*","").replace("$","")
# 根据词牌,把标点符号加回一首词中
def mark(pai,ci):
np = list(pai)
j = 0
for i in range(0,len(np)):
if np[i] not in [",",".","`","|","*"]:
np[i] = ci[j:j+_cc]
j += _cc
elif ci[j:j+_cc] == "":
i = i + 1
break
np = np[:i]
np = ("".join(np))
if "*" in np:
rep = np.split("*")[0].split(".")[-1]
#print rep
np = np.replace("*",rep)
np = np.replace(",",",").replace(".","。").replace("`","、").replace("|","\n")
return np
# 根据韵脚把词牌分成句
def splitbyy(pai):
upai = list(unmark(pai))
nl = []
ne = ""
for s in upai:
ne += s
if int(s) >= 3:
nl.append(ne)
ne = ""
return nl
# 根据词牌每句字数和某字在词中位置判断该字在句子中位置
def poswithstruct(struct,ind):
s = 0
for i in range(0,len(struct)):
s = s + struct[i]
if ind < s:
return [ind - (s - struct[i]), struct[i]]
# 填词
def write(pai,ys,dir = -1):
result = ""
ygs = []
yg = [[]]
for y in ys:
ygs.append(lookup(y)[1])
usedyg = []
trial = []
# backtrack 递归部分
def writeci(pai,col,strict=True):
time.sleep(0)
#_=os.system("clear")
#print mark(PA,result+"".join(output))
#print col, mark(PA,"".join(output))
#print "".join(usedyg)
if col == (len(pai) if dir == 1 else -1):
return "".join(output)
elif output[col] != "":
return writeci(pai,col+dir)
else:
# 选韵脚
if int(pai[col])>= 3:
yg[0] = ygs[int(pai[col])-3]
if dir == -1:
x = yg[0]
elif dir == 1:
sd = sortdict(guesswithpos(output[col-dir],
poswithstruct(getstruct(PA),col-dir + len(result)/_cc),dir))
sd = [k for k,v in sd]
x = list(set(sd) & set(yg[0]))
else:
# 选字
sd = sortdict(guesswithpos(output[col-dir],
poswithstruct(getstruct(PA),col-dir + len(result)/_cc),dir))
#print sd
nsd = []
for ss in sd:
if len(sd) < 5 or (not strict) or ss[1] > poplimit/10:
nsd.append(ss)
x = [k for k,v in nsd]
x = x[0:min(len(x),resultlimit)]
#print "".join(x)
random.shuffle(x)
#print len(x)
#print trial
for j in range(0,min(len(x),resultlimit)):
if (not strict) or (isOK(pai,x[j],col,yg[0],usedyg) and repeatOK(x[j],result,"".join(output),PA)):
# 填填看
trial[col] += 1
if dir == 1:
#print trial[col]
#print strict
if trial[col] > maxtrial+10:
#strict = False
return None
elif trial[col] > maxtrial:
strict = False
else:
pass
#strict = True
output[col] = x[j]
#print strict
solution = writeci(pai,col+dir,strict)
if (solution != None):
return solution
if int(pai[col])>= 3 and x[j] in usedyg:
usedyg.remove(x[j])
# 填不出,回去重填
output[col] = ""
return None
PA = pai
# 填词
if dir == -1:
for s in splitbyy(pai):
output = [""]*(len(s))
trial = [0]*(len(s))
nl = writeci(s,len(s)-1)
if nl == None:
return ""
result+=nl
elif dir == 1:
#print unmark(pai)
trial = [0]*(len(unmark(pai)))
output = [""]*(len(unmark(pai)))
output[0] = "烂"
for i in range(0,len(CT)/_cc):
if i < len(getstruct(pai)):
output[sum(getstruct(pai)[0:i])] = CT[i*_cc:i*_cc+_cc]
nl = writeci(unmark(pai),0)
if nl == None:
return ""
result += nl
return mark(pai,result)
# 随机选择韵脚
def getrandy(n,bounds=[20,1000]):
ze = yun["上"]+yun["去"]+yun["入"]
ping = yun["平"]
pick = ""
while pick == "" or len(lookup(pick)[1])<bounds[0] or len(lookup(pick)[1])>bounds[1]:
if n%2 == 1:
pick = random.choice(ping)[0]
if n%2 == 0:
pick = random.choice(ze)[0]
return pick