functions.py
import os
import re
import collections
import numpy as np
import tensorflow as tf  # uses TF 1.x APIs (tf.variable_scope, tf.get_variable)
def is_chinese(uchar):
    """Return True if uchar falls in the common CJK Unified Ideographs range."""
    return '\u4e00' <= uchar <= '\u9fa5'
def get_chinese(txtpath):
    """Read a text file and return all of its Chinese characters, concatenated."""
    with open(txtpath, 'r', encoding='utf-8') as txt_file:
        file = txt_file.read()
    pattern = re.compile('[\u4e00-\u9fa5]+')
    result = re.findall(pattern, file)
    return ''.join(result)
def count_char(poetrys):
    """Count every character across all poems; return (char, count) pairs
    sorted by descending frequency."""
    all_words = []
    for poetry in poetrys:
        all_words.extend(poetry)
    counter = collections.Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])
    return count_pairs
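# For example (toy input, not from the corpus; ties keep first-seen order
# because the sort is stable):
#   count_char(['月下山', '月月 ']) -> [('月', 3), ('下', 1), ('山', 1), (' ', 1)]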
def count_chinese_word(filepath):
    """Count character frequencies in a file. Returns the Chinese-character
    counts, the same counts sorted by descending frequency, and the counts
    of all non-Chinese characters."""
    chinese_counts = {}
    other_counts = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as txt_file:
            file = txt_file.read()
        for char in file:
            if is_chinese(char):
                chinese_counts[char] = chinese_counts.get(char, 0) + 1
            else:
                other_counts[char] = other_counts.get(char, 0) + 1
    except IOError as ioerr:
        print(ioerr)
        print("File", filepath, "does not exist")
    sdict = sorted(chinese_counts.items(), key=lambda d: d[1], reverse=True)
    return chinese_counts, sdict, other_counts
def save_word_frequency(_dict, name, file_path):
    """Write (character, frequency) pairs, sorted by descending frequency,
    to <file_path>/<name>.txt."""
    with open(os.path.join(file_path, '%s.txt' % name), mode='w', encoding='utf-8') as out_file:
        out_file.write('序号\t汉字\t字频\n')  # header: index / character / frequency
        ranked = sorted(_dict.items(), key=lambda d: d[1], reverse=True)
        for i, (char, frequency) in enumerate(ranked, start=1):
            out_file.write('{0}.\t{1}\t{2}\n'.format(i, char, frequency))
def getpoetry(poetry_file=r'_txt\全唐诗.txt'):
    """Load poems, keeping only regular eight-line, five-character verse."""
    poetry = []
    with open(poetry_file, mode='r', encoding='utf-8') as file:
        for line in file:
            line = line.strip('\n')
            # Splitting on the Chinese punctuation marks leaves 8 text
            # segments plus a trailing empty string, i.e. 9 parts.
            parts = re.split('[,。?]+', line)
            if len(parts) != 9:
                continue
            if len(parts[0]) != len(parts[3]):
                continue
            if len(line) != 48:  # 8 lines x (5 characters + 1 punctuation mark)
                continue
            poetry.append(line + ' ')
    return poetry
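# A quick sanity check for the filter above (synthetic input: the four lines
# of《春晓》repeated once to form an eight-line, five-character poem):
#   line = '春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。' * 2
#   re.split('[,。?]+', line) yields 9 parts and len(line) == 48, so it is kept.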
def poetrys_and_titles(poetry_file=r'_txt\汇总.txt'):
    """Parse lines shaped like '<num> <poem>—<author>《<title>》' into
    parallel lists of poems and titles."""
    poetrys = []
    titles = []
    pat = re.compile(r'\d* (.*?)—.*?《(.*?)》')
    with open(poetry_file, mode='r', encoding='utf-8') as file:
        for line in file:
            match = pat.match(line)
            if not match:
                continue
            poem, title = match.groups()
            if len(poem) < 5 or len(poem) > 79:
                continue
            poetrys.append(poem)
            titles.append(title)
    return poetrys, titles
def get_mapping(sortdict):
    """Build a character -> index map from (char, count) pairs; more frequent
    characters get smaller indices."""
    words, _ = zip(*sortdict)
    word_num_map = dict(zip(words, range(len(words))))
    return word_num_map, words
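# For example (toy data, not from the corpus):
#   get_mapping([('月', 3), ('山', 2), (' ', 1)])
#   -> ({'月': 0, '山': 1, ' ': 2}, ('月', '山', ' '))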
def poembatch(poetrys_vector, word_map, batchsize=64):
    """Pad each batch to its longest poem and build next-character targets."""
    batch_size = batchsize
    n_chunk = len(poetrys_vector) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        batches = poetrys_vector[start_index:end_index]
        length = max(map(len, batches))
        # Pad every poem in the batch with the space token up to `length`.
        xdata = np.full((batch_size, length), word_map[' '], np.int32)
        for row in range(batch_size):
            xdata[row, :len(batches[row])] = batches[row]
        # Targets are the inputs shifted left by one position, e.g.
        #   xdata          ydata
        #   [6,2,4,6,9]    [2,4,6,9,9]
        #   [1,4,2,8,5]    [4,2,8,5,5]
        ydata = np.copy(xdata)
        ydata[:, :-1] = xdata[:, 1:]
        x_batches.append(xdata)
        y_batches.append(ydata)
    return x_batches, y_batches, n_chunk
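# A minimal end-to-end sketch of the batching pipeline above (illustration
# only; the _demo name is an assumption, and it requires the default corpus
# file _txt\全唐诗.txt to exist):
def _demo_poembatch():
    poetrys = getpoetry()
    count_pairs = count_char(poetrys)
    word_num_map, _ = get_mapping(count_pairs)
    # Convert each poem to a list of vocabulary indices.
    poetrys_vector = [[word_num_map[w] for w in poetry] for poetry in poetrys]
    x_batches, y_batches, n_chunk = poembatch(poetrys_vector, word_num_map)
    print(n_chunk, x_batches[0].shape)  # (64, length of longest poem in batch 0)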
def embedding_variable(inputs, rnn_size, word_len):
    with tf.variable_scope('rnn'):
        # Keep the embedding table on the CPU.
        with tf.device("/cpu:0"):
            # Initialized with 'glorot_uniform_initializer' by default; from the
            # TensorFlow source:
            # If initializer is `None` (the default), the default initializer passed in
            # the variable scope will be used. If that one is `None` too, a
            # `glorot_uniform_initializer` will be used.
            # This creates one rnn_size-length vector per character in the vocabulary.
            embedding = tf.get_variable('embedding', [word_len, rnn_size])
            # Look up the vector for each character index in `inputs`, turning
            # characters into dense vectors: [char] ==> [1] ==> [0,1,0]
            lstm_inputs = tf.nn.embedding_lookup(embedding, inputs)
    return lstm_inputs
def softmax_variable(rnn_size, word_len):
    """Create the output-projection weights that map RNN states to
    per-character logits."""
    with tf.variable_scope('rnn'):
        w = tf.get_variable("w", [rnn_size, word_len])
        b = tf.get_variable("b", [word_len])
    return w, b
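# A minimal sketch of how the two helpers above can be wired into a TF 1.x
# character-level language model. The graph below is an assumption about
# intended use (this module does not build the model itself); BasicLSTMCell
# and dynamic_rnn are standard TF 1.x APIs.
def _demo_rnn_logits(inputs, rnn_size, word_len):
    lstm_inputs = embedding_variable(inputs, rnn_size, word_len)  # [batch, time, rnn_size]
    cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
    outputs, _ = tf.nn.dynamic_rnn(cell, lstm_inputs, dtype=tf.float32)
    output = tf.reshape(outputs, [-1, rnn_size])  # flatten to [batch*time, rnn_size]
    w, b = softmax_variable(rnn_size, word_len)
    return tf.matmul(output, w) + b  # per-character logits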
if __name__ == "__main__":
    # dict1, dict2, dict3 = count_chinese_word('poetry.txt')
    # save_word_frequency(dict1, 'frequency', '.')
    # counts = collections.Counter(get_chinese('poetry.txt'))
    # save_word_frequency(counts, 'counter', '.')
    # print(dict2)
    print(getpoetry())