radical.py
# encoding=utf-8
import re
import csv
import urllib
import urllib2

from bs4 import BeautifulSoup


class Radical(object):
    """Look up the radical of a Chinese character, first in a local CSV
    dictionary and then, as a fallback, on Baidu Hanyu."""

    dictionary_filepath = 'xinhua.csv'
    baiduhanyu_url = 'http://hanyu.baidu.com/zici/s?ptype=zici&wd=%s'

    def __init__(self):
        self.read_dictionary()
        self.origin_len = len(self.dictionary)

    def read_dictionary(self):
        # Load the character -> radical table from the local CSV file.
        self.dictionary = {}
        file_obj = open(self.dictionary_filepath, 'rU')
        reader = csv.reader(file_obj)
        for line in reader:
            self.dictionary[line[0].decode('utf-8')] = line[1].decode('utf-8')
        file_obj.close()

    def write_dictionary(self):
        # Persist the in-memory dictionary back to the CSV file.
        file_obj = open(self.dictionary_filepath, 'wb')
        writer = csv.writer(file_obj)
        for word in self.dictionary:
            # The Python 2 csv module cannot write unicode directly,
            # so encode both fields as UTF-8 byte strings.
            writer.writerow([word.encode('utf-8'), self.dictionary[word].encode('utf-8')])
        file_obj.close()
    def get_radical(self, word):
        # Return the radical for a single character; fall back to Baidu Hanyu
        # when the character is not in the local dictionary.
        word = word.decode('utf-8')
        if word in self.dictionary:
            return self.dictionary[word]
        else:
            return self.get_radical_from_baiduhanyu(word)

    def post_baidu(self, url):
        # Fetch the raw HTML of a Baidu Hanyu entry page.
        print url
        try:
            timeout = 5
            request = urllib2.Request(url)
            # Spoof browser headers so the request looks like a normal visit.
            request.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')
            request.add_header('connection', 'keep-alive')
            request.add_header('referer', url)
            # request.add_header('Accept-Encoding', 'gzip')  # gzip speeds up the transfer but costs CPU
            response = urllib2.urlopen(request, timeout=timeout)
            html = response.read()
            # if response.headers.get('content-encoding', None) == 'gzip':
            #     html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
            response.close()
            return html
        except Exception as e:
            print 'URL Request Error:', e
            return None

    def anlysis_radical_from_html(self, html_doc):
        # The radical sits in the <span> inside the element with id="radical".
        soup = BeautifulSoup(html_doc, 'html.parser')
        li = soup.find(id="radical")
        radical = li.span.contents[0]
        return radical
    def add_in_dictionary(self, word, radical):
        # Append the new entry to the CSV file (UTF-8 encoded) ...
        file_object = open(self.dictionary_filepath, 'a+')
        file_object.write(word.encode('utf-8') + ',' + radical.encode('utf-8') + '\r\n')
        file_object.close()
        # ... then reload the in-memory dictionary.
        self.read_dictionary()

    def get_radical_from_baiduhanyu(self, word):
        # Query Baidu Hanyu for the character and cache the result in memory.
        # Percent-encode the character so the request URL stays ASCII-safe.
        url = self.baiduhanyu_url % urllib.quote(word.encode('utf-8'))
        html = self.post_baidu(url)
        if html is None:
            return None
        radical = self.anlysis_radical_from_html(html)
        if radical is not None:
            self.dictionary[word] = radical
        return radical

    def save(self):
        # Rewrite the CSV only when new radicals were fetched in this session.
        if len(self.dictionary) > self.origin_len:
            self.write_dictionary()


if __name__ == '__main__':
    r = Radical()
    print r.get_radical('棶')
    r.save()
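
# Usage sketch. Radical() assumes a UTF-8 encoded xinhua.csv next to this
# file, with one "character,radical" row per line, for example:
#
#   好,女
#   棶,木
#
# A minimal sketch of using the class from another module, assuming that
# CSV is present:
#
#   from radical import Radical
#   r = Radical()
#   print r.get_radical('好')   # found in the local dictionary
#   print r.get_radical('棶')   # falls back to Baidu Hanyu and is cached
#   r.save()                    # rewrites xinhua.csv with any new entries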