-
Notifications
You must be signed in to change notification settings - Fork 15
/
getAuthors.py
45 lines (35 loc) · 1.23 KB
/
getAuthors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import codecs
from xml.sax import handler, make_parser
# Names of the record-level elements. When the SAX handler in this file sees
# one of these elements close, it terminates the current output line.
paper_tag = (
    'article',
    'inproceedings',
    'proceedings',
    'book',
    'incollection',
    'phdthesis',
    'mastersthesis',
    'www',
)
class mHandler(handler.ContentHandler):
    """SAX content handler that copies ``<author>`` text into ``result``.

    For every ``author`` element the character data is written to
    ``result`` followed by a comma.  When an element whose name is listed
    in ``paper_tag`` closes, ``'\\r\\n'`` is written, so the authors of one
    record end up on one line.
    """

    def __init__(self, result):
        """Remember the output sink and start outside any ``author`` element.

        result: object with a ``write`` method that receives the output.
        """
        # Initialise the base class so its own state is set up.  The
        # explicit call form (rather than super()) also works on Python 2,
        # which the original print statements targeted.
        handler.ContentHandler.__init__(self)
        self.result = result
        # 1 while the parser is inside an <author> element, 0 otherwise.
        self.flag = 0

    def startDocument(self):
        """Announce on stdout that parsing has begun."""
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Document Start')

    def endDocument(self):
        """Announce on stdout that parsing has finished."""
        print('Document End')

    def startElement(self, name, attrs):
        """Begin capturing character data when an ``author`` element opens."""
        if name == 'author':
            self.flag = 1

    def endElement(self, name):
        """Close the current author field and/or the current record line."""
        if name == 'author':
            # Every author, including the last one of a record, is followed
            # by a comma.
            self.result.write(',')
            self.flag = 0
        if name in paper_tag:
            self.result.write('\r\n')

    def characters(self, chrs):
        """Write character data to ``result`` while inside ``author``.

        Each chunk is written as soon as it arrives, so text delivered in
        several calls is still written in full.
        """
        if self.flag:
            self.result.write(chrs)
def parserDblpXml(source, result):
    """Parse ``source`` with SAX and write the extracted authors to ``result``.

    source: passed unchanged to the SAX parser's ``parse`` method.
    result: output object with a ``write`` method; handed to ``mHandler``.
    """
    # Local names chosen so the imported ``handler`` module is not shadowed.
    content_handler = mHandler(result)
    sax_parser = make_parser()
    sax_parser.setContentHandler(content_handler)
    sax_parser.parse(source)
if __name__ == '__main__':
    # Read 'dblp.xml' and write the extracted author lists to 'authors.txt'.
    # The with-statement closes both files (result first, then source) even
    # if opening the output file or parsing raises part-way through.
    with codecs.open('dblp.xml', 'r', 'utf-8') as source, \
            codecs.open('authors.txt', 'w', 'utf-8') as result:
        parserDblpXml(source, result)