# Parse a DBLP XML dump and extract the author names.
# filename:
# author: ivanchou
import codecs, os
import xml.etree.ElementTree as ET
# Tags of the DBLP record elements that are treated as publications.
# NOTE(review): the entries after 'book' were cut off in this file; they are
# filled in from the record element list of dblp.dtd -- confirm.
paper_tag = ('article', 'inproceedings', 'proceedings', 'book',
             'incollection', 'phdthesis', 'mastersthesis', 'www')
class AllEntities:
    """Entity table stand-in that resolves every name to itself.

    Indexing an instance with any key returns that key unchanged, so a
    lookup never raises ``KeyError``.
    """

    def __getitem__(self, key):
        """Return ``key`` unchanged."""
        return key
print ('----------parse begin----------')

# Install AllEntities as the parser's entity table, so every entity name
# looked up through it resolves to itself.
# NOTE(review): presumably this keeps undefined named entities in the dump
# from aborting the parse -- confirm it takes effect on the target Python
# version (the C-accelerated parser may not allow replacing `entity`).
parser = ET.XMLParser()
parser.entity = AllEntities()

# The with-block guarantees the output file is flushed and closed even if
# parsing raises part-way through the input.
with codecs.open('authors', 'w', 'utf-8') as result:
    # Only "end" events are requested: an element's children are complete
    # only once its closing tag has been seen, so reacting to "start" as
    # well could emit the same authors twice or only some of them.
    for _event, article in ET.iterparse('dblp_part.xml', events=("end",), parser=parser):
        for author in article.findall('author'):
            # An empty <author/> element has text None; skip it instead of
            # failing on the string concatenation.
            if author.text:
                result.write(author.text + u'|')
        if article.tag in paper_tag:
            # NOTE(review): the statement under this condition was missing
            # from the file; ending the current row of authors is the
            # assumed intent -- confirm.
            result.write(u'\n')
            # Drop the finished record's children so memory use stays flat
            # on a large dump. Only whole records are cleared, so author
            # elements are still intact when their parent record ends.
            article.clear()

print ('----------parse end----------')
