Skip to content

Instantly share code, notes, and snippets.

@tomonari-masada
Created July 25, 2018 08:09
Show Gist options
  • Save tomonari-masada/1568b8affc124490df85c35c5f461574 to your computer and use it in GitHub Desktop.
Save tomonari-masada/1568b8affc124490df85c35c5f461574 to your computer and use it in GitHub Desktop.
A Python parser for dblp.xml
# -*- coding: utf-8 -*-
from lxml import etree
import os
import sys
from io import TextIOWrapper
from nltk.tokenize import RegexpTokenizer
#
# USAGE:
#
# This code outputs the year and the title of each entry. (If you need author names, you may modify the code.)
#
# 1. Preprocess dblp.xml and make dblp._no_tags_.xml, which is parsed by this code.
# $ cat dblp.xml | sed 's/<i>//g' | sed 's/<\/i>//g' | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed 's/<sub>//g' | sed 's/<\/sub>//g' | sed 's/<tt>//g' | sed 's/<\/tt>//g' > dblp._no_tags_.xml
#
# 2. Make tags.txt, which is read in this code.
# $ cat dblp.xml | awk '{if(substr($1,1,2)=="</"){split($1,a,">");print substr(a[1],3,length(a[1]))}}' | uniq | sort | uniq > tags.txt
#
# 3. Run this code
# $ python dblp_parse.py
#
# Re-wrap the raw stdout byte stream so that everything printed below is
# encoded as UTF-8 (presumably to be independent of the console locale).
sys.stdout = TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Titles are tokenized into maximal runs of word characters (regex \w+).
tokenizer = RegexpTokenizer(r'\w+')

# tags.txt (see step 2 of USAGE) holds one tag name per line; fast_iter()
# checks element tags against this list.
with open('tags.txt') as tags_file:
    collaborations = tags_file.read().splitlines()
def fast_iter(context):
    """Print one ``<year> <title words...>`` line per bibliographic record.

    Walks the ``(event, element)`` pairs produced by ``context`` and remembers
    the most recent non-empty text of ``title`` and ``year`` elements.  When
    an element whose tag appears in the module-level ``collaborations`` list
    (the tag names loaded from tags.txt) is reached, the record is written to
    stdout as the integer year followed by the title's tokens, separated by
    single spaces.  Records missing a title or a year are skipped.

    Args:
        context: Iterable of ``(event, element)`` pairs; the script passes an
            ``lxml.etree.iterparse`` iterator.  Elements must support
            ``tag``, ``text``, ``clear()``, ``getprevious()`` and
            ``getparent()``.

    Notes:
        * A year that is not a valid integer is reported on stderr and the
          record is skipped instead of aborting the whole parse.
        * Pending title/year state is dropped at every record boundary, so a
          field can never leak from an incomplete record into the next one.
        * Output is flushed once when iteration ends (also on error) rather
          than after every record.
    """
    # Built once: the membership test below runs for every parsed element.
    record_tags = frozenset(collaborations)
    title = ''
    year = ''
    try:
        for _event, elem in context:
            tag = elem.tag
            # 'title' and 'year' are always treated as fields of the current
            # record, never as record boundaries, hence the elif chain.
            if tag == 'title':
                if elem.text:
                    title = elem.text
            elif tag == 'year':
                if elem.text:
                    year = elem.text
            elif tag in record_tags:
                if title and year:
                    try:
                        year_number = int(year)
                    except ValueError:
                        print(
                            'skipping record with non-numeric year: '
                            '{!r}'.format(year),
                            file=sys.stderr,
                        )
                    else:
                        # A single print emits "<year> <word> <word> ...\n".
                        print(year_number, *tokenizer.tokenize(title))
                title = ''
                year = ''

            # Release memory as we go: empty the element that was just
            # handled and drop the siblings that precede it.
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                while elem.getprevious() is not None:
                    del parent[0]
    finally:
        sys.stdout.flush()
if __name__ == "__main__":
    # Stream the preprocessed dump (step 1 of USAGE) through fast_iter().
    # load_dtd=True and html=True are passed to the parser -- presumably so
    # that the named character entities used in DBLP resolve; confirm before
    # changing either flag.
    fast_iter(
        etree.iterparse('dblp._no_tags_.xml', load_dtd=True, html=True)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment