Skip to content

Instantly share code, notes, and snippets.

@uchida
Last active March 22, 2024 11:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uchida/7792ec17e152bcea1fea to your computer and use it in GitHub Desktop.
Save uchida/7792ec17e152bcea1fea to your computer and use it in GitHub Desktop.
A simple BibTeX file generator that builds an entry from an arxiv.org URL.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# CC0, dedicated to public domain by Akihiro Uchida
import argparse
import urllib2, os
from HTMLParser import HTMLParser
import re
import calendar
# Pattern for identifiers written as ``arXiv:`` followed by four digits, a
# dot and more digits.  group(1) is everything after ``arXiv:``; group(2)
# and group(3) are the first and second pair of digits, which normalize()
# reads as a two-digit year and a month number respectively.
ARXIV_ID_RE = re.compile(r'arXiv:((\d\d)(\d\d)\.\d+)')
class bibitem(object):
    """A single BibTeX entry: an entry type plus a dict of string fields.

    The citation key is not stored; it is derived from the ``year``,
    ``author`` and ``title`` fields every time it is requested.
    """

    def __init__(self, bibtype):
        """Create an empty entry whose BibTeX type is *bibtype* (a str)."""
        assert isinstance(bibtype, str)
        self.bibtype = bibtype
        # field name -> accumulated string value
        self.field = dict()

    def add(self, dic):
        """Merge the fields of *dic* into this entry.

        A value is appended to whatever is already stored under the same
        field name, so repeated calls concatenate instead of overwriting.
        """
        assert isinstance(dic, dict)
        # items() rather than iteritems(): available on Python 2 and 3.
        for k, v in dic.items():
            self.field[k] = self.field.get(k, '') + v

    def gen_key(self):
        """Return the citation key built from year, authors and title.

        The key is the ``year`` field, then for every author the longest
        word of the name (the first one on ties) with surrounding ``,``
        and ``.`` removed, then the title-cased leading words of the title
        up to and including the first word longer than four characters.
        Missing fields simply contribute nothing.
        """
        key = ''
        if 'year' in self.field:
            key += self.field['year']
        if 'author' in self.field:
            # Split only on the separator word "and" surrounded by
            # whitespace.  A plain split('and') also cuts inside names
            # such as "Alexander" or "Anderson".
            for author in re.split(r'\s+and\s+', self.field['author']):
                words = author.split()
                if not words:
                    # Empty segment (e.g. a leading/trailing separator):
                    # nothing to contribute.
                    continue
                key += max(words, key=len).strip(',.')
        if 'title' in self.field:
            for w in self.field['title'].split():
                key += w.title()
                if len(w) > 4:
                    break
        return key

    def dump(self):
        """Return the entry as BibTeX source text, terminated by a newline.

        Fields whose value is ``''`` or ``None`` are left out.
        """
        head = '@{}{{{}'.format(self.bibtype, self.gen_key())
        body = ''.join(
            ',\n{}={{{}}}'.format(k, v)
            for k, v in self.field.items()
            if v not in ('', None)
        )
        return head + body + '}\n'
class AbstParser(object):
    """Character-level state machine that rewrites abstract text.

    The rewritten text is accumulated in ``self.text``:

    * a pair of ``"`` becomes an opening `` and a closing '' ;
    * a newline becomes a single space;
    * a hyphen directly followed by a space or a newline is removed
      together with that whitespace character; any other hyphen is kept.

    ``self.parse`` holds the handler for the current state.  Each handler
    takes ``(text, i)`` and returns ``(next_handler, next_index)``.  The
    state survives between calls to :meth:`feed`.  A hyphen that is the
    very last character fed is still pending and is not part of
    ``self.text``.
    """

    def __init__(self):
        self.parse = self.parse_main
        self.text = ''

    def feed(self, text):
        """Run *text* through the state machine, appending to ``self.text``."""
        i = 0
        while i < len(text):
            (self.parse, i) = self.parse(text, i)

    def parse_main(self, text, i):
        """Handle one character outside of double quotes."""
        c = text[i]
        if c == '"':
            self.text += '``'
            return (self.parse_quote, i + 1)
        if c == '-':
            # Whether the hyphen is kept depends on the next character.
            return (self.parse_hyphen, i + 1)
        self.text += ' ' if c == '\n' else c
        return (self.parse_main, i + 1)

    def parse_quote(self, text, i):
        """Handle one character inside double quotes."""
        c = text[i]
        if c == '"':
            self.text += '\'\''
            return (self.parse_main, i + 1)
        self.text += ' ' if c == '\n' else c
        return (self.parse_quote, i + 1)

    def parse_hyphen(self, text, i):
        """Handle the character that follows a hyphen seen by parse_main."""
        if text[i] in (' ', '\n'):
            # Hyphen followed by whitespace: drop both characters.
            return (self.parse_main, i + 1)
        self.text += '-'
        # Do not advance: the current character still has to be processed
        # by parse_main.  Advancing here would silently drop it and turn
        # "well-known" into "well-nown".
        return (self.parse_main, i)
def normalize(cls, dic):
    """Convert the raw text stored under *cls* in *dic* to BibTeX fields.

    *cls* is one of the class strings listed in ``some_classes`` and *dic*
    maps such class strings to the text collected for them.  Returns a new
    dict of ``field name -> value``:

    * ``'title mathjax'``    -> ``title`` (surrounding newlines removed)
    * ``'authors'``          -> ``author`` (comma separated names joined
      with ``' and '``, whitespace inside each name collapsed)
    * ``'abstract mathjax'`` -> ``abstract`` (rewritten by ``AbstParser``)
    * ``'tablecell arxivid'``-> ``eprint`` and, when the text contains an
      identifier matching ``ARXIV_ID_RE``, also ``url``, ``year`` and
      ``month``
    * ``'tablecell doi'``    -> ``doi`` and ``doi-url``
    * ``'tablecell X'``      -> ``X`` for any other ``X``

    An empty dict is returned when *dic* has no entry for *cls* (for
    example an element that contained no text).
    """
    if cls not in dic:
        return dict()
    value = dic[cls]
    result = dict()
    if cls == 'title mathjax':
        result['title'] = value.strip('\n')
    elif cls == 'authors':
        # Collapse the whitespace/newlines around each name so the
        # separator is always exactly ' and '.
        names = (' '.join(part.split()) for part in value.split(','))
        result['author'] = ' and '.join(name for name in names if name)
    elif cls == 'abstract mathjax':
        parser = AbstParser()
        parser.feed(value.strip())
        result['abstract'] = parser.text
    elif cls.startswith('tablecell '):
        name = cls[len('tablecell '):]
        if name == 'arxivid':
            result['eprint'] = value
            # search() instead of match(): the cell text may have leading
            # whitespace.  Without a match only ``eprint`` is produced
            # instead of failing on ``None.group``.
            m = ARXIV_ID_RE.search(value)
            if m is not None:
                result['url'] = 'http://arxiv.org/abs/{}'.format(m.group(1))
                result['year'] = '20{}'.format(m.group(2))
                month = int(m.group(3))
                if 1 <= month <= 12:
                    result['month'] = calendar.month_abbr[month]
        elif name == 'doi':
            result[name] = value
            result['doi-url'] = 'http://dx.doi.org/{}'.format(value)
        else:
            result[name] = value
    return result
# Attribute values that mark an HTML element whose text is collected by
# MyHTMLParser and later converted to BibTeX fields by normalize().
some_classes = (
    # main parts of the page
    'title mathjax',
    'authors',
    'abstract mathjax',
    # metadata table cells
    'tablecell comments',
    'tablecell arxivid',
    'tablecell subjects',
    'tablecell jref',
    'tablecell doi',
    'tablecell report-number',
    'tablecell msc-classes',
    'tablecell acm-classes',
)
class MyHTMLParser(HTMLParser):
    """HTML parser that fills a ``bibitem`` from the elements of interest.

    An element is of interest when its ``class`` attribute equals one of
    the strings in ``some_classes``.  While such an element is open its
    text is collected in ``self.tmp``; when it closes, the text is passed
    through ``normalize`` and added to ``self.item``.  Text seen between a
    start tag with ``class="descriptor"`` and the next ``</span>`` is
    ignored.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.item = bibitem('misc')
        # Open elements of interest, innermost last.  Each entry records
        # the tag name, the matched class and how many nested tags of the
        # same name are currently open inside it.
        self.stack = []
        self.in_descriptor = False
        # class string -> text collected for the element being read
        self.tmp = dict()

    def handle_starttag(self, tag, attrs):
        """Start collecting for elements of interest; note descriptors."""
        # Look at the class attribute only: other attributes (title, id,
        # ...) may carry the same strings by coincidence.
        classes = [value for (name, value) in attrs if name == 'class']
        if 'descriptor' in classes:
            self.in_descriptor = True
        wanted = [c for c in classes if c in some_classes]
        if wanted:
            self.stack.append({'tag': tag, 'class': wanted[0], 'depth': 0})
        elif self.stack and tag == self.stack[-1]['tag']:
            # Same tag name nested inside the element being collected:
            # remember it so its end tag does not close the collection.
            self.stack[-1]['depth'] += 1

    def handle_endtag(self, tag):
        """Finish the innermost element of interest when its tag closes."""
        if self.in_descriptor and tag == "span":
            self.in_descriptor = False
        if not self.stack or tag != self.stack[-1]['tag']:
            return
        top = self.stack[-1]
        if top.get('depth', 0) > 0:
            # This end tag belongs to a nested tag of the same name.
            top['depth'] -= 1
            return
        self.stack.pop()
        cls = top['class']
        if cls in self.tmp:
            # Hand over only this element's text and forget it, so a later
            # element with the same class is not prefixed with it again.
            self.item.add(normalize(cls, {cls: self.tmp.pop(cls)}))

    def handle_data(self, data):
        """Append *data* to the text of the innermost element of interest."""
        if self.in_descriptor or not self.stack:
            return
        cls = self.stack[-1]['class']
        self.tmp[cls] = self.tmp.get(cls, '') + data
if __name__ == '__main__':
    # Usage: <script> URL -f FILE
    # Fetches URL, extracts the bibliographic data from the page and
    # appends one BibTeX entry to FILE (created when it does not exist).

    # Route plain-http requests through $http_proxy when it is set;
    # otherwise pass an empty mapping, i.e. use no proxy at all.
    if 'http_proxy' in os.environ:
        proxy = {'http': os.environ['http_proxy']}
    else:
        proxy = {}
    handler = urllib2.ProxyHandler(proxy)
    opener = urllib2.build_opener(handler)

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url', type=str)
    arg_parser.add_argument('-f', '--file', type=str,
                            required=True)
    args = arg_parser.parse_args()

    parser = MyHTMLParser()
    response = opener.open(args.url)
    try:
        parser.feed(response.read())
    finally:
        # Release the connection even when reading or parsing fails.
        response.close()
    # close() makes the parser process any data it is still buffering, so
    # it has to happen before the entry is dumped, not after.
    parser.close()

    fpath = os.path.abspath(args.file)
    # Mode 'a' appends to an existing file and creates a missing one, so
    # no separate existence check is needed.
    with open(fpath, 'a') as f:
        f.write(parser.item.dump())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment