Skip to content

Instantly share code, notes, and snippets.

Created March 20, 2015 10:13
Show Gist options
  • Save librarian/1018ea0a34e581e32784 to your computer and use it in GitHub Desktop.
Save librarian/1018ea0a34e581e32784 to your computer and use it in GitHub Desktop.
import requests
from requests.exceptions import RequestException, BaseHTTPError
import lxml.html
def get_page(url, utf8=False, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch with an HTTP GET.
        utf8: When true, decode the body as UTF-8 instead of the encoding
            that ``requests`` picked for the response.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded response body.

    Raises:
        RequestException, BaseHTTPError: The request itself failed; the
            failure is reported on stdout and then re-raised, because there
            is no response to continue with.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except (RequestException, BaseHTTPError):
        print('Exception occured at http request performing: ' + url)
        raise
    if r.status_code != requests.codes.ok:
        # Best effort: report the unexpected status but still hand back the
        # body, since error pages are HTML too and callers only parse markup.
        print('HTTP status code: ' + str(r.status_code) + '; url: ' + url)
    if utf8:
        r.encoding = 'utf-8'
    return r.text
def get_title(url):
    """Return the stripped text of the first ``<title>`` of the page at *url*.

    The placeholder ``'TODO'`` is returned when the page has no ``<title>``
    element, or when that element carries no text at all (lxml reports the
    text of an empty element as ``None``).
    """
    doc = lxml.html.document_fromstring(get_page(url))
    titles = doc.xpath('//title')
    if not titles or titles[0].text is None:
        return 'TODO'
    return titles[0].text.strip()
def full_url(url, context_url):
    """Resolve a link found on the page *context_url* into an absolute URL.

    Args:
        url: The raw link: a fragment (``#x``), a protocol-relative link
            (``//host/x``), a root-relative link (``/x``), an absolute
            http/https/ftp URL, or a scheme-less relative path such as
            ``smth.html``, ``./smth.html`` or ``../smth.html``.
        context_url: Full URL of the page the link was found on.

    Returns:
        The absolute URL as a string.

    Raises:
        NameError: *url* is empty or uses a scheme other than
            http/https/ftp (for example ``mailto:``).
        ValueError: *context_url* contains no ``:`` and therefore cannot be
            split into scheme and remainder.
    """
    from urllib.parse import urljoin, urlsplit

    # assume 'context_url' is a full url
    proto, tail = context_url.split(':', 1)
    if url.startswith('#'):
        # Replace any fragment already present on the context page.
        return context_url.split('#', 1)[0] + url
    if url.startswith('//'):
        return proto + ':' + url
    if url.startswith('/'):
        host = tail.lstrip('/').split('/', 1)[0]
        context_base = proto + '://' + host
        return context_base.rstrip('/') + '/' + url.lstrip('/')
    if url.startswith(('http://', 'https://', 'ftp://')):
        return url
    if url and not urlsplit(url).scheme:
        # Scheme-less relative path: resolve against the context page.
        return urljoin(context_url, url)
    raise NameError('bad url in \'full_url\':\nurl: ' + url + '\n')
def innerHTML(node, strip=True):
    """Return the markup contained in *node*, without its own start/end tags.

    The result is the node's leading text followed by the serialization of
    every child element (each child's serialization carries its trailing
    text). When *strip* is true, surrounding whitespace is removed.
    """
    parts = [node.text or '']
    parts.extend(
        lxml.html.tostring(child, encoding='unicode') for child in node
    )
    res = ''.join(parts)
    return res.strip() if strip else res
def process_footnotes(s):
    """Append every pending footnote definition to the output and reset the queue.

    Each entry of ``s['footnotes']`` is written to ``s['res']`` as
    ``[^num]: body`` followed by the paragraph separator. Afterwards
    ``s['footnotes']`` is rebound to a fresh empty list.
    """
    sep = s['par_sep']
    s['res'] += ''.join(
        '[^%s]: %s' % (fn['num'], fn['body']) + sep for fn in s['footnotes']
    )
    s['footnotes'] = []
def process_toplevel_a(a, s):
    """Emit a top-level link: heading text on one line, absolute URL on the next.

    The heading is the leading text of the first ``<h1>`` directly inside
    *a*. A missing ``<h1>``, or one without leading text, yields an empty
    heading line instead of raising. The paragraph separator is appended
    after the URL.
    """
    headings = a.xpath('./h1')
    title = (headings[0].text or '').strip() if headings else ''
    s['res'] += title + s['line_break']
    s['res'] += full_url(a.get('href'), context_url=s['base_url'])
    s['res'] += s['par_sep']
def process_a(a, s):
    """Emit an inline link as a Markdown reference link and record its target.

    Writes ``[inner html][N]`` to ``s['res']``, where N is the current
    ``s['ref_counter']``, queues ``{'num': N, 'url': <absolute url>}`` on
    ``s['references']`` so the definition can be written out later, and then
    advances the counter.
    """
    num = s['ref_counter']
    s['res'] += '[%s][%d]' % (innerHTML(a), num)
    ref = {
        'num': num,
        'url': full_url(a.get('href'), context_url=s['base_url']),
    }
    # Without this the reference is numbered in the text but never defined.
    s['references'].append(ref)
    s['ref_counter'] += 1
def ret_span(span, orig_s):
    """Render *span* like a paragraph and return the Markdown as a string.

    The element is processed against a private parser state so that nothing
    is written to ``orig_s['res']``. Link references are the exception: the
    private state continues the caller's ``ref_counter`` and shares its
    ``references`` list, so links inside the fragment get unique numbers and
    their definitions are not lost. Footnotes collected inside the fragment
    are still discarded.

    Args:
        span: Element whose text and children are rendered.
        orig_s: The caller's parser state; ``base_url`` is required,
            ``ref_counter`` and ``references`` are used when present.

    Returns:
        The rendered fragment with surrounding whitespace stripped.
    """
    # TODO: make it in some other way
    s = {
        'base_url': orig_s['base_url'],
        'ref_counter': orig_s.get('ref_counter', 1),
        'fn_counter': 1,
        'par_sep': '\n\n',
        'line_break': '\n',
        'res': '',
        'footnotes': [],
        'references': orig_s.get('references', []),
    }
    process_toplevel_p(span, s)
    if 'ref_counter' in orig_s:
        # Hand the advanced counter back so later links keep counting up.
        orig_s['ref_counter'] = s['ref_counter']
    return s['res'].strip()
def process_span(span, s):
    """Emit a ``<span>``: a footnote marker for ``class="ref"``, raw HTML otherwise.

    A span with class ``ref`` that contains a direct
    ``<span class="refbody">`` child becomes ``[^N]`` in the output; the
    rendered body is queued on ``s['footnotes']`` and ``s['fn_counter']`` is
    advanced. Any other span (including a ``ref`` span without a body) is
    copied through as HTML.

    The span's trailing text is never written here: callers append
    ``child.tail`` themselves after this function returns.
    """
    # TODO: extract number, don't reenum via 'fn_counter'
    bodies = []
    if span.get('class') == 'ref':
        bodies = span.xpath('./span[@class="refbody"]')
    if not bodies:
        # with_tail=False: the caller adds the tail, so including it here
        # would duplicate the text that follows the span.
        s['res'] += lxml.html.tostring(
            span, encoding='unicode', with_tail=False
        )
        return
    s['res'] += '[^%s]' % s['fn_counter']
    footnote = {
        'num': s['fn_counter'],
        'body': ret_span(bodies[0], s),
    }
    s['footnotes'].append(footnote)
    s['fn_counter'] += 1
def _process_inline(node, deep_level, s, markers):
    """Render an emphasis-like element wrapped in a Markdown marker.

    *markers* is a pair ``(even_depth_marker, odd_depth_marker)``; the marker
    alternates with nesting depth so nested emphasis does not reuse the
    delimiter of its parent.
    """
    marker = markers[deep_level % 2]
    s['res'] += marker
    s['res'] += node.text or ''
    for child in node:
        tag = child.tag
        if tag == 'em':
            process_em(child, deep_level + 1, s)
        elif tag == 'strong':
            process_strong(child, deep_level + 1, s)
        elif tag == 'a':
            process_a(child, s)
        elif tag == 'span':
            process_span(child, s)
        else:
            # Unknown tag: pass it through as HTML. The serialization already
            # includes the child's trailing text, so skip the tail below.
            s['res'] += lxml.html.tostring(child, encoding='unicode')
            continue
        s['res'] += child.tail or ''
    s['res'] += marker


def process_strong(strong, deep_level, s):
    """Append *strong* to ``s['res']`` as Markdown bold.

    Uses ``**`` at even nesting depth and ``__`` at odd depth. The element's
    own trailing text is left for the caller to append.
    """
    _process_inline(strong, deep_level, s, ('**', '__'))


def process_em(em, deep_level, s):
    """Append *em* to ``s['res']`` as Markdown emphasis.

    Uses ``*`` at even nesting depth and ``_`` at odd depth. The element's
    own trailing text is left for the caller to append.
    """
    _process_inline(em, deep_level, s, ('*', '_'))
def maybe_formula(par):
    r"""Convert a paragraph that is a single ``\[ ... \]`` formula to ``$$ ... $$``.

    Surrounding whitespace is ignored when detecting the delimiters. Text
    that is not entirely wrapped in ``\[`` and ``\]`` is returned unchanged,
    whitespace included.
    """
    trimmed = par.strip()
    if trimmed.startswith(r'\[') and trimmed.endswith(r'\]'):
        # Drop the two-character delimiters on both sides.
        return '$$ ' + trimmed[2:-2].strip() + ' $$'
    return par
def process_toplevel_p(p, s):
    """Render a paragraph-like element to Markdown and append it to ``s['res']``.

    An element whose ``id`` is ``question`` or ``attribute`` is prefixed with
    ``> `` (a blockquote); a ``question`` additionally gets an empty quoted
    line after its content. The leading text is passed through
    ``maybe_formula``. Children are dispatched by tag: ``em``, ``strong``,
    ``a`` and ``span`` go to their handlers, ``sup`` is rendered recursively
    inside ``<sup>...</sup>``, and any other tag is copied through as HTML.
    The paragraph separator is appended at the end.
    """
    elem_id = p.attrib.get('id')
    is_question = elem_id == 'question'
    is_attribute = elem_id == 'attribute'
    if is_question or is_attribute:
        s['res'] += '> '
    # TODO: check for formula only for entire fragment
    s['res'] += maybe_formula(p.text or '')
    for child in p:
        tag = child.tag
        if tag == 'em':
            process_em(child, 0, s)
        elif tag == 'strong':
            process_strong(child, 0, s)
        elif tag == 'a':
            process_a(child, s)
        elif tag == 'span':
            process_span(child, s)
        elif tag == 'sup':
            s['res'] += '<sup>' + ret_span(child, s) + '</sup>'
        else:
            # Unknown tag: pass it through as HTML. The serialization already
            # includes the child's trailing text, so skip the tail below.
            s['res'] += lxml.html.tostring(child, encoding='unicode')
            continue
        s['res'] += child.tail or ''
    if is_question:
        s['res'] += s['line_break'] + '>' + s['line_break']
    s['res'] += s['par_sep']
def process_toplevel_img(img, s):
    """Emit the Markdown scaffold for a top-level image.

    Writes an image line with a ``TODO`` target and the image's title, an
    empty ``[labels]`` section to be filled in by hand, and a ``render:``
    line pointing at the absolute image URL, followed by the paragraph
    separator. A missing ``title`` attribute produces an empty title.
    """
    url = full_url(img.get('src'), context_url=s['base_url'])
    title = img.get('title') or ''
    line_break = s['line_break']
    s['res'] += '![](TODO "%s")' % title + line_break
    s['res'] += '[labels]' + line_break
    s['res'] += 'TODO' + line_break
    s['res'] += '[/labels]' + line_break
    s['res'] += 'render: ![](%s)' % url + s['par_sep']
def postprocess_references(s):
    """Append a Markdown reference definition for every collected link.

    For each entry of ``s['references']`` the target page is fetched to
    obtain its title, and ``[num]: url "title"`` plus the paragraph separator
    is appended to ``s['res']``. Runs of whitespace in the title (including
    line breaks) are collapsed to single spaces and double quotes are
    backslash-escaped, so the title cannot break out of the definition.
    """
    for r in s['references']:
        title = ' '.join(get_title(r['url']).split()).replace('"', '\\"')
        s['res'] += (
            '[%s]: %s "%s"' % (r['num'], r['url'], title) + s['par_sep']
        )
# Script body: fetch one article page, convert its top-level <a>, <p> and
# <img> children to Markdown, and write the result to a file.

# NOTE(review): empty placeholder - must be set to the article's full URL
# before running.
url = ''
html = get_page(url, utf8=True)
doc = lxml.html.document_fromstring(html)
article = doc.xpath('//body//article')[0]

# Save the article's inner HTML next to the converted output.
with open('a.html', 'w', encoding='utf-8') as f:
    print(innerHTML(article), file=f)

parser_state = {
    'base_url': url,
    'ref_counter': 1,
    'fn_counter': 1,
    'par_sep': '\n\n',
    'line_break': '\n',
    'res': '',
    'footnotes': [],
    'references': [],
}

toplevel_handlers = {
    'a': process_toplevel_a,
    'p': process_toplevel_p,
    'img': process_toplevel_img,
}
for child in article:
    handler = toplevel_handlers.get(child.tag)
    if handler is not None:  # other top-level tags are ignored
        handler(child, parser_state)

# Flush what the handlers queued; otherwise the text would contain footnote
# and link markers without any definitions.
# NOTE(review): placement and order (footnotes, then references) are an
# assumption - confirm against the intended output layout.
process_footnotes(parser_state)
postprocess_references(parser_state)

# NOTE(review): empty placeholder - must be set to the output file path
# before running.
with open('', 'w', encoding='utf-8') as f:
    print(parser_state['res'].rstrip(), file=f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment