-
-
Save gka/5476f61bfb311447eac237192d4f71da to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from collections import OrderedDict | |
import yaml | |
import moment | |
from _html2text import html2text | |
import re | |
from urllib.request import urlretrieve | |
from urllib.error import HTTPError | |
from os.path import basename, splitext, exists | |
# Matches the src attribute of a PNG/GIF/JPEG hosted on vis4.net (with or
# without the "www." prefix). Group 1 captures the absolute image URL.
# A raw string keeps the regex escapes intact, and the dot in "vis4.net" is
# escaped so that it only matches a literal dot.
re_img = re.compile(
    r'src=[\'"](https?://(?:www\.)?vis4\.net/[^\'"]*\.(?:png|gif|jpe?g))[\'"]'
)
def represent_ordereddict(dumper, data):
    """Represent an ``OrderedDict`` as a plain YAML mapping node.

    Each key and value is converted with ``dumper.represent_data`` and the
    resulting pairs are emitted in the dict's own iteration order.
    """
    pairs = [
        (dumper.represent_data(key), dumper.represent_data(val))
        for key, val in data.items()
    ]
    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', pairs)
yaml.add_representer(OrderedDict, represent_ordereddict)

# Load the whole export once. The context manager closes the handle, and the
# explicit encoding keeps the result independent of the machine's locale.
with open('vis4.xml', encoding='utf-8') as export_file:
    data = export_file.read()
# Every chunk after the first starts right behind an opening <item> tag.
parts = data.split('<item>')
# Local image file name -> URL it was downloaded from.
img_names = {}
def downloadImages(item):
    """Mirror the vis4.net images referenced in *item* and relink them.

    Every image URL matched by ``re_img`` is downloaded to
    ``source/images/old/`` (unless a file of that name already exists) and
    its occurrences in *item* are replaced by the local ``/images/old/``
    path. URLs whose download fails are left untouched.

    item -- raw XML/HTML text of one export item.
    Returns the text with the successfully mirrored URLs rewritten.
    """
    for img in re_img.finditer(item):
        img_url = img.group(1)
        img_name = basename(img_url)
        # splitext() keeps the leading dot in ext, so it must not be re-added.
        fn, ext = splitext(img_name)
        cnt = 1
        # Two different URLs may share a base name; pick a fresh name until
        # it is either unused or already assigned to this very URL.
        while img_name in img_names and img_names[img_name] != img_url:
            cnt += 1
            img_name = '{}_{}{}'.format(fn, cnt, ext)
        local_path = 'source/images/old/' + img_name
        if not exists(local_path):
            print('downloading {}...'.format(img_url))
            try:
                urlretrieve(img_url, local_path)
            except Exception as e:
                # Best effort: report the failure and keep the remote URL.
                # Unlike a bare except, this lets Ctrl-C through.
                print(img_url, e)
                continue
        # Record the assignment so later collisions are actually detected.
        img_names[img_name] = img_url
        item = item.replace(img_url, '/images/old/' + img_name)
    return item
def getComments(soup):
    """Render the approved comments found in *soup* as a Markdown section.

    Each comment whose ``comment_approved`` text is ``'1'`` becomes an
    "author (date)" line followed by its content as a block quote. Returns
    an empty string when nothing was approved, otherwise the rendered
    comments below a "# Comments" heading.
    """
    pieces = []
    for comment in soup.find_all('comment'):
        if comment.find('comment_approved').text != '1':
            continue
        author = comment.find('comment_author').text
        content = comment.find('comment_content').text
        date = comment.find('comment_date').text
        nice_date = moment.date(date).format('MMM D, YYYY')
        pieces.append('\n{} ({})\n'.format(author, nice_date))
        pieces.extend('> ' + line + '\n' for line in content.split('\n'))
    body = ''.join(pieces)
    if body == '':
        return body
    return '\n\n\n# Comments\n\n' + body
def writePost(slug, meta, content, comments=''):
    """Write one post to ``source/_posts/<slug>.md``.

    The file consists of a YAML front-matter block built from *meta*,
    followed by *content* and then *comments*.

    slug     -- file name (without extension) of the post.
    meta     -- mapping dumped as YAML front matter.
    content  -- Markdown body of the post.
    comments -- optional pre-rendered comment section.
    """
    # Render first, so a failing yaml.dump() cannot leave a truncated file.
    out = '---\n{}\n---\n\n{}\n\n{}'.format(yaml.dump(meta), content, comments)
    # The context manager closes the handle even if the write fails.
    with open('source/_posts/' + slug + '.md', 'w', encoding='utf-8') as f:
        f.write(out)
# Apache redirect rules from the old post URLs to the new dated ones.
redirects = []

for part in parts[1:]:
    # Cut the chunk at its closing tag; whatever follows belongs to the next
    # item or to the feed footer.
    part = '<item>' + part[:part.index('</item')] + '</item>'
    # download images
    part = downloadImages(part)
    item = BeautifulSoup(part, 'xml')
    status = item.find('status')
    # NOTE(review): the original also stored a second item.find('status') in
    # an unused variable named `ptype`; a post_type filter was presumably
    # intended but never written, so every published item is still exported.
    if status and status.text == 'publish':
        title = item.find('title').text
        date = item.find('post_date').text
        slug = item.find('post_name').text
        content = html2text(item.find('encoded').text)
        comments = getComments(item)
        tags = [cat.text.lower() for cat in item.find_all('category')]
        meta = OrderedDict(title=title, date=date, tags=tags, author='Gregor Aisch')
        meta['alias'] = 'blog/posts/' + slug
        old_url = '/blog/posts/' + slug
        new_url = '{}/{}'.format(moment.date(date).format('YYYY/MM'), slug)
        redirects.append('Redirect permanent {} https://www.vis4.net/blog/{}'.format(old_url, new_url))
        writePost(slug, meta, content, comments)

# Close the handle deterministically instead of relying on garbage collection.
with open('redirects.txt', 'w') as redirects_file:
    redirects_file.write('\n'.join(redirects))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""html2text: Turn HTML into equivalent Markdown-structured text.""" | |
__version__ = "3.200.3" | |
__author__ = "Aaron Swartz (me@aaronsw.com)" | |
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." | |
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] | |
# TODO: | |
# Support decoded entities with unifiable. | |
# Compatibility shim: if the True builtin is missing (NameError), define
# True/False on __builtins__ as the integers 1 and 0.
try:
    True
except NameError:
    setattr(__builtins__, 'True', 1)
    setattr(__builtins__, 'False', 0)
def has_key(x, y):
    """Return whether container *x* holds key *y*.

    Uses ``x.has_key(y)`` when the object offers that method and falls back
    to the ``in`` operator otherwise.
    """
    if not hasattr(x, 'has_key'):
        return y in x
    return x.has_key(y)
def xrange(x):
    """Return an iterator over the integers 0 .. x-1."""
    numbers = range(x)
    return iter(numbers)
try: | |
import htmlentitydefs | |
import urlparse | |
import HTMLParser | |
except ImportError: #Python3 | |
import html.entities as htmlentitydefs | |
import urllib.parse as urlparse | |
import html.parser as HTMLParser | |
try: #Python3 | |
import urllib.request as urllib | |
except: | |
import urllib | |
import optparse, re, sys, codecs, types | |
try: from textwrap import wrap | |
except: pass | |
import sys | |
PY2 = sys.version_info[0] == 2 | |
strtype = unicode if PY2 else str | |
# Use Unicode characters instead of their ASCII pseudo-replacements
UNICODE_SNOB = 0 | |
# Escape all special characters. Output is less readable, but avoids corner case formatting issues. | |
ESCAPE_SNOB = 0 | |
# Put the links after each paragraph instead of at the end. | |
LINKS_EACH_PARAGRAPH = 0 | |
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) | |
BODY_WIDTH = 0 | |
# Don't show internal links (href="#local-anchor") -- corresponding link targets | |
# won't be visible in the plain text file anyway. | |
SKIP_INTERNAL_LINKS = True | |
# Use inline, rather than reference, formatting for images and links | |
INLINE_LINKS = True | |
# Number of pixels Google indents nested lists | |
GOOGLE_LIST_INDENT = 36 | |
IGNORE_ANCHORS = False | |
IGNORE_IMAGES = False | |
IGNORE_EMPHASIS = False | |
### Entity Nonsense ### | |
def name2cp(k):
    """Return the code point of the HTML entity named *k*.

    ``apos`` is special-cased. Every other name is looked up in
    ``htmlentitydefs`` and raises ``KeyError`` when unknown.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # Fallback for Pythons before 2.3: decode the entitydefs entry.
        entity = htmlentitydefs.entitydefs[k]
        if entity.startswith("&#") and entity.endswith(";"):
            return int(entity[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(entity)[0])
    return htmlentitydefs.name2codepoint[k]
# Entity name -> ASCII stand-in used when unicode_snob is off.
unifiable = {
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->', 'larr': '<-',
    'middot': '*', 'ndash': '-', 'oelig': 'oe', 'aelig': 'ae',
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a', 'atilde': 'a', 'auml': 'a',
    'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
    'lrm': '', 'rlm': '',
}

# The same table keyed by code point, for numeric character references.
unifiable_n = {}
for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]
### End Entity Nonsense ### | |
def onlywhite(line):
    """Return True if *line* is non-empty and holds only spaces and tabs.

    An empty string yields False, matching the falsy result the previous
    implementation produced for it.

    NOTE(review): the original compared each character by identity against
    two literals that both read as a single space here; the second one was
    presumably a tab before the whitespace got mangled. Confirm.
    """
    return bool(line) and not line.strip(' \t')
def hn(tag):
    """Return the heading level (1-9) for tags of the form ``h<digit>``.

    A two-character tag starting with ``h`` whose second character is not a
    number (e.g. ``hr``) yields 0. Everything else, including ``h0``,
    yields None.
    """
    if not (tag[0] == 'h' and len(tag) == 2):
        return None
    try:
        level = int(tag[1])
    except ValueError:
        return 0
    if 1 <= level <= 9:
        return level
    return None
def dumb_property_dict(style):
    """Parse a CSS declaration list into a ``{property: value}`` dict.

    Declarations are separated by ``;`` and pieces without a ``:`` are
    ignored. Names and values are stripped, and a later duplicate wins.
    """
    pairs = []
    for declaration in style.split(';'):
        if ':' not in declaration:
            continue
        name, value = declaration.split(':', 1)
        pairs.append((name.strip(), value.strip()))
    return dict(pairs)
def dumb_css_parser(data):
    """Parse a stylesheet into ``{selector: {property: value}}``.

    ``@import`` statements are removed first. If any rule cannot be split
    into exactly a selector and a body, the whole result is an empty dict.
    """
    # Drop every "@import ...;" statement. The appended ';' guarantees that
    # a terminator is found even for an unterminated trailing import.
    data += ';'
    while True:
        at = data.find('@import')
        if at == -1:
            break
        terminator = data.find(';', at)
        data = data[0:at] + data[terminator + 1:]

    # Parse the CSS. A list of pairs is used instead of a dict comprehension
    # in order to support older Pythons.
    rules = [chunk.split('{') for chunk in data.split('}') if '{' in chunk.strip()]
    try:
        return dict([(selector.strip(), dumb_property_dict(body))
                     for selector, body in rules])
    except ValueError:
        return {}  # not that important
def element_style(attrs, style_def, parent_style):
    """Return the effective style dict of an element.

    Starts from a copy of *parent_style*, layers on the rules of every CSS
    class listed in ``attrs['class']`` (looked up in *style_def* under
    ``'.<class>'``), then applies the inline ``style`` attribute.

    Classes without a rule in *style_def* are ignored, as are valueless
    ``class`` / ``style`` attributes (which the parser reports as None).
    """
    style = parent_style.copy()
    if 'class' in attrs:
        for css_class in (attrs['class'] or '').split():
            # .get(): a class may be used without ever being defined.
            style.update(style_def.get('.' + css_class, {}))
    if 'style' in attrs and attrs['style']:
        immediate_style = dumb_property_dict(attrs['style'])
        style.update(immediate_style)
    return style
def google_list_style(style):
    """Return ``'ul'`` or ``'ol'`` depending on the list's CSS bullet type.

    A ``list-style-type`` of disc, circle, square or none means an
    unordered list. Any other value, or no value at all, means ordered.
    """
    bullet_types = ['disc', 'circle', 'square', 'none']
    if 'list-style-type' in style and style['list-style-type'] in bullet_types:
        return 'ul'
    return 'ol'
def google_has_height(style):
    """Return True when the style explicitly defines a ``height``."""
    return 'height' in style
def google_text_emphasis(style):
    """Return the element's emphasis modifiers as a list.

    The values of ``text-decoration``, ``font-style`` and ``font-weight``
    are collected in that order, skipping properties that are absent.
    """
    emphasis_properties = ('text-decoration', 'font-style', 'font-weight')
    return [style[prop] for prop in emphasis_properties if prop in style]
def google_fixed_width_font(style):
    """Return True when the style's font family is Courier New or Consolas."""
    if 'font-family' not in style:
        return False
    family = style['font-family']
    return family == 'Courier New' or family == 'Consolas'
def list_numbering_start(attrs):
    """Return the counter value preceding the list's first item.

    This is ``start - 1`` when *attrs* carries a numeric ``start``
    attribute, and 0 when the attribute is missing, valueless or not a
    number, so that malformed markup cannot abort the conversion.
    """
    if 'start' in attrs:
        try:
            return int(attrs['start']) - 1
        except (TypeError, ValueError):
            # e.g. start="a" or a bare `start` attribute (value None)
            return 0
    return 0
class HTML2Text(HTMLParser.HTMLParser):
    """HTML parser that renders the document it is fed as Markdown text.

    Typical use is ``HTML2Text().handle(html)``. The conversion is tuned
    through the instance attributes that ``__init__`` seeds from the
    module-level defaults (``body_width``, ``inline_links``, ...).
    """

    # Stand-in emitted for &nbsp; and swapped for the real character in
    # close(). It must not contain whitespace: o() collapses and strips
    # whitespace in parsed text, which would make the final replace miss.
    _NBSP_PLACEHOLDER = '&nbsp_place_holder;'

    def __init__(self, out=None, baseurl=''):
        """Initialise the converter.

        out     -- optional callable receiving every output fragment; by
                   default fragments are collected and returned by close().
        baseurl -- base URL that reference-style link targets are joined to.
        """
        # NOTE(review): on Python 3.5+ HTMLParser defaults to
        # convert_charrefs=True, in which case handle_charref() and
        # handle_entityref() are presumably never invoked by the parser.
        # TODO confirm whether convert_charrefs=False is wanted here.
        HTMLParser.HTMLParser.__init__(self)

        # Config options
        self.unicode_snob = UNICODE_SNOB
        self.escape_snob = ESCAPE_SNOB
        self.links_each_paragraph = LINKS_EACH_PARAGRAPH
        self.body_width = BODY_WIDTH
        self.skip_internal_links = SKIP_INTERNAL_LINKS
        self.inline_links = INLINE_LINKS
        self.google_list_indent = GOOGLE_LIST_INDENT
        self.ignore_links = IGNORE_ANCHORS
        self.ignore_images = IGNORE_IMAGES
        self.ignore_emphasis = IGNORE_EMPHASIS
        self.google_doc = False
        # Read by handle_emphasis(); without a default here, google_doc mode
        # raised AttributeError unless the caller had set it by hand.
        self.hide_strikethrough = False
        self.ul_item_mark = '*'
        self.emphasis_mark = '_'
        self.strong_mark = '**'

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out

        self.outtextlist = []  # empty list to store output characters before they are "joined"

        try:
            self.outtext = unicode()
        except NameError:  # Python3
            self.outtext = str()

        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.maybe_automatic_link = None
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl

        # Route &nbsp; through the placeholder instead of a plain space.
        # Note that this mutates the shared module-level tables.
        try:
            del unifiable_n[name2cp('nbsp')]
        except KeyError:
            pass
        unifiable['nbsp'] = self._NBSP_PLACEHOLDER

    def feed(self, data):
        """Feed *data* to the parser after defusing split script end tags."""
        data = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, data)

    def handle(self, data):
        """Convert the HTML string *data* and return the Markdown text."""
        self.feed(data)
        self.feed("")
        return self.optwrap(self.close())

    def outtextf(self, s):
        """Default output sink: collect *s* and track a trailing newline."""
        self.outtextlist.append(s)
        if s:
            self.lastWasNL = s[-1] == '\n'

    def close(self):
        """Finish parsing, flush pending output and return the full text."""
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)
        if self.unicode_snob:
            try:
                nbsp = unichr(name2cp('nbsp'))
            except NameError:  # Python3 has no unichr
                nbsp = chr(name2cp('nbsp'))
        else:
            nbsp = strtype(' ')
        self.outtext = self.outtext.replace(strtype(self._NBSP_PLACEHOLDER), nbsp)

        return self.outtext

    def handle_charref(self, c):
        """Parser callback for numeric character references."""
        self.o(self.charref(c), 1)

    def handle_entityref(self, c):
        """Parser callback for named entity references."""
        self.o(self.entityref(c), 1)

    def handle_starttag(self, tag, attrs):
        """Parser callback for opening tags."""
        self.handle_tag(tag, attrs, 1)

    def handle_endtag(self, tag):
        """Parser callback for closing tags."""
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """Return the index in self.a of a link equal to *attrs*.

        Two links are equal when their href matches and their titles are
        either both absent or both present and equal. Returns None when
        *attrs* has no href or no equal link was recorded.
        """
        if not has_key(attrs, 'href'):
            return None

        for i, a in enumerate(self.a):
            match = False

            if has_key(a, 'href') and a['href'] == attrs['href']:
                if has_key(a, 'title') or has_key(attrs, 'title'):
                    if (has_key(a, 'title') and has_key(attrs, 'title') and
                            a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match:
                return i
        return None

    def drop_last(self, nLetters):
        """Remove the last *nLetters* characters from self.outtext.

        NOTE(review): with the default sink, output accumulates in
        self.outtextlist and self.outtext is only filled in close(), so
        during parsing this presumably has nothing to trim. Confirm.
        """
        if not self.quiet:
            self.outtext = self.outtext[:-nLetters]

    def handle_emphasis(self, start, tag_style, parent_style):
        """Emit emphasis marks derived from CSS styles (google_doc mode).

        start        -- true for an opening tag, false for a closing tag.
        tag_style    -- effective style dict of the element.
        parent_style -- effective style dict of its parent.
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
            google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
                self.outtext = self.outtext.rstrip()
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(2)
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                self.o(" ")
            if strikethrough:
                self.quiet -= 1

    def handle_tag(self, tag, attrs, start):
        """Translate one opening (start=1) or closing (start=0) tag.

        attrs is the parser's list of (name, value) pairs, or None for a
        closing tag.
        """
        #attrs = fixattrs(attrs)
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)

        if self.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}
            if start:
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                dummy, attrs, tag_style = self.tag_stack.pop()
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        if hn(tag):
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag) * "#" + ' ')
            else:
                self.inheader = False
                return  # prevent redundant emphasis marks on headers

        if tag in ['p', 'div']:
            if self.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            else:
                self.p()

        # A Markdown hard line break needs two trailing spaces.
        if tag == "br" and start:
            self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start:
                self.quiet += 1
            else:
                self.quiet -= 1

        if tag == "style":
            if start:
                self.style += 1
            else:
                self.style -= 1

        if tag in ["body"]:
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
            self.o(self.emphasis_mark)
        if tag in ['strong', 'b'] and not self.ignore_emphasis:
            self.o(self.strong_mark)
        if tag in ['del', 'strike', 's']:
            if start:
                self.o("<" + tag + ">")
            else:
                self.o("</" + tag + ">")

        if self.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag in ["code", "tt"] and not self.pre:
            self.o('`')  #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                self.abbr_title = None
                self.abbr_data = ''
                if has_key(attrs, 'title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title is not None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a" and not self.ignore_links:
            if start:
                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.maybe_automatic_link = attrs['href']
                else:
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if self.maybe_automatic_link:
                        self.maybe_automatic_link = None
                    elif a:
                        if self.inline_links:
                            self.o("](" + escape_md(a['href']) + ")")
                        else:
                            i = self.previousIndex(a)
                            if i is not None:
                                a = self.a[i]
                            else:
                                self.acount += 1
                                a['count'] = self.acount
                                a['outcount'] = self.outcount
                                self.a.append(a)
                            self.o("][" + str(a['count']) + "]")

        if tag == "img" and start and not self.ignore_images:
            if has_key(attrs, 'src'):
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                self.o("![" + escape_md(alt) + "]")

                if self.inline_links:
                    self.o("(" + escape_md(attrs['href']) + ")")
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        attrs = self.a[i]
                    else:
                        self.acount += 1
                        attrs['count'] = self.acount
                        attrs['outcount'] = self.outcount
                        self.a.append(attrs)
                    self.o("[" + str(attrs['count']) + "]")

        if tag == 'dl' and start:
            self.p()
        if tag == 'dt' and not start:
            self.pbr()
        if tag == 'dd' and start:
            self.o(' ')
        if tag == 'dd' and not start:
            self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if (not self.list) and (not self.lastWasList):
                self.p()
            if start:
                if self.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append({'name': list_style, 'num': numbering_start})
            else:
                if self.list:
                    self.list.pop()
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == 'li':
            self.pbr()
            if start:
                if self.list:
                    li = self.list[-1]
                else:
                    li = {'name': 'ul', 'num': 0}
                if self.google_doc:
                    nest_count = self.google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                # Two spaces per level, so that a nested item is indented
                # past its parent's marker and is parsed as a child.
                self.o("  " * nest_count)  #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul":
                    self.o(self.ul_item_mark + " ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num']) + ". ")
                self.start = 1

        if tag in ["table", "tr"] and start:
            self.p()
        if tag == 'td':
            self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request at least one newline before the next output."""
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        """Request a paragraph break (two newlines) before the next output."""
        self.p_p = 2

    def soft_br(self):
        """Request a line break rendered as a Markdown hard break."""
        self.pbr()
        # Two trailing spaces in front of the newline make the hard break.
        self.br_toggle = '  '

    def o(self, data, puredata=0, force=0):
        """Emit *data* through self.out, applying all pending layout state.

        puredata -- true for document text, whose whitespace is collapsed
                    outside <pre>.
        force    -- true to emit even when *data* is empty; the value
                    'end' additionally flushes link and abbreviation
                    definitions.
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
                data = re.sub(r'\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force:
                return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                if not data.startswith("\n"):  # <pre>stuff...
                    data = "\n" + data

            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote:
                bq += " "

            if self.pre:
                # Indented code blocks need four spaces per level.
                if not self.list:
                    bq += "    "
                #else: list content is already partially indented
                for i in xrange(len(self.list)):
                    bq += "    "
                data = data.replace("\n", "\n" + bq)

            if self.startpre:
                self.startpre = 0
                if self.list:
                    data = data.lstrip("\n")  # use existing initial indentation

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out((self.br_toggle + '\n' + bq) * self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL:
                    self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
                if force == "end":
                    self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out(" [" + str(link['count']) + "]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if has_key(link, 'title'):
                            self.out(" (" + link['title'] + ")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa:
                    self.out("\n")  # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out(" *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1

    def handle_data(self, data):
        """Parser callback for text between tags."""
        if r'\/script>' in data:
            self.quiet -= 1

        if self.style:
            self.style_def.update(dumb_css_parser(data))

        if self.maybe_automatic_link is not None:
            href = self.maybe_automatic_link
            if href == data and self.absolute_url_matcher.match(href):
                self.o("<" + data + ">")
                return
            else:
                self.o("[")
                self.maybe_automatic_link = None

        if not self.code and not self.pre:
            data = escape_md_section(data, snob=self.escape_snob)
        self.o(data, 1)

    def unknown_decl(self, data):
        """Ignore unknown declarations."""
        pass

    def charref(self, name):
        """Return the text for numeric character reference *name*.

        *name* is the reference without ``&#`` and ``;``; a leading x/X
        marks hexadecimal.
        """
        if name[0] in ['x', 'X']:
            c = int(name[1:], 16)
        else:
            c = int(name)

        if not self.unicode_snob and c in unifiable_n.keys():
            return unifiable_n[c]
        else:
            try:
                return unichr(c)
            except NameError:  # Python3
                return chr(c)

    def entityref(self, c):
        """Return the text for named entity *c*.

        Unknown entities are returned verbatim as ``&name;``.
        """
        if not self.unicode_snob and c in unifiable.keys():
            return unifiable[c]
        else:
            try:
                name2cp(c)
            except KeyError:
                return "&" + c + ';'
            else:
                try:
                    return unichr(name2cp(c))
                except NameError:  # Python3
                    return chr(name2cp(c))

    def replaceEntities(self, s):
        """``re.sub`` callback that resolves one matched entity reference."""
        s = s.group(1)
        if s[0] == "#":
            return self.charref(s[1:])
        else:
            return self.entityref(s)

    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

    def unescape(self, s):
        """Replace every entity or character reference in *s* by its text."""
        return self.r_unescape.sub(self.replaceEntities, s)

    def google_nest_count(self, style):
        """Return the list nesting depth implied by ``margin-left``.

        The value's last two characters are dropped before conversion
        (assumes a two-letter unit such as ``px`` -- TODO confirm). The
        result is divided by google_list_indent with floor division so
        that it stays an int and can multiply the indent string.
        """
        nest_count = 0
        if 'margin-left' in style:
            nest_count = int(style['margin-left'][:-2]) // self.google_list_indent
        return nest_count

    def optwrap(self, text):
        """Wrap all paragraphs in the provided text."""
        if not self.body_width:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para):
                    result += "\n".join(wrap(para, self.body_width))
                    # Two trailing spaces mark a hard break; keep it.
                    if para.endswith('  '):
                        result += "  \n"
                        newlines = 1
                    else:
                        result += "\n\n"
                        newlines = 2
                else:
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result
# Recognise the start of an ordered / unordered Markdown list item.
ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped inside other Markdown constructs (see escape_md).
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Characters escaped everywhere when escape_snob is on (see escape_md_section).
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# A number followed by a dot at the start of a line.
md_dot_matcher = re.compile(r"""
    ^ # start of line
    (\s*\d+) # optional whitespace and a number
    (\.) # dot
    (?=\s) # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# A plus sign at the start of a line, followed by whitespace.
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
# A dash at the start of a line, followed by whitespace or another dash.
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
    # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
# Characters that a preceding backslash would escape in Markdown.
slash_chars = r'\`*_{}[]()#+-.!'
# A backslash that sits in front of one of slash_chars.
md_backslash_matcher = re.compile(r'''
    (\\) # match one slash
    (?=[%s]) # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)
def skipwrap(para):
    """Return True when *para* must be emitted as-is instead of re-wrapped.

    Code blocks (four leading spaces or a tab) and list items are never
    wrapped. A paragraph that merely starts with an em-dash ("--") is
    wrapped normally.
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap.
    # The slice para[0:1] keeps an empty string from raising IndexError.
    if para[0:4] == '    ' or para[0:1] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
def wrapwrite(text):
    """Write *text* to standard output encoded as UTF-8.

    The bytes go to ``sys.stdout.buffer`` when that works; on
    ``AttributeError`` they are handed to ``sys.stdout.write`` instead.
    """
    encoded = text.encode('utf-8')
    try:  # Python3
        sys.stdout.buffer.write(encoded)
    except AttributeError:
        sys.stdout.write(encoded)
def html2text(html, baseurl=''):
    """Convert the HTML string *html* to Markdown using a fresh converter.

    baseurl is handed to the ``HTML2Text`` constructor unchanged.
    """
    return HTML2Text(baseurl=baseurl).handle(html)
def unescape(s, unicode_snob=False):
    """Resolve the entity and character references contained in *s*.

    unicode_snob is copied onto the converter that does the work.
    """
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs."""
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    Backslashes in front of escapable characters are always escaped; with
    *snob* set, every character matched by md_chars_matcher_all is escaped
    too. Finally, line starts that would read as an ordered-list number, a
    plus bullet or a dash are escaped.
    """
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # All three line-start matchers use the same replacement template.
    for line_start_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        text = line_start_matcher.sub(r"\1\\\2", text)
    return text
def main():
    """Command-line entry point.

    Converts the HTML read from a file, an http(s) URL or standard input
    to Markdown and writes the result to standard output. The optional
    second positional argument names the input encoding (default utf-8).
    """
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    # The encoding is always a string here: either this default or the
    # second positional argument. The former "encoding is None" detection
    # branches could never run and have been removed.
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            try:
                data = j.read()
            finally:
                j.close()
        else:
            f = open(file_, 'rb')
            try:
                data = f.read()
            finally:
                f.close()
    else:
        data = sys.stdin.read()

    # Files and URLs yield bytes on every Python version, while stdin yields
    # text on Python 3. Decode whatever is not text yet; the parser cannot
    # work on bytes.
    if not isinstance(data, strtype):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    # google_list_indent is the attribute the converter actually reads.
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment