#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "3.1"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
try:
True
except NameError:
setattr(__builtins__, 'True', 1)
setattr(__builtins__, 'False', 0)
def has_key(x, y):
if hasattr(x, 'has_key'): return x.has_key(y)
else: return y in x
try:
import htmlentitydefs
import urlparse
import HTMLParser
except ImportError: #Python3
import html.entities as htmlentitydefs
import urllib.parse as urlparse
import html.parser as HTMLParser
try: #Python3
import urllib.request as urllib
except:
import urllib
import optparse, re, sys, codecs, types
try: from textwrap import wrap
except: pass
# Use Unicode characters instead of their ASCII pseudo-replacements
UNICODE_SNOB = 0
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
# Disabled in this copy: False behaves like 0, i.e. no wrapping.
BODY_WIDTH = False
# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True
# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True
# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36
IGNORE_ANCHORS = False
IGNORE_IMAGES = False
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
'lrm':'', 'rlm':''}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
try:
return unichr(c)
except NameError: #Python3
return chr(c)
def entityref(c):
if not UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c + ';'
else:
try:
return unichr(name2cp(c))
except NameError: #Python3
return chr(name2cp(c))
def replaceEntities(s):
s = s.group(1)
if s[0] == "#":
return charref(s[1:])
else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)
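# A quick sanity check of the entity helpers above (with UNICODE_SNOB = 0,
# "unifiable" entities collapse to their ASCII stand-ins):
#
#     >>> unescape("&copy; 2004 &mdash; caf&eacute;")
#     '(C) 2004 -- cafe'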
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c is not ' ' and c is not ' ':
return c is ' '
return line
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
# Wrapping is disabled in this copy; the original wrapping code below is
# kept but unreachable.
return text
if not BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if para[0] != ' ' and para[0] != '-' and para[0] != '*':
for line in wrap(para, BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
try:
n = int(tag[1])
if n in range(1, 10): return n
except ValueError: return 0
def dumb_property_dict(style):
"""returns a hash of css attributes"""
return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
def dumb_css_parser(data):
"""returns a hash of css selectors, each of which contains a hash of css attributes"""
# remove @import sentences
importIndex = data.find('@import')
while importIndex != -1:
data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
importIndex = data.find('@import')
# parse the css. reverted from dictionary comprehension in order to support older Pythons
elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
return elements
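# For example (a minimal illustration):
#
#     >>> dumb_css_parser("@import url(base.css); .note { color: red }")
#     {'.note': {'color': 'red'}}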
def element_style(attrs, style_def, parent_style):
"""returns a hash of the 'final' style attributes of the element"""
style = parent_style.copy()
if 'class' in attrs:
for css_class in attrs['class'].split():
css_style = style_def['.' + css_class]
style.update(css_style)
if 'style' in attrs:
immediate_style = dumb_property_dict(attrs['style'])
style.update(immediate_style)
return style
def google_list_style(style):
"""finds out whether this is an ordered or unordered list"""
if 'list-style-type' in style:
list_style = style['list-style-type']
if list_style in ['disc', 'circle', 'square', 'none']:
return 'ul'
return 'ol'
def google_nest_count(style):
"""calculate the nesting count of google doc lists"""
nest_count = 0
if 'margin-left' in style:
nest_count = int(style['margin-left'][:-2]) / GOOGLE_LIST_INDENT
return nest_count
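# e.g. a list item exported by Google Docs with 'margin-left: 72px' yields a
# nest_count of 2 at the default GOOGLE_LIST_INDENT of 36 pixels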
def google_has_height(style):
"""check if the style of the element has the 'height' attribute explicitly defined"""
if 'height' in style:
return True
return False
def google_text_emphasis(style):
"""return a list of all emphasis modifiers of the element"""
emphasis = []
if 'text-decoration' in style:
emphasis.append(style['text-decoration'])
if 'font-style' in style:
emphasis.append(style['font-style'])
if 'font-weight' in style:
emphasis.append(style['font-weight'])
return emphasis
def google_fixed_width_font(style):
"""check if the css of the current element defines a fixed width font"""
font_family = ''
if 'font-family' in style:
font_family = style['font-family']
if 'Courier New' == font_family or 'Consolas' == font_family:
return True
return False
def list_numbering_start(attrs):
"""extract numbering from list element attributes"""
if 'start' in attrs:
return int(attrs['start']) - 1
else:
return 0
class _html2text(HTMLParser.HTMLParser):
def __init__(self, out=None, baseurl=''):
HTMLParser.HTMLParser.__init__(self)
if out is None: self.out = self.outtextf
else: self.out = out
self.outtextlist = [] # empty list to store output characters before they are "joined"
try:
self.outtext = unicode()
except NameError: # Python3
self.outtext = str()
self.quiet = 0
self.p_p = 0 # number of newline characters to print before next output
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.code = False
self.br_toggle = ''
self.lastWasNL = 0
self.lastWasList = False
self.style = 0
self.style_def = {}
self.tag_stack = []
self.emphasis = 0
self.drop_white_space = 0
self.inheader = False
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
if options.google_doc:
del unifiable_n[name2cp('nbsp')]
unifiable['nbsp'] = '&nbsp_place_holder;'
def feed(self, data):
data = data.replace("</' + 'script>", "</ignore>")
HTMLParser.HTMLParser.feed(self, data)
def outtextf(self, s):
self.outtextlist.append(s)
if s: self.lastWasNL = s[-1] == '\n'
def close(self):
HTMLParser.HTMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
self.outtext = self.outtext.join(self.outtextlist)
if options.google_doc:
self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ');
return self.outtext
def handle_charref(self, c):
self.o(charref(c), 1)
def handle_entityref(self, c):
self.o(entityref(c), 1)
def handle_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
def handle_endtag(self, tag):
self.handle_tag(tag, None, 0)
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not has_key(attrs, 'href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if has_key(a, 'href') and a['href'] == attrs['href']:
if has_key(a, 'title') or has_key(attrs, 'title'):
if (has_key(a, 'title') and has_key(attrs, 'title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i
def drop_last(self, nLetters):
if not self.quiet:
self.outtext = self.outtext[:-nLetters]
def handle_emphasis(self, start, tag_style, parent_style):
"""handles various text emphases"""
tag_emphasis = google_text_emphasis(tag_style)
parent_emphasis = google_text_emphasis(parent_style)
# handle Google's text emphasis
strikethrough = 'line-through' in tag_emphasis and options.hide_strikethrough
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
fixed = google_fixed_width_font(tag_style) and not \
google_fixed_width_font(parent_style) and not self.pre
if start:
# crossed-out text must be handled before other attributes
# in order not to output qualifiers unnecessarily
if bold or italic or fixed:
self.emphasis += 1
if strikethrough:
self.quiet += 1
if italic:
self.o("_")
self.drop_white_space += 1
if bold:
self.o("**")
self.drop_white_space += 1
if fixed:
self.o('`')
self.drop_white_space += 1
self.code = True
else:
if bold or italic or fixed:
# there must not be whitespace before closing emphasis mark
self.emphasis -= 1
self.space = 0
self.outtext = self.outtext.rstrip()
if fixed:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(1)
self.drop_white_space -= 1
else:
self.o('`')
self.code = False
if bold:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(2)
self.drop_white_space -= 1
else:
self.o("**")
if italic:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(1)
self.drop_white_space -= 1
else:
self.o("_")
# space is only allowed after *all* emphasis marks
if (bold or italic) and not self.emphasis:
self.o(" ")
if strikethrough:
self.quiet -= 1
def handle_tag(self, tag, attrs, start):
#attrs = fixattrs(attrs)
if attrs is None:
attrs = {}
else:
attrs = dict(attrs)
if options.google_doc:
# the attrs parameter is empty for a closing tag. in addition, we
# need the attributes of the parent nodes in order to get a
# complete style description for the current element. we assume
# that google docs export well formed html.
parent_style = {}
if start:
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
tag_style = element_style(attrs, self.style_def, parent_style)
self.tag_stack.append((tag, attrs, tag_style))
else:
dummy, attrs, tag_style = self.tag_stack.pop()
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
if hn(tag):
self.p()
if start:
self.inheader = True
self.o(hn(tag)*"#" + ' ')
else:
self.inheader = False
return # prevent redundant emphasis marks on headers
if tag in ['p', 'div']:
if options.google_doc:
if start and google_has_height(tag_style):
self.p()
else:
self.soft_br()
else:
self.p()
if tag == "br" and start: self.o(" \n")
if tag == "hr" and start:
self.p()
self.o("* * *")
self.p()
if tag in ["head", "style"]:
if start: self.quiet += 1
else: self.quiet -= 1
if tag == "style":
if start: self.style += 1
else: self.style -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag in ['del', 'strike', "script"]:
if start:
self.o("<"+tag+">")
else:
self.o("</"+tag+">")
if options.google_doc:
if not self.inheader:
# handle some font attributes, but leave headers clean
self.handle_emphasis(start, tag_style, parent_style)
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
self.abbr_title = None
self.abbr_data = ''
if has_key(attrs, 'title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a" and not IGNORE_ANCHORS:
if start:
if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
if INLINE_LINKS:
self.o("](" + a['href'] + ")")
else:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + str(a['count']) + "]")
if tag == "img" and start and not IGNORE_IMAGES:
if has_key(attrs, 'src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
if INLINE_LINKS:
self.o("![")
self.o(alt)
self.o("]("+ attrs['href'] +")")
else:
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+ str(attrs['count']) +"]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
# Google Docs create sub lists as top level lists
if (not self.list) and (not self.lastWasList):
self.p()
if start:
if options.google_doc:
list_style = google_list_style(tag_style)
else:
list_style = tag
numbering_start = list_numbering_start(attrs)
self.list.append({'name':list_style, 'num':numbering_start})
else:
if self.list: self.list.pop()
self.lastWasList = True
else:
self.lastWasList = False
if tag == 'li':
self.pbr()
if start:
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
if options.google_doc:
nest_count = google_nest_count(tag_style)
else:
nest_count = len(self.list)
self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o(options.ul_item_mark + " ")
elif li['name'] == "ol":
li['num'] += 1
self.o(str(li['num'])+". ")
self.start = 1
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
if tag in ["iframe"]:
if start:
_tag = "<"+tag
for k, v in attrs.iteritems():
_tag += " "+str(k)+"='"+str(v)+"'"
_tag += ">"
self.o(_tag)
else:
self.o("</"+tag+">")
def pbr(self):
if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2
def soft_br(self):
self.pbr()
self.br_toggle = ' '
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if options.google_doc:
# prevent white space immediately after 'begin emphasis' marks ('**' and '_')
lstripped_data = data.lstrip()
if self.drop_white_space and not (self.pre or self.code):
data = lstripped_data
if lstripped_data != '':
self.drop_white_space = 0
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out((self.br_toggle+'\n'+bq)*self.p_p)
self.space = 0
self.br_toggle = ''
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
if has_key(link, 'title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.outcount += 1
def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1
if self.style:
self.style_def.update(dumb_css_parser(data))
self.o(data, 1)
def unknown_decl(self, data): pass
def wrapwrite(text):
text = text.encode('utf-8')
try: #Python3
sys.stdout.buffer.write(text)
except AttributeError:
sys.stdout.write(text)
def html2text_file(html, out=wrapwrite, baseurl=''):
h = _html2text(out, baseurl)
h.feed(html)
h.feed("")
return h.close()
def html2text(html, baseurl=''):
return optwrap(html2text_file(html, None, baseurl))
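# Library usage (a minimal sketch, assuming this file is importable as
# html2text; it is normally run as a script by the shell driver below):
#
#     import html2text
#     print html2text.html2text("<h1>Hi</h1><p>See <a href='http://example.com/'>this</a>.</p>")
#
# which, with this copy's defaults (inline links, wrapping disabled), prints
# roughly "# Hi" followed by "See [this](http://example.com/)."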
class Storage: pass
options = Storage()
options.google_doc = False
options.ul_item_mark = '*'
if __name__ == "__main__":
baseurl = ''
p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
version='%prog ' + __version__)
p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
default=False, help="convert an html-exported Google Document")
p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
default=False, help="use a dash rather than a star for unordered list items")
p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
default=78, help="number of characters per output line, 0 for no wrap")
p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
default=False, help="hide strike-through text. only relevent when -g is specified as well")
(options, args) = p.parse_args()
# handle options
if options.ul_style_dash:
options.ul_item_mark = '-'
else:
options.ul_item_mark = '*'
BODY_WIDTH = options.body_width
GOOGLE_LIST_INDENT = options.list_indent
# process input
if len(args) > 0:
file_ = args[0]
encoding = None
if len(args) == 2:
encoding = args[1]
if len(args) > 2:
p.error('Too many arguments')
if file_.startswith('http://') or file_.startswith('https://'):
baseurl = file_
j = urllib.urlopen(baseurl)
text = j.read()
if encoding is None:
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
encoding = enc(j.headers, text)[0]
if encoding == 'us-ascii':
encoding = 'utf-8'
data = text.decode(encoding)
else:
data = open(file_, 'rb').read()
if encoding is None:
try:
from chardet import detect
except ImportError:
detect = lambda x: {'encoding': 'utf-8'}
encoding = detect(data)['encoding']
data = data.decode(encoding)
else:
data = sys.stdin.read()
wrapwrite(html2text(data, baseurl))
#!/usr/bin/env python
# encoding: utf-8
"""
tumblr2calepin.py
Created by jet tsang <jetsanix@gmail.com> on 2013-04-15T12:50:22.889342+08:00
"""
import sys
from datetime import datetime
file1=sys.argv[1]
f1 = open(file1, "r")
data = f1.readlines()
f1.close()
y=1
Slug = None
Tags = None
Title = None
for x in data:
x = unicode(x, "utf8")
xsp = x.split("!$$$!")
if "date" == xsp[0]:
d = datetime.strptime(xsp[1].strip(),'%m/%d/%Y %H:%M:%S')
day_string = d.strftime('%Y-%m-%d %H:%M:%S')
day_string_corto = d.strftime('%Y')
theDate = day_string
if "slug" == xsp[0]:
Slug = xsp[1].strip()
if "tag" == xsp[0]:
Tags = xsp[1].strip().replace("|", ",")
if "title" == xsp[0]:
Title = xsp[1].strip()
if "](ht" in Title:
Title = Title[1:Title.lstrip("[").find("]")]
if "content" == x[0:7]:
break
y += 1
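# After the loop, y is the index of the first line after the "content!$$$!"
# marker, so data[y:] is the Markdown body.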
post = 'Date: '+theDate+'\n'
if Slug:
if Slug == "view-on-path":
Slug = d.strftime('%Y%m%d%H%M')
else:
Slug = d.strftime('%Y%m%d%H%M')
post += 'Slug: '+Slug+'\n'
if Tags:
post += 'Tags: '+Tags+'\n'
if Title:
if Title == "view-on-path":
Title = d.strftime('%Y%m%d%H%M')
else:
Title = d.strftime('%Y%m%d%H%M')
post += 'Title: '+Title+'\n'
post += "\n"
markdownfile = (day_string_corto+"-"+Title+".md").replace(" ","-") # replace spaces in the whole file name, not just in ".md"
print " ->", Title
out = open( markdownfile, "w" )
out.write( post.encode("utf-8") )
if data[-1] == data[y]:
out.write( data[y] )
else:
for x in data[y:-1]:
out.write( x )
out.close()
#!/bin/sh
# Created by jet tsang <jetsanix@gmail.com>
case "$1" in
name)
python tumblr_backup.py "$2" # modified from https://github.com/bdoms/tumblr_backup
mkdir -p calepin
cd "$2"/posts/
for n in *.html
do
rm -f temp.txt
echo "==> processing" $n
../../html2text.py "$n" > temp.txt # modified from http://www.aaronsw.com/2002/html2text/
../../tumblr2calepin.py temp.txt
done
mv -f *.md ../../calepin/
;;
*)
echo "E.g: $0 name YourBlogName"
exit 3
esac
#!/usr/bin/env python
# encoding: utf-8
# standard Python library imports
from __future__ import with_statement
import os
import sys
import urllib
import urllib2
from xml.sax.saxutils import escape
from xml.sax import SAXException
import codecs
import imghdr
from collections import defaultdict
import time
import locale
from glob import glob
import re
# extra required packages
import xmltramp
join = os.path.join
# add another JPEG recognizer
# see http://www.garykessler.net/library/file_sigs.html
def test_jpg(h, f):
if h[:3] == '\xFF\xD8\xFF' and h[3] in "\xDB\xE0\xE1\xE2\xE3":
return 'jpg'
imghdr.tests.append(test_jpg)
# variable directory names, will be set in TumblrBackup.backup()
save_folder = ''
image_folder = ''
# constant names
root_folder = os.getcwdu()
post_dir = 'posts'
xml_dir = 'xml'
image_dir = 'images'
archive_dir = 'archive'
theme_dir = 'theme'
backup_css = 'backup.css'
custom_css = 'custom.css'
avatar_base = 'avatar'
blog_name = ''
post_header = ''
post_ext = '.html'
have_custom_css = False
# ensure the right date/time format
try:
locale.setlocale(locale.LC_TIME, '')
except locale.Error:
pass
encoding = 'utf-8'
time_encoding = locale.getlocale(locale.LC_TIME)[1] or encoding
def log(account, s):
if not options.quiet:
if account:
sys.stdout.write('%s: ' % account)
sys.stdout.write(s[:-1] + ' ' * 20 + s[-1:])
sys.stdout.flush()
def mkdir(dir, recursive=False):
if not os.path.exists(dir):
if recursive:
os.makedirs(dir)
else:
os.mkdir(dir)
def path_to(*parts):
return join(save_folder, *parts)
def open_file(open_fn, parts):
if len(parts) > 1:
mkdir(path_to(*parts[:-1]))
return open_fn(path_to(*parts))
def open_text(*parts):
return open_file(
lambda f: codecs.open(f, 'w', encoding, 'xmlcharrefreplace'), parts
)
def open_image(*parts):
return open_file(lambda f: open(f, 'wb'), parts)
def strftime(format, t=None):
if t is None:
t = time.localtime()
return time.strftime(format, t).decode(time_encoding)
def get_api_url(account):
"""construct the tumblr API URL"""
global blog_name
blog_name = account
if '.' not in account:
blog_name += '.tumblr.com'
base = 'http://' + blog_name + '/api/read'
if options.private:
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, base, '', options.private)
auth_manager = urllib2.HTTPBasicAuthHandler(password_manager)
opener = urllib2.build_opener(auth_manager)
urllib2.install_opener(opener)
return base
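# e.g. get_api_url('example') -> 'http://example.tumblr.com/api/read', while a
# custom domain such as 'blog.example.com' is used as-is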
def xmlparse(url, data=None):
for _ in range(10):
try:
resp = urllib2.urlopen(url, data)
except (urllib2.URLError, urllib2.HTTPError) as e:
sys.stderr.write('%s getting %s\n' % (e, url))
continue
if resp.info().gettype() == 'text/xml':
break
else:
return None
xml = resp.read()
try:
doc = xmltramp.parse(xml)
except SAXException as e:
sys.stderr.write('%s %r\n\n%r\n\n%s\n' % (resp.info().gettype(), resp.msg, e, xml))
return None
return doc if doc._name == 'tumblr' else None
def save_image(image_url):
"""saves an image if not saved yet, returns the local file name"""
image_filename = image_url.split('/')[-1]
glob_filter = '' if '.' in image_filename else '.*'
# check if a file with this name already exists
image_glob = glob(join(image_folder, image_filename + glob_filter))
if image_glob:
return os.path.split(image_glob[0])[1]
# download the image data
try:
image_response = urllib2.urlopen(image_url)
except urllib2.HTTPError:
# return the original URL
return image_url
image_data = image_response.read()
image_response.close()
# determine the file type if it's unknown
if '.' not in image_filename:
image_type = imghdr.what(None, image_data[:32])
if image_type:
image_filename += '.' + image_type.replace('jpeg', 'jpg')
# save the image
with open_image(image_dir, image_filename) as image_file:
image_file.write(image_data)
return image_filename
def save_style():
with open_text(backup_css) as css:
css.write('''\
body { width: 720px; margin: 0 auto; }
img { max-width: 720px; }
blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; }
.archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; }
.post a.llink { display: none; }
.meta a { text-decoration: none; }
.avatar { float: right; }
''')
def header(heading, title='', body_class='', subtitle='', avatar=''):
root_rel = '' if body_class == 'index' else '../'
css_rel = root_rel + (custom_css if have_custom_css else backup_css)
if body_class:
body_class = ' class=' + body_class
h = u'''<!DOCTYPE html>
<meta charset=%s>
<title>%s</title>
<link rel=stylesheet href=%s>
<body%s>
''' % (encoding, heading, css_rel, body_class)
if avatar:
h += '<img src=%s%s/%s alt=Avatar class=avatar>\n' % (root_rel, theme_dir, avatar)
if title:
h += u'<h1>%s</h1>\n' % title
if subtitle:
h += u'<p class=subtitle>%s</p>\n' % subtitle
return h
def get_avatar():
try:
resp = urllib2.urlopen('http://api.tumblr.com/v2/blog/%s/avatar' % blog_name)
avatar_data = resp.read()
except:
return
avatar_file = avatar_base + '.' + imghdr.what(None, avatar_data[:32])
with open_image(theme_dir, avatar_file) as f:
f.write(avatar_data)
class TumblrBackup:
def __init__(self):
self.total_count = 0
def build_index(self):
for f in glob(path_to(post_dir, '*.html')):
post = LocalPost(f)
self.index[post.tm.tm_year][post.tm.tm_mon].append(post)
def save_index(self):
f = glob(path_to(theme_dir, avatar_base + '.*'))
avatar = os.path.split(f[0])[1] if f else None
with open_text('index.html') as idx:
idx.write(header(self.title, self.title, body_class='index',
subtitle=self.subtitle, avatar=avatar
))
for year in sorted(self.index.keys(), reverse=options.reverse_index):
self.save_year(idx, year)
idx.write('<p>Generated on %s.</p>\n' % strftime('%x %X'))
def save_year(self, idx, year):
idx.write('<h3>%s</h3>\n<ul>\n' % year)
for month in sorted(self.index[year].keys(), reverse=options.reverse_index):
tm = time.localtime(time.mktime([year, month, 3, 0, 0, 0, 0, 0, -1]))
month_name = self.save_month(year, month, tm)
idx.write(' <li><a href=%s/%s title="%d post(s)">%s</a></li>\n' % (
archive_dir, month_name, len(self.index[year][month]),
strftime('%B', tm)
))
idx.write('</ul>\n\n')
def save_month(self, year, month, tm):
file_name = '%d-%02d.html' % (year, month)
with open_text(archive_dir, file_name) as arch:
arch.write('\n\n'.join([
header(self.title, strftime('%B %Y', tm), body_class='archive'),
'\n'.join(p.get_post() for p in sorted(
self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month
)),
'<p><a href=../ rel=contents>Index</a></p>\n'
]))
return file_name
def backup(self, account):
"""makes single files and an index for every post on a public Tumblr blog account"""
base = get_api_url(account)
# make sure there are folders to save in
global save_folder, image_folder, post_ext, post_dir, have_custom_css
if options.blosxom:
save_folder = root_folder
post_ext = '.txt'
post_dir = os.curdir
post_class = BlosxomPost
else:
save_folder = join(root_folder, account)
image_folder = path_to(image_dir)
post_class = TumblrPost
have_custom_css = os.access(path_to(custom_css), os.R_OK)
mkdir(save_folder, True)
self.post_count = 0
# prepare the period start and end timestamps
if options.period:
i = 0; tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
if len(options.period) >= 6:
i = 1; tm[1] = int(options.period[4:6])
if len(options.period) == 8:
i = 2; tm[2] = int(options.period[6:8])
p_start = time.mktime(tm)
tm[i] += 1
p_stop = time.mktime(tm)
# get the highest post id already saved
ident_max = None
if options.incremental:
try:
ident_max = max(
long(os.path.splitext(os.path.split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
log(account, "Backing up posts after %d\r" % ident_max)
except ValueError: # max() arg is an empty sequence
pass
else:
log(account, "Getting basic information\r")
# start by calling the API with just a single post
soup = xmlparse(base + '?num=1')
if not soup:
return
# collect all the meta information
tumblelog = soup.tumblelog
try:
self.title = escape(tumblelog('title'))
except KeyError:
self.title = account
self.subtitle = unicode(tumblelog)
# use the meta information to create an HTML header
global post_header
post_header = header(self.title, body_class='post')
# find the total number of posts
total_posts = options.count or int(soup.posts('total'))
last_post = options.skip + total_posts
def _backup(posts):
for p in sorted(posts, key=lambda x: long(x('id')), reverse=True):
post = post_class(p)
if ident_max and long(post.ident) <= ident_max:
return False
if options.period:
if post.date >= p_stop:
continue
if post.date < p_start:
return False
post.generate_content()
if post.error:
sys.stderr.write('%s%s\n' % (post.error, 50 * ' '))
post.save_post()
self.post_count += 1
return True
# Get the XML entries from the API, which returns at most 50 posts per request.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
MAX = 50
for i in range(options.skip, last_post, MAX):
# find the upper bound
j = min(i + MAX, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, total_posts))
soup = xmlparse('%s?num=%d&start=%d' % (base, j - i, i))
if soup is None:
return
if not _backup(soup.posts['post':]):
break
if not options.blosxom and self.post_count:
get_avatar()
if not have_custom_css:
save_style()
self.index = defaultdict(lambda: defaultdict(list))
self.build_index()
self.save_index()
log(account, "%d posts backed up\n" % self.post_count)
self.total_count += self.post_count
class TumblrPost:
def __init__(self, post):
self.content = ''
self.post = post
self.xml_content = post.__repr__(1, 1)
self.ident = post('id')
self.url = post('url')
self.slug = post('slug')
self.typ = post('type')
self.date = int(post('unix-timestamp'))
self.tm = time.localtime(self.date)
self.title = ''
self.tags = []
self.file_name = self.ident + post_ext
self.error = None
def generate_content(self):
"""generates the content for this post"""
post = self.post
content = []
def append(s, fmt=u'%s'):
# the %s conversion calls unicode() on the xmltramp element
content.append(fmt % s)
def get_try(elt):
try:
return unicode(post[elt])
except KeyError:
return ''
def append_try(elt, fmt=u'%s'):
elt = get_try(elt)
if elt:
append(elt, fmt)
if self.typ == 'regular':
self.title = get_try('regular-title')
append_try('regular-body')
elif self.typ == 'photo':
url = escape(get_try('photo-link-url'))
for p in post.photoset['photo':] if hasattr(post, 'photoset') else [post]:
src = unicode(p['photo-url'])
append(escape(self.get_image_url(src)), u'<img alt="" src="%s">')
if url:
content[-1] = '<a href="%s">%s</a>' % (url, content[-1])
content[-1] = '<p>' + content[-1] + '</p>'
if p._name == 'photo' and p('caption'):
append(p('caption'), u'<p>%s</p>')
append_try('photo-caption')
elif self.typ == 'link':
url = unicode(post['link-url'])
self.title = u'<a href="%s">%s</a>' % (escape(url),
post['link-text'] if 'link-text' in post else url
)
append_try('link-description')
elif self.typ == 'quote':
append(post['quote-text'], u'<blockquote><p>%s</p></blockquote>')
append_try('quote-source', u'<p>%s</p>')
elif self.typ == 'video':
source = unicode(post['video-source']).strip()
if source.startswith('<iframe') or source.startswith('<object'):
append(source, u'<p>%s</p>')
append_try('video-caption')
else:
append(post['video-player'], u'<p>%s</p>')
append_try('video-caption')
append(escape(source), u'<p><a href="%s">Original</a></p>')
elif self.typ == 'audio':
append(post['audio-player'])
append_try('audio-caption')
elif self.typ == 'answer':
self.title = post.question
append(post.answer)
elif self.typ == 'conversation':
self.title = get_try('conversation-title')
append(
'<br>\n'.join(escape(unicode(l)) for l in post.conversation['line':]),
u'<p>%s</p>'
)
else:
self.error = u"Unknown post type '%s' in post #%s" % (self.typ, self.ident)
append(escape(self.xml_content), u'<pre>%s</pre>')
self.tags = [u'%s' % t for t in post['tag':]]
self.content = '\n'.join(content)
# fix wrongly nested HTML tags
for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'):
self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content)
def get_image_url(self, url):
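# Image localization is disabled in this copy; the original download logic is
# kept below for reference but never runs.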
return url
#url = save_image(url)
#if '://' in url: # in case of download errors
# return url
#return u'../%s/%s' % (image_dir, url)
def get_post(self):
"""returns this post in HTML"""
post = 'date!$$$!%s<br>\n' % strftime('%x %X', self.tm)
if self.slug:
post += u'slug!$$$!%s<br>\n' % self.slug
if self.tags:
post += u'tag!$$$!%s<br>\n' % u'|'.join(t for t in self.tags)
if self.title:
post += 'title!$$$!%s<br>\n' % self.title
post += 'content!$$$!<br>\n'
post += self.content
return post
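# The "!$$$!" headers above are this gist's custom interchange format:
# html2text.py strips the <br> tags and tumblr2calepin.py turns the result
# into Pelican/Calepin front matter.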
def save_post(self):
"""saves this post locally"""
with open_text(post_dir, self.file_name) as f:
f.write(self.get_post())
os.utime(path_to(post_dir, self.file_name),
(self.date, self.date)
)
if options.xml:
with open_text(xml_dir, self.ident + '.xml') as f:
f.write(self.xml_content)
class BlosxomPost(TumblrPost):
def get_image_url(self, url):
return url
def get_post(self):
"""returns this post as a Blosxom post"""
post = self.title + '\nmeta-id: _' + self.ident + '\nmeta-url: ' + self.url
if self.tags:
post += '\nmeta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags)
post += '\n\n' + self.content
return post
class LocalPost:
def __init__(self, post_file):
with codecs.open(post_file, 'r', encoding) as f:
self.lines = f.readlines()
# remove header and footer
while self.lines and '<article ' not in self.lines[0]:
del self.lines[0]
while self.lines and '</article>' not in self.lines[-1]:
del self.lines[-1]
self.file_name = os.path.split(post_file)[1]
self.ident = os.path.splitext(self.file_name)[0]
self.date = os.stat(post_file).st_mtime
self.tm = time.localtime(self.date)
def get_post(self):
return u''.join(self.lines)
if __name__ == '__main__':
import optparse
parser = optparse.OptionParser("Usage: %prog [options] blog-name ...",
description="Makes a local backup of Tumblr blogs."
)
parser.add_option('-q', '--quiet', action='store_true',
help="suppress progress messages"
)
parser.add_option('-i', '--incremental', action='store_true',
help="incremental backup mode"
)
parser.add_option('-x', '--xml', action='store_true',
help="save the original XML source"
)
parser.add_option('-b', '--blosxom', action='store_true',
help="save the posts in blosxom format"
)
parser.add_option('-r', '--reverse-month', action='store_false', default=True,
help="reverse the post order in the monthly archives"
)
parser.add_option('-R', '--reverse-index', action='store_false', default=True,
help="reverse the index file order"
)
parser.add_option('-a', '--auto', type='int', metavar="HOUR",
help="do a full backup at HOUR hours, otherwise do an incremental backup"
" (useful for cron jobs)"
)
parser.add_option('-n', '--count', type='int', help="save only COUNT posts")
parser.add_option('-s', '--skip', type='int', default=0,
help="skip the first SKIP posts"
)
parser.add_option('-p', '--period', help="limit the backup to PERIOD"
" ('y', 'm', 'd' or YYYY[MM[DD]])"
)
parser.add_option('-P', '--private', help="password for a private tumblr",
metavar='PASSWORD'
)
options, args = parser.parse_args()
if options.auto is not None:
if options.auto == time.localtime().tm_hour:
options.incremental = False
else:
options.incremental = True
if options.period:
try:
options.period = time.strftime(
{'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period]
)
except KeyError:
options.period = options.period.replace('-', '')
if len(options.period) not in (4, 6, 8):
parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
if not args:
args = ['bbolli']
tb = TumblrBackup()
for account in args:
tb.backup(account)
sys.exit(0 if tb.total_count else 1)