#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "3.1"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
try:
True
except NameError:
setattr(__builtins__, 'True', 1)
setattr(__builtins__, 'False', 0)
def has_key(x, y):
if hasattr(x, 'has_key'): return x.has_key(y)
else: return y in x
try:
import htmlentitydefs
import urlparse
import HTMLParser
except ImportError: #Python3
import html.entities as htmlentitydefs
import urllib.parse as urlparse
import html.parser as HTMLParser
try: #Python3
import urllib.request as urllib
except:
import urllib
import optparse, re, sys, codecs, types
try: from textwrap import wrap
except: pass
# Use Unicode characters instead of their ASCII pseudo-replacements
UNICODE_SNOB = 0
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
# Disabled in this copy: False behaves like 0, i.e. no wrapping.
BODY_WIDTH = False
# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True
# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True
# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36
IGNORE_ANCHORS = False
IGNORE_IMAGES = False
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
'lrm':'', 'rlm':''}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
try:
return unichr(c)
except NameError: #Python3
return chr(c)
def entityref(c):
if not UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c + ';'
else:
try:
return unichr(name2cp(c))
except NameError: #Python3
return chr(name2cp(c))
def replaceEntities(s):
s = s.group(1)
if s[0] == "#":
return charref(s[1:])
else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)
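# A quick sanity check of the entity helpers above (with UNICODE_SNOB = 0,
# "unifiable" entities collapse to their ASCII stand-ins):
#
#     >>> unescape("&copy; 2004 &mdash; caf&eacute;")
#     '(C) 2004 -- cafe'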
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c is not ' ' and c is not ' ':
return c is ' '
return line
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
# Wrapping is disabled in this copy; the original wrapping code below is
# kept but unreachable.
return text
if not BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if para[0] != ' ' and para[0] != '-' and para[0] != '*':
for line in wrap(para, BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
try:
n = int(tag[1])
if n in range(1, 10): return n
except ValueError: return 0
def dumb_property_dict(style):
"""returns a hash of css attributes"""
return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
def dumb_css_parser(data):
"""returns a hash of css selectors, each of which contains a hash of css attributes"""
# remove @import sentences
importIndex = data.find('@import')
while importIndex != -1:
data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
importIndex = data.find('@import')
# parse the css. reverted from dictionary comprehension in order to support older Pythons
elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
return elements
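# For example (a minimal illustration):
#
#     >>> dumb_css_parser("@import url(base.css); .note { color: red }")
#     {'.note': {'color': 'red'}}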
def element_style(attrs, style_def, parent_style):
"""returns a hash of the 'final' style attributes of the element"""
style = parent_style.copy()
if 'class' in attrs:
for css_class in attrs['class'].split():
css_style = style_def['.' + css_class]
style.update(css_style)
if 'style' in attrs:
immediate_style = dumb_property_dict(attrs['style'])
style.update(immediate_style)
return style
def google_list_style(style):
"""finds out whether this is an ordered or unordered list"""
if 'list-style-type' in style:
list_style = style['list-style-type']
if list_style in ['disc', 'circle', 'square', 'none']:
return 'ul'
return 'ol'
def google_nest_count(style):
"""calculate the nesting count of google doc lists"""
nest_count = 0
if 'margin-left' in style:
nest_count = int(style['margin-left'][:-2]) / GOOGLE_LIST_INDENT
return nest_count
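# e.g. a list item exported by Google Docs with 'margin-left: 72px' yields a
# nest_count of 2 at the default GOOGLE_LIST_INDENT of 36 pixels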
def google_has_height(style):
"""check if the style of the element has the 'height' attribute explicitly defined"""
if 'height' in style:
return True
return False
def google_text_emphasis(style):
"""return a list of all emphasis modifiers of the element"""
emphasis = []
if 'text-decoration' in style:
emphasis.append(style['text-decoration'])
if 'font-style' in style:
emphasis.append(style['font-style'])
if 'font-weight' in style:
emphasis.append(style['font-weight'])
return emphasis
def google_fixed_width_font(style):
"""check if the css of the current element defines a fixed width font"""
font_family = ''
if 'font-family' in style:
font_family = style['font-family']
if 'Courier New' == font_family or 'Consolas' == font_family:
return True
return False
def list_numbering_start(attrs):
"""extract numbering from list element attributes"""
if 'start' in attrs:
return int(attrs['start']) - 1
else:
return 0
class _html2text(HTMLParser.HTMLParser):
def __init__(self, out=None, baseurl=''):
HTMLParser.HTMLParser.__init__(self)
if out is None: self.out = self.outtextf
else: self.out = out
self.outtextlist = [] # empty list to store output characters before they are "joined"
try:
self.outtext = unicode()
except NameError: # Python3
self.outtext = str()
self.quiet = 0
self.p_p = 0 # number of newline characters to print before next output
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.code = False
self.br_toggle = ''
self.lastWasNL = 0
self.lastWasList = False
self.style = 0
self.style_def = {}
self.tag_stack = []
self.emphasis = 0
self.drop_white_space = 0
self.inheader = False
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
if options.google_doc:
del unifiable_n[name2cp('nbsp')]
unifiable['nbsp'] = '&nbsp_place_holder;'
def feed(self, data):
data = data.replace("</' + 'script>", "</ignore>")
HTMLParser.HTMLParser.feed(self, data)
def outtextf(self, s):
self.outtextlist.append(s)
if s: self.lastWasNL = s[-1] == '\n'
def close(self):
HTMLParser.HTMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
self.outtext = self.outtext.join(self.outtextlist)
if options.google_doc:
self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ');
return self.outtext
def handle_charref(self, c):
self.o(charref(c), 1)
def handle_entityref(self, c):
self.o(entityref(c), 1)
def handle_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
def handle_endtag(self, tag):
self.handle_tag(tag, None, 0)
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not has_key(attrs, 'href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if has_key(a, 'href') and a['href'] == attrs['href']:
if has_key(a, 'title') or has_key(attrs, 'title'):
if (has_key(a, 'title') and has_key(attrs, 'title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i
def drop_last(self, nLetters):
if not self.quiet:
self.outtext = self.outtext[:-nLetters]
def handle_emphasis(self, start, tag_style, parent_style):
"""handles various text emphases"""
tag_emphasis = google_text_emphasis(tag_style)
parent_emphasis = google_text_emphasis(parent_style)
# handle Google's text emphasis
strikethrough = 'line-through' in tag_emphasis and options.hide_strikethrough
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
fixed = google_fixed_width_font(tag_style) and not \
google_fixed_width_font(parent_style) and not self.pre
if start:
# crossed-out text must be handled before other attributes
# in order not to output qualifiers unnecessarily
if bold or italic or fixed:
self.emphasis += 1
if strikethrough:
self.quiet += 1
if italic:
self.o("_")
self.drop_white_space += 1
if bold:
self.o("**")
self.drop_white_space += 1
if fixed:
self.o('`')
self.drop_white_space += 1
self.code = True
else:
if bold or italic or fixed:
# there must not be whitespace before closing emphasis mark
self.emphasis -= 1
self.space = 0
self.outtext = self.outtext.rstrip()
if fixed:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(1)
self.drop_white_space -= 1
else:
self.o('`')
self.code = False
if bold:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(2)
self.drop_white_space -= 1
else:
self.o("**")
if italic:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(1)
self.drop_white_space -= 1
else:
self.o("_")
# space is only allowed after *all* emphasis marks
if (bold or italic) and not self.emphasis:
self.o(" ")
if strikethrough:
self.quiet -= 1
def handle_tag(self, tag, attrs, start):
#attrs = fixattrs(attrs)
if attrs is None:
attrs = {}
else:
attrs = dict(attrs)
if options.google_doc:
# the attrs parameter is empty for a closing tag. in addition, we
# need the attributes of the parent nodes in order to get a
# complete style description for the current element. we assume
# that google docs export well formed html.
parent_style = {}
if start:
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
tag_style = element_style(attrs, self.style_def, parent_style)
self.tag_stack.append((tag, attrs, tag_style))
else:
dummy, attrs, tag_style = self.tag_stack.pop()
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
if hn(tag):
self.p()
if start:
self.inheader = True
self.o(hn(tag)*"#" + ' ')
else:
self.inheader = False
return # prevent redundant emphasis marks on headers
if tag in ['p', 'div']:
if options.google_doc:
if start and google_has_height(tag_style):
self.p()
else:
self.soft_br()
else:
self.p()
if tag == "br" and start: self.o(" \n")
if tag == "hr" and start:
self.p()
self.o("* * *")
self.p()
if tag in ["head", "style"]:
if start: self.quiet += 1
else: self.quiet -= 1
if tag == "style":
if start: self.style += 1
else: self.style -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag in ['del', 'strike', "script"]:
if start:
self.o("<"+tag+">")
else:
self.o("</"+tag+">")
if options.google_doc:
if not self.inheader:
# handle some font attributes, but leave headers clean
self.handle_emphasis(start, tag_style, parent_style)
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
self.abbr_title = None
self.abbr_data = ''
if has_key(attrs, 'title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a" and not IGNORE_ANCHORS:
if start:
if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
if INLINE_LINKS:
self.o("](" + a['href'] + ")")
else:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + str(a['count']) + "]")
if tag == "img" and start and not IGNORE_IMAGES:
if has_key(attrs, 'src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
if INLINE_LINKS:
self.o("![")
self.o(alt)
self.o("]("+ attrs['href'] +")")
else:
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+ str(attrs['count']) +"]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
# Google Docs create sub lists as top level lists
if (not self.list) and (not self.lastWasList):
self.p()
if start:
if options.google_doc:
list_style = google_list_style(tag_style)
else:
list_style = tag
numbering_start = list_numbering_start(attrs)
self.list.append({'name':list_style, 'num':numbering_start})
else:
if self.list: self.list.pop()
self.lastWasList = True
else:
self.lastWasList = False
if tag == 'li':
self.pbr()
if start:
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
if options.google_doc:
nest_count = google_nest_count(tag_style)
else:
nest_count = len(self.list)
self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o(options.ul_item_mark + " ")
elif li['name'] == "ol":
li['num'] += 1
self.o(str(li['num'])+". ")
self.start = 1
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
if tag in ["iframe"]:
if start:
_tag = "<"+tag
for k, v in attrs.iteritems():
_tag += " "+str(k)+"='"+str(v)+"'"
_tag += ">"
self.o(_tag)
else:
self.o("</"+tag+">")
def pbr(self):
if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2
def soft_br(self):
self.pbr()
self.br_toggle = ' '
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if options.google_doc:
# prevent white space immediately after 'begin emphasis' marks ('**' and '_')
lstripped_data = data.lstrip()
if self.drop_white_space and not (self.pre or self.code):
data = lstripped_data
if lstripped_data != '':
self.drop_white_space = 0
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out((self.br_toggle+'\n'+bq)*self.p_p)
self.space = 0
self.br_toggle = ''
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
if has_key(link, 'title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.outcount += 1
def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1
if self.style:
self.style_def.update(dumb_css_parser(data))
self.o(data, 1)
def unknown_decl(self, data): pass
def wrapwrite(text):
text = text.encode('utf-8')
try: #Python3
sys.stdout.buffer.write(text)
except AttributeError:
sys.stdout.write(text)
def html2text_file(html, out=wrapwrite, baseurl=''):
h = _html2text(out, baseurl)
h.feed(html)
h.feed("")
return h.close()
def html2text(html, baseurl=''):
return optwrap(html2text_file(html, None, baseurl))
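# Library usage (a minimal sketch, assuming this file is importable as
# html2text; it is normally run as a script by the shell driver below):
#
#     import html2text
#     print html2text.html2text("<h1>Hi</h1><p>See <a href='http://example.com/'>this</a>.</p>")
#
# which, with this copy's defaults (inline links, wrapping disabled), prints
# roughly "# Hi" followed by "See [this](http://example.com/)."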
class Storage: pass
options = Storage()
options.google_doc = False
options.ul_item_mark = '*'
if __name__ == "__main__":
baseurl = ''
p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
version='%prog ' + __version__)
p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
default=False, help="convert an html-exported Google Document")
p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
default=False, help="use a dash rather than a star for unordered list items")
p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
default=78, help="number of characters per output line, 0 for no wrap")
p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
default=False, help="hide strike-through text. only relevent when -g is specified as well")
(options, args) = p.parse_args()
# handle options
if options.ul_style_dash:
options.ul_item_mark = '-'
else:
options.ul_item_mark = '*'
BODY_WIDTH = options.body_width
GOOGLE_LIST_INDENT = options.list_indent
# process input
if len(args) > 0:
file_ = args[0]
encoding = None
if len(args) == 2:
encoding = args[1]
if len(args) > 2:
p.error('Too many arguments')
if file_.startswith('http://') or file_.startswith('https://'):
baseurl = file_
j = urllib.urlopen(baseurl)
text = j.read()
if encoding is None:
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
encoding = enc(j.headers, text)[0]
if encoding == 'us-ascii':
encoding = 'utf-8'
data = text.decode(encoding)
else:
data = open(file_, 'rb').read()
if encoding is None:
try:
from chardet import detect
except ImportError:
detect = lambda x: {'encoding': 'utf-8'}
encoding = detect(data)['encoding']
data = data.decode(encoding)
else:
data = sys.stdin.read()
wrapwrite(html2text(data, baseurl))
#!/usr/bin/env python
# encoding: utf-8
"""
tumblr2calepin.py
Created by jet tsang <jetsanix@gmail.com> on 2013-04-15T12:50:22.889342+08:00
"""
import sys
from datetime import datetime
file1=sys.argv[1]
f1 = open(file1, "r")
data = f1.readlines()
f1.close()
y=1
Slug = None
Tags = None
Title = None
for x in data:
x = unicode(x, "utf8")
xsp = x.split("!$$$!")
if "date" == xsp[0]:
d = datetime.strptime(xsp[1].strip(),'%m/%d/%Y %H:%M:%S')
day_string = d.strftime('%Y-%m-%d %H:%M:%S')
day_string_corto = d.strftime('%Y')
theDate = day_string
if "slug" == xsp[0]:
Slug = xsp[1].strip()
if "tag" == xsp[0]:
Tags = xsp[1].strip().replace("|", ",")
if "title" == xsp[0]:
Title = xsp[1].strip()
if "](ht" in Title:
Title = Title[1:Title.lstrip("[").find("]")]
if "content" == x[0:7]:
break
y += 1
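# After the loop, y is the index of the first line after the "content!$$$!"
# marker, so data[y:] is the Markdown body.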
post = 'Date: '+theDate+'\n'
if Slug:
if Slug == "view-on-path":
Slug = d.strftime('%Y%m%d%H%M')
else:
Slug = d.strftime('%Y%m%d%H%M')
post += 'Slug: '+Slug+'\n'
if Tags:
post += 'Tags: '+Tags+'\n'
if Title:
if Title == "view-on-path":
Title = d.strftime('%Y%m%d%H%M')
else:
Title = d.strftime('%Y%m%d%H%M')
post += 'Title: '+Title+'\n'
post += "\n"
markdownfile = (day_string_corto+"-"+Title+".md").replace(" ","-") # replace spaces in the whole file name, not just in ".md"
print " ->", Title
out = open( markdownfile, "w" )
out.write( post.encode("utf-8") )
if data[-1] == data[y]:
out.write( data[y] )
else:
for x in data[y:-1]:
out.write( x )
out.close()
#!/bin/sh
# Created by jet tsang <jetsanix@gmail.com>
case "$1" in
name)
python tumblr_backup.py "$2" # modified from https://github.com/bdoms/tumblr_backup
mkdir -p calepin
cd "$2"/posts/
for n in *.html
do
rm -f temp.txt
echo "==> processing" $n
../../html2text.py "$n" > temp.txt # modified from http://www.aaronsw.com/2002/html2text/
../../tumblr2calepin.py temp.txt
done
mv -f *.md ../../calepin/
;;
*)
echo "E.g: $0 name YourBlogName"
exit 3
esac
#!/usr/bin/env python
# encoding: utf-8
# standard Python library imports
from __future__ import with_statement
import os
import sys
import urllib
import urllib2
from xml.sax.saxutils import escape
from xml.sax import SAXException
import codecs
import imghdr
from collections import defaultdict
import time
import locale
from glob import glob
import re
# extra required packages
import xmltramp
join = os.path.join
# add another JPEG recognizer
# see http://www.garykessler.net/library/file_sigs.html
def test_jpg(h, f):
if h[:3] == '\xFF\xD8\xFF' and h[3] in "\xDB\xE0\xE1\xE2\xE3":
return 'jpg'
imghdr.tests.append(test_jpg)
# variable directory names, will be set in TumblrBackup.backup()
save_folder = ''
image_folder = ''
# constant names
root_folder = os.getcwdu()
post_dir = 'posts'
xml_dir = 'xml'
image_dir = 'images'
archive_dir = 'archive'
theme_dir = 'theme'
backup_css = 'backup.css'
custom_css = 'custom.css'
avatar_base = 'avatar'
blog_name = ''
post_header = ''
post_ext = '.html'
have_custom_css = False
# ensure the right date/time format
try:
locale.setlocale(locale.LC_TIME, '')
except locale.Error:
pass
encoding = 'utf-8'
time_encoding = locale.getlocale(locale.LC_TIME)[1] or encoding
def log(account, s):
if not options.quiet:
if account:
sys.stdout.write('%s: ' % account)
sys.stdout.write(s[:-1] + ' ' * 20 + s[-1:])
sys.stdout.flush()
def mkdir(dir, recursive=False):
if not os.path.exists(dir):
if recursive:
os.makedirs(dir)
else:
os.mkdir(dir)
def path_to(*parts):
return join(save_folder, *parts)
def open_file(open_fn, parts):
if len(parts) > 1:
mkdir(path_to(*parts[:-1]))
return open_fn(path_to(*parts))
def open_text(*parts):
return open_file(
lambda f: codecs.open(f, 'w', encoding, 'xmlcharrefreplace'), parts
)
def open_image(*parts):
return open_file(lambda f: open(f, 'wb'), parts)
def strftime(format, t=None):
if t is None:
t = time.localtime()
return time.strftime(format, t).decode(time_encoding)
def get_api_url(account):
"""construct the tumblr API URL"""
global blog_name
blog_name = account
if '.' not in account:
blog_name += '.tumblr.com'
base = 'http://' + blog_name + '/api/read'
if options.private:
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, base, '', options.private)
auth_manager = urllib2.HTTPBasicAuthHandler(password_manager)
opener = urllib2.build_opener(auth_manager)
urllib2.install_opener(opener)
return base
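# e.g. get_api_url('example') -> 'http://example.tumblr.com/api/read', while a
# custom domain such as 'blog.example.com' is used as-is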
def xmlparse(url, data=None):
for _ in range(10):
try:
resp = urllib2.urlopen(url, data)
except (urllib2.URLError, urllib2.HTTPError) as e:
sys.stderr.write('%s getting %s\n' % (e, url))
continue
if resp.info().gettype() == 'text/xml':
break
else:
return None
xml = resp.read()
try:
doc = xmltramp.parse(xml)
except SAXException as e:
sys.stderr.write('%s %r\n\n%r\n\n%s\n' % (resp.info().gettype(), resp.msg, e, xml))
return None
return doc if doc._name == 'tumblr' else None
def save_image(image_url):
"""saves an image if not saved yet, returns the local file name"""
image_filename = image_url.split('/')[-1]
glob_filter = '' if '.' in image_filename else '.*'
# check if a file with this name already exists
image_glob = glob(join(image_folder, image_filename + glob_filter))
if image_glob:
return os.path.split(image_glob[0])[1]
# download the image data
try:
image_response = urllib2.urlopen(image_url)
except urllib2.HTTPError:
# return the original URL
return image_url
image_data = image_response.read()
image_response.close()
# determine the file type if it's unknown
if '.' not in image_filename:
image_type = imghdr.what(None, image_data[:32])
if image_type:
image_filename += '.' + image_type.replace('jpeg', 'jpg')
# save the image
with open_image(image_dir, image_filename) as image_file:
image_file.write(image_data)
return image_filename
def save_style():
with open_text(backup_css) as css:
css.write('''\
body { width: 720px; margin: 0 auto; }
img { max-width: 720px; }
blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; }
.archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; }
.post a.llink { display: none; }
.meta a { text-decoration: none; }
.avatar { float: right; }
''')
def header(heading, title='', body_class='', subtitle='', avatar=''):
root_rel = '' if body_class == 'index' else '../'
css_rel = root_rel + (custom_css if have_custom_css else backup_css)
if body_class:
body_class = ' class=' + body_class
h = u'''<!DOCTYPE html>
<meta charset=%s>
<title>%s</title>
<link rel=stylesheet href=%s>
<body%s>
''' % (encoding, heading, css_rel, body_class)
if avatar:
h += '<img src=%s%s/%s alt=Avatar class=avatar>\n' % (root_rel, theme_dir, avatar)
if title:
h += u'<h1>%s</h1>\n' % title
if subtitle:
h += u'<p class=subtitle>%s</p>\n' % subtitle
return h
def get_avatar():
try:
resp = urllib2.urlopen('http://api.tumblr.com/v2/blog/%s/avatar' % blog_name)
avatar_data = resp.read()
except:
return
avatar_file = avatar_base + '.' + imghdr.what(None, avatar_data[:32])
with open_image(theme_dir, avatar_file) as f:
f.write(avatar_data)
class TumblrBackup:
def __init__(self):
self.total_count = 0
def build_index(self):
for f in glob(path_to(post_dir, '*.html')):
post = LocalPost(f)
self.index[post.tm.tm_year][post.tm.tm_mon].append(post)
def save_index(self):
f = glob(path_to(theme_dir, avatar_base + '.*'))
avatar = os.path.split(f[0])[1] if f else None
with open_text('index.html') as idx:
idx.write(header(self.title, self.title, body_class='index',
subtitle=self.subtitle, avatar=avatar
))
for year in sorted(self.index.keys(), reverse=options.reverse_index):
self.save_year(idx, year)
idx.write('<p>Generated on %s.</p>\n' % strftime('%x %X'))
def save_year(self, idx, year):
idx.write('<h3>%s</h3>\n<ul>\n' % year)
for month in sorted(self.index[year].keys(), reverse=options.reverse_index):
tm = time.localtime(time.mktime([year, month, 3, 0, 0, 0, 0, 0, -1]))
month_name = self.save_month(year, month, tm)
idx.write(' <li><a href=%s/%s title="%d post(s)">%s</a></li>\n' % (
archive_dir, month_name, len(self.index[year][month]),
strftime('%B', tm)
))
idx.write('</ul>\n\n')
def save_month(self, year, month, tm):
file_name = '%d-%02d.html' % (year, month)
with open_text(archive_dir, file_name) as arch:
arch.write('\n\n'.join([
header(self.title, strftime('%B %Y', tm), body_class='archive'),
'\n'.join(p.get_post() for p in sorted(
self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month
)),
'<p><a href=../ rel=contents>Index</a></p>\n'
]))
return file_name
def backup(self, account):
"""makes single files and an index for every post on a public Tumblr blog account"""
base = get_api_url(account)
# make sure there are folders to save in
global save_folder, image_folder, post_ext, post_dir, have_custom_css
if options.blosxom:
save_folder = root_folder
post_ext = '.txt'
post_dir = os.curdir
post_class = BlosxomPost
else:
save_folder = join(root_folder, account)
image_folder = path_to(image_dir)
post_class = TumblrPost
have_custom_css = os.access(path_to(custom_css), os.R_OK)
mkdir(save_folder, True)
self.post_count = 0
# prepare the period start and end timestamps
if options.period:
i = 0; tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
if len(options.period) >= 6:
i = 1; tm[1] = int(options.period[4:6])
if len(options.period) == 8:
i = 2; tm[2] = int(options.period[6:8])
p_start = time.mktime(tm)
tm[i] += 1
p_stop = time.mktime(tm)
# get the highest post id already saved
ident_max = None
if options.incremental:
try:
ident_max = max(
long(os.path.splitext(os.path.split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
log(account, "Backing up posts after %d\r" % ident_max)
except ValueError: # max() arg is an empty sequence
pass
else:
log(account, "Getting basic information\r")
# start by calling the API with just a single post
soup = xmlparse(base + '?num=1')
if not soup:
return
# collect all the meta information
tumblelog = soup.tumblelog
try:
self.title = escape(tumblelog('title'))
except KeyError:
self.title = account
self.subtitle = unicode(tumblelog)
# use the meta information to create an HTML header
global post_header
post_header = header(self.title, body_class='post')
# find the total number of posts
total_posts = options.count or int(soup.posts('total'))
last_post = options.skip + total_posts
def _backup(posts):
for p in sorted(posts, key=lambda x: long(x('id')), reverse=True):
post = post_class(p)
if ident_max and long(post.ident) <= ident_max:
return False
if options.period:
if post.date >= p_stop:
continue
if post.date < p_start:
return False
post.generate_content()
if post.error:
sys.stderr.write('%s%s\n' % (post.error, 50 * ' '))
post.save_post()
self.post_count += 1
return True
# Get the XML entries from the API, which returns at most 50 posts per request.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
MAX = 50
for i in range(options.skip, last_post, MAX):
# find the upper bound
j = min(i + MAX, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, total_posts))
soup = xmlparse('%s?num=%d&start=%d' % (base, j - i, i))
if soup is None:
return
if not _backup(soup.posts['post':]):
break
if not options.blosxom and self.post_count:
get_avatar()
if not have_custom_css:
save_style()
self.index = defaultdict(lambda: defaultdict(list))
self.build_index()
self.save_index()
log(account, "%d posts backed up\n" % self.post_count)
self.total_count += self.post_count
class TumblrPost:
def __init__(self, post):
self.content = ''
self.post = post
self.xml_content = post.__repr__(1, 1)
self.ident = post('id')
self.url = post('url')
self.slug = post('slug')
self.typ = post('type')
self.date = int(post('unix-timestamp'))
self.tm = time.localtime(self.date)
self.title = ''
self.tags = []
self.file_name = self.ident + post_ext
self.error = None
def generate_content(self):
"""generates the content for this post"""
post = self.post
content = []
def append(s, fmt=u'%s'):
# the %s conversion calls unicode() on the xmltramp element
content.append(fmt % s)
def get_try(elt):
try:
return unicode(post[elt])
except KeyError:
return ''
def append_try(elt, fmt=u'%s'):
elt = get_try(elt)
if elt:
append(elt, fmt)
if self.typ == 'regular':
self.title = get_try('regular-title')
append_try('regular-body')
elif self.typ == 'photo':
url = escape(get_try('photo-link-url'))
for p in post.photoset['photo':] if hasattr(post, 'photoset') else [post]:
src = unicode(p['photo-url'])
append(escape(self.get_image_url(src)), u'<img alt="" src="%s">')
if url:
content[-1] = '<a href="%s">%s</a>' % (url, content[-1])
content[-1] = '<p>' + content[-1] + '</p>'
if p._name == 'photo' and p('caption'):
append(p('caption'), u'<p>%s</p>')
append_try('photo-caption')
elif self.typ == 'link':
url = unicode(post['link-url'])
self.title = u'<a href="%s">%s</a>' % (escape(url),
post['link-text'] if 'link-text' in post else url
)
append_try('link-description')
elif self.typ == 'quote':
append(post['quote-text'], u'<blockquote><p>%s</p></blockquote>')
append_try('quote-source', u'<p>%s</p>')
elif self.typ == 'video':
source = unicode(post['video-source']).strip()
if source.startswith('<iframe') or source.startswith('<object'):
append(source, u'<p>%s</p>')
append_try('video-caption')
else:
append(post['video-player'], u'<p>%s</p>')
append_try('video-caption')
append(escape(source), u'<p><a href="%s">Original</a></p>')
elif self.typ == 'audio':
append(post['audio-player'])
append_try('audio-caption')
elif self.typ == 'answer':
self.title = post.question
append(post.answer)
elif self.typ == 'conversation':
self.title = get_try('conversation-title')
append(
'<br>\n'.join(escape(unicode(l)) for l in post.conversation['line':]),
u'<p>%s</p>'
)
else:
self.error = u"Unknown post type '%s' in post #%s" % (self.typ, self.ident)
append(escape(self.xml_content), u'<pre>%s</pre>')
self.tags = [u'%s' % t for t in post['tag':]]
self.content = '\n'.join(content)
# fix wrongly nested HTML tags
for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'):
self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content)
def get_image_url(self, url):
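# Image localization is disabled in this copy; the original download logic is
# kept below for reference but never runs.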
return url
#url = save_image(url)
#if '://' in url: # in case of download errors
# return url
#return u'../%s/%s' % (image_dir, url)
def get_post(self):
"""returns this post in HTML"""
post = 'date!$$$!%s<br>\n' % strftime('%x %X', self.tm)
if self.slug:
post += u'slug!$$$!%s<br>\n' % self.slug
if self.tags:
post += u'tag!$$$!%s<br>\n' % u'|'.join(t for t in self.tags)
if self.title:
post += 'title!$$$!%s<br>\n' % self.title
post += 'content!$$$!<br>\n'
post += self.content
return post
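# The "!$$$!" headers above are this gist's custom interchange format:
# html2text.py strips the <br> tags and tumblr2calepin.py turns the result
# into Pelican/Calepin front matter.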
def save_post(self):
"""saves this post locally"""
with open_text(post_dir, self.file_name) as f:
f.write(self.get_post())
os.utime(path_to(post_dir, self.file_name),
(self.date, self.date)
)
if options.xml:
with open_text(xml_dir, self.ident + '.xml') as f:
f.write(self.xml_content)
class BlosxomPost(TumblrPost):
def get_image_url(self, url):
return url
def get_post(self):
"""returns this post as a Blosxom post"""
post = self.title + '\nmeta-id: _' + self.ident + '\nmeta-url: ' + self.url
if self.tags:
post += '\nmeta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags)
post += '\n\n' + self.content
return post
class LocalPost:
def __init__(self, post_file):
with codecs.open(post_file, 'r', encoding) as f:
self.lines = f.readlines()
# remove header and footer
while self.lines and '<article ' not in self.lines[0]:
del self.lines[0]
while self.lines and '</article>' not in self.lines[-1]:
del self.lines[-1]
self.file_name = os.path.split(post_file)[1]
self.ident = os.path.splitext(self.file_name)[0]
self.date = os.stat(post_file).st_mtime
self.tm = time.localtime(self.date)
def get_post(self):
return u''.join(self.lines)
if __name__ == '__main__':
import optparse
parser = optparse.OptionParser("Usage: %prog [options] blog-name ...",
description="Makes a local backup of Tumblr blogs."
)
parser.add_option('-q', '--quiet', action='store_true',
help="suppress progress messages"
)
parser.add_option('-i', '--incremental', action='store_true',
help="incremental backup mode"
)
parser.add_option('-x', '--xml', action='store_true',
help="save the original XML source"
)
parser.add_option('-b', '--blosxom', action='store_true',
help="save the posts in blosxom format"
)
parser.add_option('-r', '--reverse-month', action='store_false', default=True,
help="reverse the post order in the monthly archives"
)
parser.add_option('-R', '--reverse-index', action='store_false', default=True,
help="reverse the index file order"
)
parser.add_option('-a', '--auto', type='int', metavar="HOUR",
help="do a full backup at HOUR hours, otherwise do an incremental backup"
" (useful for cron jobs)"
)
parser.add_option('-n', '--count', type='int', help="save only COUNT posts")
parser.add_option('-s', '--skip', type='int', default=0,
help="skip the first SKIP posts"
)
parser.add_option('-p', '--period', help="limit the backup to PERIOD"
" ('y', 'm', 'd' or YYYY[MM[DD]])"
)
parser.add_option('-P', '--private', help="password for a private tumblr",
metavar='PASSWORD'
)
options, args = parser.parse_args()
if options.auto is not None:
if options.auto == time.localtime().tm_hour:
options.incremental = False
else:
options.incremental = True
if options.period:
try:
options.period = time.strftime(
{'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period]
)
except KeyError:
options.period = options.period.replace('-', '')
if len(options.period) not in (4, 6, 8):
parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
if not args:
args = ['bbolli']
tb = TumblrBackup()
for account in args:
tb.backup(account)
sys.exit(0 if tb.total_count else 1)