alex/blogspot_importer.py

## blogspot_importer.py
import re
from urllib import urlopen

from django.contrib.auth.models import User
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify

from dateutil.parser import parse as parse_date
from pyquery import PyQuery

from blago.models import Post


class Command(BaseCommand):
    """Management command that imports a single Blogspot post as a ``Post``.

    The post page is downloaded, its title, date, body and labels are
    scraped with PyQuery, and the body HTML is converted to reST by a
    handful of string and regex substitutions (not a full HTML parser).
    """

    help = "Imports a blog post for blogspot"

    def handle(self, url=None, **options):
        """Fetch ``url``, convert the post it contains and store it.

        :param url: address of the Blogspot post page to import.
        :param options: remaining command options; unused here.
        :raises CommandError: if no URL is given, or if the title, date
            header or post body cannot be read from the page.
        """
        # Imported locally so this block does not depend on a new
        # file-level import.
        from contextlib import closing

        if not url:
            raise CommandError("Provide a post!")

        # Close the HTTP response explicitly instead of relying on garbage
        # collection to do it.
        with closing(urlopen(url)) as response:
            page = PyQuery(response.read())

        title = self._required_html(page, ".post-title a").strip()
        kwargs = {
            "title": title,
            "slug": slugify(title),
            # The importing account is hard-coded.
            "author": User.objects.get(username="alex"),
            "published": parse_date(self._required_html(page, ".date-header")),
            "body": self._html_to_rest(self._required_html(page, ".post-body")),
        }

        # Read every label link on its own: joining all link texts and then
        # splitting on whitespace would tear multi-word labels apart.
        tags = []
        for link in page.find(".post-labels a"):
            label = (PyQuery(link).text() or "").strip()
            if label:
                tags.append(label)

        post = Post.objects.create(**kwargs)
        if tags:
            post.tags.add(*tags)
        print("Imported: %s" % post.title)

    @staticmethod
    def _required_html(page, selector):
        """Return the inner HTML of the first ``selector`` match.

        :raises CommandError: if PyQuery yields ``None`` for the selector,
            so the failure is reported clearly rather than surfacing as an
            ``AttributeError`` on ``None`` further down.
        """
        html = page.find(selector).html()
        if html is None:
            raise CommandError(
                "Could not read the content of %r from the page" % selector
            )
        return html

    @staticmethod
    def _html_to_rest(body):
        """Convert the HTML of a Blogspot post body into rough reST."""
        # Line breaks: accept "<br>", "<br/>" and "<br />" alike, since the
        # exact spelling depends on how the HTML was serialized.
        body = re.sub(r"<br\s*/?>", "\n", body)
        # Unwrap the spell-checker <span> markers, keeping only their text.
        body = re.sub(
            r'<span class="blsp-spelling-error" id="SPELLING_ERROR_\d+">(.*?)</span>',
            lambda match: match.group(1),
            body,
        )
        # Turn links into reST hyperlink references: `text <url>`_
        body = re.sub(
            r'<a href="(.*?)">(.*?)</a>',
            lambda match: "`%s <%s>`_" % (match.group(2), match.group(1)),
            body,
        )
        # Drop layout-only markup.
        body = body.replace('<div style="clear: both;"></div>', "")
        # Drop the list containers; the items themselves become bullets.
        for tag in ("<ul>", "</ul>", "<ol>", "</ol>"):
            body = body.replace(tag, "")
        body = body.replace("</li>", "\n").replace("<li>", " * ")

        def code_block(match):
            # reST needs a blank line between the directive line and its
            # content, and blank lines around the whole block; otherwise
            # the code lines are parsed as directive arguments.
            lines = ["    " + line for line in match.group(1).splitlines()]
            return "\n\n.. sourcecode:: python\n\n" + "\n".join(lines) + "\n\n"

        # DOTALL so that a <pre> block may span several lines.
        body = re.compile(r"<pre>(.*?)</pre>", re.DOTALL).sub(code_block, body)
        return body.strip()
	import re
	from urllib import urlopen

	from django.contrib.auth.models import User
	from django.core.management.base import BaseCommand, CommandError
	from django.template.defaultfilters import slugify

	from dateutil.parser import parse as parse_date
	from pyquery import PyQuery

	from blago.models import Post


	class Command(BaseCommand):
	    """Management command that imports a single Blogspot post as a ``Post``.

	    The post page is downloaded, its title, date, body and labels are
	    scraped with PyQuery, and the body HTML is converted to reST by a
	    handful of string and regex substitutions (not a full HTML parser).
	    """

	    help = "Imports a blog post for blogspot"

	    def handle(self, url=None, **options):
	        """Fetch ``url``, convert the post it contains and store it.

	        :param url: address of the Blogspot post page to import.
	        :param options: remaining command options; unused here.
	        :raises CommandError: if no URL is given, or if the title, date
	            header or post body cannot be read from the page.
	        """
	        # Imported locally so this block does not depend on a new
	        # file-level import.
	        from contextlib import closing

	        if not url:
	            raise CommandError("Provide a post!")

	        # Close the HTTP response explicitly instead of relying on garbage
	        # collection to do it.
	        with closing(urlopen(url)) as response:
	            page = PyQuery(response.read())

	        title = self._required_html(page, ".post-title a").strip()
	        kwargs = {
	            "title": title,
	            "slug": slugify(title),
	            # The importing account is hard-coded.
	            "author": User.objects.get(username="alex"),
	            "published": parse_date(self._required_html(page, ".date-header")),
	            "body": self._html_to_rest(self._required_html(page, ".post-body")),
	        }

	        # Read every label link on its own: joining all link texts and then
	        # splitting on whitespace would tear multi-word labels apart.
	        tags = []
	        for link in page.find(".post-labels a"):
	            label = (PyQuery(link).text() or "").strip()
	            if label:
	                tags.append(label)

	        post = Post.objects.create(**kwargs)
	        if tags:
	            post.tags.add(*tags)
	        print("Imported: %s" % post.title)

	    @staticmethod
	    def _required_html(page, selector):
	        """Return the inner HTML of the first ``selector`` match.

	        :raises CommandError: if PyQuery yields ``None`` for the selector,
	            so the failure is reported clearly rather than surfacing as an
	            ``AttributeError`` on ``None`` further down.
	        """
	        html = page.find(selector).html()
	        if html is None:
	            raise CommandError(
	                "Could not read the content of %r from the page" % selector
	            )
	        return html

	    @staticmethod
	    def _html_to_rest(body):
	        """Convert the HTML of a Blogspot post body into rough reST."""
	        # Line breaks: accept "<br>", "<br/>" and "<br />" alike, since the
	        # exact spelling depends on how the HTML was serialized.
	        body = re.sub(r"<br\s*/?>", "\n", body)
	        # Unwrap the spell-checker <span> markers, keeping only their text.
	        body = re.sub(
	            r'<span class="blsp-spelling-error" id="SPELLING_ERROR_\d+">(.*?)</span>',
	            lambda match: match.group(1),
	            body,
	        )
	        # Turn links into reST hyperlink references: `text <url>`_
	        # The groups must be non-greedy "(.*?)"; a bare "(.?)" would only
	        # match URLs and link texts of at most one character.
	        body = re.sub(
	            r'<a href="(.*?)">(.*?)</a>',
	            lambda match: "`%s <%s>`_" % (match.group(2), match.group(1)),
	            body,
	        )
	        # Drop layout-only markup.
	        body = body.replace('<div style="clear: both;"></div>', "")
	        # Drop the list containers; the items themselves become bullets.
	        for tag in ("<ul>", "</ul>", "<ol>", "</ol>"):
	            body = body.replace(tag, "")
	        body = body.replace("</li>", "\n").replace("<li>", " * ")

	        def code_block(match):
	            # reST needs a blank line between the directive line and its
	            # content, and blank lines around the whole block; otherwise
	            # the code lines are parsed as directive arguments. The code
	            # itself is indented by four spaces.
	            lines = ["    " + line for line in match.group(1).splitlines()]
	            return "\n\n.. sourcecode:: python\n\n" + "\n".join(lines) + "\n\n"

	        # DOTALL so that a <pre> block may span several lines.
	        body = re.compile(r"<pre>(.*?)</pre>", re.DOTALL).sub(code_block, body)
	        return body.strip()