Last active
July 6, 2016 07:41
-
-
Save tokiwatch/e2c81e139e393ae388a6c690f4c2df27 to your computer and use it in GitHub Desktop.
convert blogger backup file to octopress style entry files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## -*- mode: perl; coding: utf-8 -*- | |
# blogger2octopress.pl | |
# Author: Takuji Tokiwa | |
# | |
# About: | |
# Export entries from a Blogger backup file to Octopress-format entry files.
# | |
# Usage: | |
# 1. Install XML::Simple, DateTime::Format::ISO8601 and HTML::WikiConverter::Markdown via cpan, cpanm or 'carton install'.
# 2. Set $blogger_fn to the path of your backup file.
# 3. Set $timezone to the UTC offset of your location.
# 4. Run | |
# 5. copy exported files to octopress_dir/source/_posts . | |
use strict;
use warnings;
use Encode ();
use XML::Simple;
use DateTime::Format::ISO8601;
use HTML::WikiConverter::Markdown;
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
my $blogger_fn = 'blog.xml';      # Blogger backup file (XML) to read
my $timezone   = '+0900';         # UTC offset the post dates are rendered in
my $exp_dir    = "./entries/";    # output directory; keep the trailing slash

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Return the text carried by a parsed XML node.
# The node may be a plain string or a hash with a 'content' key (the shape the
# original script read for title and content). Anything else yields ''.
sub _node_text {
    my ($node) = @_;
    return '' unless defined $node;
    return $node unless ref $node;
    return '' unless ref($node) eq 'HASH';
    my $text = $node->{content};
    return '' if !defined $text || ref $text;
    return $text;
}

# Escape a string for use inside a YAML double-quoted scalar.
sub _yaml_double_quoted {
    my ($text) = @_;
    $text =~ s/(["\\])/\\$1/g;
    return $text;
}

# Build the front-matter category list, e.g. ['perl', 'blog'].
# Accepts either an array of category hashes or a single category hash.
# Terms starting with "http" are skipped, as in the original script.
# Returns '' when no usable term remains.
sub _categories_yaml {
    my ($category) = @_;
    my @nodes
        = ref($category) eq 'ARRAY' ? @$category
        : ref($category) eq 'HASH'  ? ($category)
        :                             ();

    my @labels;
    for my $node (@nodes) {
        next unless ref($node) eq 'HASH';
        my $term = $node->{term};
        next unless defined $term && length $term;
        next if $term =~ /^http/;
        $term =~ s/'/''/g;    # YAML single-quoted scalars double the quote
        push @labels, "'" . $term . "'";
    }
    return @labels ? '[' . join(', ', @labels) . ']' : '';
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
my $xml  = XML::Simple->new;
my $data = $xml->XMLin($blogger_fn)
    or die "Cannot parse '$blogger_fn'\n";
my $entries = $data->{entry}
    or die "No entries found in '$blogger_fn'\n";
ref($entries) eq 'HASH'
    or die "Unexpected entry structure in '$blogger_fn' (expected entries keyed by id)\n";

# Make sure the export directory exists before writing into it.
unless (-d $exp_dir) {
    mkdir $exp_dir
        or die "Cannot create output directory '$exp_dir': $!\n";
}

# One converter is enough for the whole run; build it once, outside the loop.
my $wc = HTML::WikiConverter->new(dialect => 'Markdown');

foreach my $id (sort keys %$entries) {
    # Only ids ending in "post-<digits>" are exported; the digits become part
    # of the output file name, so entries without them are skipped.
    my ($eid) = $id =~ /post-(\d+)$/;
    next unless defined $eid;

    my $entry     = $entries->{$id};
    my $published = _node_text($entry->{published});
    my $title     = _node_text($entry->{title});
    my $html      = _node_text($entry->{content});

    unless (length $published) {
        warn "Skipping $id: no published date\n";
        next;
    }

    my $categories_str = _categories_yaml($entry->{category});

    # gen date
    # Convert the parsed timestamp into the configured zone so the rendered
    # local time agrees with the offset appended after it.
    my $dt = DateTime::Format::ISO8601->parse_datetime($published);
    $dt->set_time_zone($timezone);
    my $date = $dt->ymd('-') . ' ' . $dt->hms(':') . ' ' . $timezone;

    # convert from HTML to Markdown
    # NOTE(review): the original toggled the internal UTF-8 flag around this
    # call, which suggests html2wiki takes and returns UTF-8 octets. Confirm
    # against HTML::WikiConverter's 'encoding' option. Proper encode/decode is
    # used here; the is_utf8 guard covers a converter that returns characters.
    my $content = '';
    if (length $html) {
        $content = $wc->html2wiki(Encode::encode_utf8($html));
        $content = '' unless defined $content;
        $content = Encode::decode_utf8($content)
            unless Encode::is_utf8($content);
        $content =~ s/<br *\/*>//g;
        $content =~ s/<>//g;
        $content =~ s/\n+/\n\n/g;
    }

    # gen filename
    my $fn   = $dt->ymd('-') . '-' . $eid . ".markdown";
    my $path = $exp_dir . $fn;

    # gen string
    my $str
        = "---\n"
        . "layout: post\n"
        . 'title: "' . _yaml_double_quoted($title) . "\"\n"
        . 'date: ' . $date . "\n"
        . "comments: false\n"
        . 'categories: ' . $categories_str . "\n"
        . "---\n"
        . $content;

    print "filename: " . $fn . "\n";

    # write to file
    open my $fh, '>:encoding(UTF-8)', $path
        or die "Cannot open '$path' for writing: $!\n";
    print {$fh} $str
        or die "Cannot write to '$path': $!\n";
    close $fh
        or die "Cannot close '$path': $!\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment