Skip to content

Instantly share code, notes, and snippets.

@hrpunio
Created October 25, 2011 14:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrpunio/1312945 to your computer and use it in GitHub Desktop.
Save hrpunio/1312945 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
# HTML to Blogger conversion script.
#
# Usage: blogspot-import.pl file1 file2 file3... > import-file.xml
#
# The resulting file import-file.xml is an Atom feed
# [ http://tools.ietf.org/html/rfc4287 ] and is thus ready to be imported
# with the Blogger import facility.
#
# Each HTML file is assumed to have the following structure:
#
# <title> one-line-for-title </title>
# <!-- Tags: one-line-for-tags -->
# </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
# ... document body
# </body></html>
#
# Everything before <title> is ignored, tags are comma-separated,
# and `##Published : DateAndTime' gives the publication date/time.
#
# (c) 2011/10 t.przechlewski
#
use strict;
use warnings;
use Digest::MD5 qw(md5_hex);

# Escape a string for use as XML character data.
# Returns the escaped copy; the argument is not modified.
sub escape_xml_text {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must come first, or it would re-escape the others
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Escape a string for use inside a single-quoted XML attribute value.
sub escape_xml_attr {
    my ($text) = @_;
    $text = escape_xml_text($text);
    $text =~ s/'/&apos;/g;
    return $text;
}

# Split a comma-separated tag string into a list of tags, trimming the
# surrounding whitespace of each tag and dropping empty ones.
sub split_tags {
    my ($tags) = @_;
    my @keywords;
    for my $kw (split /,/, $tags) {
        $kw =~ s/\A\s+//;
        $kw =~ s/\s+\z//;
        push @keywords, $kw if $kw ne '';
    }
    return @keywords;
}

# Read one HTML post from the filehandle $fh and return a hash reference with:
#   title     - text of the <title> line ('' if none was found)
#   tags      - raw text of the "<!-- Tags: ... -->" line ('' if none)
#   published - timestamp from the "##Published : ... ##" marker ('' if none)
#   content   - body lines joined by a space (by a newline inside <pre>)
#   rel_urls  - true if a src= attribute looks relative or points at the
#               old pinkaccordions.homelinux.org host
# Every field starts out empty for each call, so nothing leaks from one
# post into the next.
sub parse_post {
    my ($fh) = @_;
    my %post = (
        title     => '',
        tags      => '',
        published => '',
        content   => '',
        rel_urls  => 0,
    );
    my $in_body = 0;
    my $in_pre  = 0;

    LINE:
    while (my $line = <$fh>) {
        chomp $line;

        if ($line =~ m{<title>(.+)</title>}) {
            $post{title} = $1;
            next LINE;
        }
        if ($line =~ /<!--[ \t]*Tags:[ \t]*(.+)[ \t]*-->/) {
            $post{tags} = $1;
            next LINE;
        }
        if ($line =~ m{</head><body>}) {
            $in_body = 1;
            ## </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
            ## Only digits, 'T', '-' and ':' are captured, so a trailing
            ## time-zone abbreviation such as CEST is dropped.
            if ($line =~ /##Published[ \t]+:[ \t]+([0-9T\-\:]+).+##/) {
                $post{published} = $1;
            }
            print STDERR "Published: $post{published}\n";
            next LINE;
        }
        if ($line =~ m{</body></html>}) {
            $in_body = 0;
            next LINE;
        }
        next LINE unless $in_body;

        ## Images from pinkaccordions.homelinux.org or with relative URLs
        ## should be reported.
        if ($line =~ /src[ \t]*=/
            && ($line =~ /pinkaccordions\.homelinux\.org/
                || $line !~ m{https?://}))
        {
            $post{rel_urls} = 1;
        }

        ## <pre> must preserve line breaks; elsewhere lines are joined with
        ## a single space.  The LAST <pre>/</pre> tag on the line decides the
        ## state, so a one-line "<pre>...</pre>" does not leave us stuck in
        ## pre mode.
        if ($line =~ m{.*(</?pre>)}) {
            $in_pre = $1 eq '<pre>' ? 1 : 0;
        }
        $post{content} .= $line . ($in_pre ? "\n" : ' ');  # the separator is essential
    }

    return \%post;
}

# Build the Atom <entry> element for one post and return it as a string.
#   $post_id  - unique post identifier (the caller passes an MD5 hex digest)
#   $published- timestamp used for both <published> and <updated>
#   $title    - post title, emitted as-is
#   $keywords - array reference of tags; each is attribute-escaped here
#   $content  - post body, already XML-escaped by the caller
sub render_entry {
    my ($post_id, $published, $title, $keywords, $content) = @_;

    my $entry = '<entry>';
    $entry .= "<id>tag:blogger.com,1999:post-$post_id</id>";
    $entry .= "<published>$published</published>";
    $entry .= "<updated>$published</updated>";
    $entry .= '<category scheme="http://schemas.google.com/g/2005#kind" term="http://schemas.google.com/blogger/2008/kind#post"/>';
    ## tags:
    for my $kw (@{$keywords}) {
        $entry .= "<category scheme='http://www.blogger.com/atom/ns#' term='"
            . escape_xml_attr($kw) . "'/>";
    }
    # NOTE(review): the title is copied verbatim from the HTML <title> line.
    # It is presumably already entity-escaped there, so it is not escaped
    # again; a bare '&' or '<' in a title would still yield malformed XML.
    $entry .= "<title type='text'>$title</title>";
    $entry .= "<content type='html'>$content</content></entry>";
    return $entry;
}

# --- Feed header ---------------------------------------------------------
# The two XML comments below are emitted into the output.  They are in
# Polish and say: "id, title/updated are required in feed/entry elements,
# the rest is optional" and "looks like minimal markup".
print '<?xml version="1.0" encoding="UTF-8"?>
<!-- id, title/updated jest wymagane w elementach feed/entry reszta opcjonalna -->
<!-- wyglada na minimalne oznakowanie -->
<feed xmlns="http://www.w3.org/2005/Atom"
xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/"
xmlns:georss="http://www.georss.org/georss"
xmlns:gd="http://schemas.google.com/g/2005"
xmlns:thr="http://purl.org/syndication/thread/1.0">';
# The blog id, feed timestamp and title are hard-coded.
print "<id>tag:blogger.com,1999:blog-1928418645181504144.archive</id>";
print "<updated>2011-10-22T12:34:14.746-07:00</updated>";
print "<title type='text'>pinkaccordions.blogspot.com</title>";
# the following is required by Blogger import facility:
print "<generator version='7.00' uri='http://www.blogger.com'>Blogger</generator>\n";

# --- One <entry> per input file -------------------------------------------
foreach my $post_file (@ARGV) {
    # Three-argument open with low-precedence 'or', so a failure really dies.
    open my $post_fh, '<', $post_file
        or die "*** cannot open $post_file: $! ***\n";
    print STDERR "\n$post_file opened!\n";

    my $post = parse_post($post_fh);
    close $post_fh;

    # A post without a publication marker is skipped, not fatal.
    if ($post->{published} eq '') {
        warn "*** something wrong with: $post_file. Not published? Skipping....\n";
        next;
    }

    # A post without a title or without any usable tag aborts the run.
    my @post_kws = split_tags($post->{tags});
    if (!@post_kws || $post->{title} eq '') {
        die "*** something wrong with: $post_file (tags: $post->{tags}/title: $post->{title})\n";
    }
    if ($post->{rel_urls}) {
        warn "*** suspicious relative URIs: $post_file\n";
    }

    # The body is embedded as escaped HTML inside <content type='html'>.
    my $post_content = escape_xml_text($post->{content});
    print STDERR "Title: $post->{title} Tags: $post->{tags}\n";

    ## We use the MD5 sum of the (escaped) content as post ID
    my $md5sum = md5_hex($post_content);
    print STDERR "MD5sum: $md5sum\n";

    print render_entry($md5sum, $post->{published}, $post->{title},
        \@post_kws, $post_content);
}
print "</feed>";
## end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment