hrpunio/blogspot-import.pl

## blogspot-import.pl
#!/usr/bin/perl
# HTML to Blogger conversion script.
#
# Usage: blogspot-import.pl file1 file2 file3... > import-file.xml
#
# The resulting file import-file.xml is Atom-compatible
# [ http://tools.ietf.org/html/rfc4287 ] and thus ready to be imported with
# the Blogger import facility.
#
# Each HTML file is assumed to have the following structure:
#
# <title> one-line-for-title </title>
# <!-- Tags: one-line-for-tags -->
# </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
#  ... document body
# </body></html>
#
# everything before <title> is ignored, tags are comma-separated,
# and `##Published DateAndTime' is publication date/time.
#
# (c) 2011/10 t.przechlewski
#

use Digest::MD5 qw(md5_hex);

# Emit the fixed feed preamble with a single print call: the XML
# declaration, the <feed> start tag with its namespace declarations, and
# the feed-level id, updated and title elements.  The two XML comments
# inside the first literal are written in Polish; they say that id, title
# and updated are required in feed/entry elements while the rest is
# optional, and that this looks like the minimal markup.
print(
  '<?xml version="1.0" encoding="UTF-8"?>
<!-- id, title/updated jest wymagane w elementach feed/entry reszta opcjonalna -->
<!-- wyglada na minimalne oznakowanie -->
<feed xmlns="http://www.w3.org/2005/Atom"
      xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/"
      xmlns:georss="http://www.georss.org/georss"
      xmlns:gd="http://schemas.google.com/g/2005"
      xmlns:thr="http://purl.org/syndication/thread/1.0">',
  "<id>tag:blogger.com,1999:blog-1928418645181504144.archive</id>",
  "<updated>2011-10-22T12:34:14.746-07:00</updated>",
  "<title type='text'>pinkaccordions.blogspot.com</title>",

  # The generator element is required by the Blogger import facility.
  "<generator version='7.00' uri='http://www.blogger.com'>Blogger</generator>\n",
);

# Escape text for use as XML character data or inside a single-quoted
# attribute value.  An ampersand that already begins one of the five
# predefined XML entities or a numeric character reference is kept, so
# input that was well-formed before is emitted unchanged.
sub escape_xml_markup {
  my ($text) = @_;
  $text =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/'/&apos;/g;
  return $text;
}

# Convert every HTML file named on the command line into one Atom <entry>
# written to STDOUT; progress and diagnostics go to STDERR.
#  - a file without a "##Published" stamp is skipped with a warning,
#  - a file without a title or a Tags comment aborts the run,
#  - a file that cannot be opened aborts the run.
foreach my $post_file (@ARGV) {

  # Per-file state.  Every variable is lexical and reset here, so nothing
  # (the tag list in particular) leaks from one input file into the next.
  my $post_title   = '';
  my $post_content = '';
  my $tags         = '';
  my $published    = '';
  my $body         = 0;  # true between </head><body> and </body></html>
  my $in_pre       = 0;  # true while inside <pre> ... </pre>
  my $rel_URLs     = 0;  # true once a suspicious src= attribute was seen

  # Three-argument open with low-precedence "or": with "||" the die was
  # attached to the file-name string and could never fire.
  open(my $post_fh, '<', $post_file)
    or die "*** cannot open $post_file ***\n";

  print STDERR "\n$post_file opened!\n";

  while (my $line = <$post_fh>) {
    chomp $line;

    if ($line =~ m{<title>(.+)</title>}) { $post_title = $1; next; }
    if ($line =~ m{<!--[ \t]*Tags:[ \t]*(.+)[ \t]*-->}) { $tags = $1; next; }

    if ($line =~ m{</head><body>}) {
      $body = 1;
      ## </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
      ## The character class below stops at the first letter of the zone
      ## suffix, so the zone name (CEST above) is not part of the value.
      if ($line =~ /##Published[ \t]+:[ \t]+([0-9T\-\:]+).+##/) { $published = $1; }
      print STDERR "Published: $published\n";
      next;
    }

    if ($line =~ m{</body></html>}) { $body = 0; next; }

    next unless $body;

    ## Images from pinkaccordions.homelinux.org or with relative URLs
    ## should be reported; both http:// and https:// count as absolute.
    if ($line =~ /src[ \t]*=/
        && ($line =~ /pinkaccordions\.homelinux\.org/ || $line !~ m{https?://})) {
      $rel_URLs = 1;
    }

    ## Whichever of <pre> / </pre> comes last on the line decides the
    ## state, so a one-line <pre>...</pre> does not leave the flag set.
    if ($line =~ m{</?pre>}) {
      $in_pre = rindex($line, '<pre>') > rindex($line, '</pre>') ? 1 : 0;
    }

    ## Inside <pre> line breaks are preserved; elsewhere each input line
    ## is joined with a single space (the separator is essential).
    $post_content .= $line . ($in_pre ? "\n" : ' ');
  }

  close($post_fh);

### ### ###

  if ($published eq '') {
    warn "*** something wrong with: $post_file. Not published? Skipping....\n";
    next;
  }
  if ($tags eq '' || $post_title eq '') {
    die "*** something wrong with: $post_file (tags: $tags/title: $post_title)\n";
  }
  if ($rel_URLs) { warn "*** suspicious relative URIs: $post_file\n"; }

  # The body is HTML carried inside <content type='html'>, so it is
  # escaped in full, exactly once.
  $post_content =~ s/\&/&amp;/g;
  $post_content =~ s/</&lt;/g;
  $post_content =~ s/>/&gt;/g;

  print STDERR "Title: $post_title Tags: $tags\n";

  # Comma-separated tags: strip surrounding whitespace, drop empty items.
  my @post_kws = ();
  foreach my $kw (split /,/, $tags) {
    $kw =~ s/^\s+//;
    $kw =~ s/\s+$//;
    push @post_kws, $kw if length $kw;
  }

  my $md5sum = md5_hex($post_content);
  print STDERR "MD5sum: $md5sum\n";

  print "<entry>"; ## We use MD5sum as post ID
  print "<id>tag:blogger.com,1999:post-$md5sum</id>";
  print "<published>$published</published>";
  print "<updated>$published</updated>";
  print '<category scheme="http://schemas.google.com/g/2005#kind" term="http://schemas.google.com/blogger/2008/kind#post"/>';

  ## tags:
  foreach my $kw (@post_kws) {
    my $term = escape_xml_markup($kw);
    print "<category scheme='http://www.blogger.com/atom/ns#' term='$term'/>";
  }

  my $safe_title = escape_xml_markup($post_title);
  print "<title type='text'>$safe_title</title>";

  print "<content type='html'>$post_content</content></entry>";

}

# Close the <feed> element opened by the preamble; no trailing newline is written.
print STDOUT '</feed>';

## end
	#!/usr/bin/perl
	# HTML to Blogger conversion script.
	#
	# Usage: blogspot-import.pl file1 file2 file3... > import-file.xml
	#
	# The resulting file import-file.xml is Atom-compatible
	# [ http://tools.ietf.org/html/rfc4287 ] and thus ready to be imported with
	# the Blogger import facility.
	#
	# Each HTML file is assumed to have the following structure:
	#
	# <title> one-line-for-title </title>
	# <!-- Tags: one-line-for-tags -->
	# </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
	# ... document body
	# </body></html>
	#
	# everything before <title> is ignored, tags are comma-separated,
	# and `##Published DateAndTime' is publication date/time.
	#
	# (c) 2011/10 t.przechlewski
	#

	use Digest::MD5 qw(md5_hex);

	# Emit the fixed feed preamble with a single print call: the XML
	# declaration, the <feed> start tag with its namespace declarations, and
	# the feed-level id, updated and title elements.  The two XML comments
	# inside the first literal are written in Polish; they say that id, title
	# and updated are required in feed/entry elements while the rest is
	# optional, and that this looks like the minimal markup.
	print(
	  '<?xml version="1.0" encoding="UTF-8"?>
	<!-- id, title/updated jest wymagane w elementach feed/entry reszta opcjonalna -->
	<!-- wyglada na minimalne oznakowanie -->
	<feed xmlns="http://www.w3.org/2005/Atom"
	xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/"
	xmlns:georss="http://www.georss.org/georss"
	xmlns:gd="http://schemas.google.com/g/2005"
	xmlns:thr="http://purl.org/syndication/thread/1.0">',
	  "<id>tag:blogger.com,1999:blog-1928418645181504144.archive</id>",
	  "<updated>2011-10-22T12:34:14.746-07:00</updated>",
	  "<title type='text'>pinkaccordions.blogspot.com</title>",

	  # The generator element is required by the Blogger import facility.
	  "<generator version='7.00' uri='http://www.blogger.com'>Blogger</generator>\n",
	);

	# Escape text for use as XML character data or inside a single-quoted
	# attribute value.  An ampersand that already begins one of the five
	# predefined XML entities or a numeric character reference is kept, so
	# input that was well-formed before is emitted unchanged.
	sub escape_xml_markup {
	  my ($text) = @_;
	  $text =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
	  $text =~ s/</&lt;/g;
	  $text =~ s/>/&gt;/g;
	  $text =~ s/'/&apos;/g;
	  return $text;
	}

	# Convert every HTML file named on the command line into one Atom <entry>
	# written to STDOUT; progress and diagnostics go to STDERR.
	#  - a file without a "##Published" stamp is skipped with a warning,
	#  - a file without a title or a Tags comment aborts the run,
	#  - a file that cannot be opened aborts the run.
	foreach my $post_file (@ARGV) {

	  # Per-file state.  Every variable is lexical and reset here, so nothing
	  # (the tag list in particular) leaks from one input file into the next.
	  my $post_title   = '';
	  my $post_content = '';
	  my $tags         = '';
	  my $published    = '';
	  my $body         = 0;  # true between </head><body> and </body></html>
	  my $in_pre       = 0;  # true while inside <pre> ... </pre>
	  my $rel_URLs     = 0;  # true once a suspicious src= attribute was seen

	  # Three-argument open with low-precedence "or": a high-precedence "||"
	  # here would attach the die to the file-name string, so it could
	  # never fire.
	  open(my $post_fh, '<', $post_file)
	    or die "*** cannot open $post_file ***\n";

	  print STDERR "\n$post_file opened!\n";

	  while (my $line = <$post_fh>) {
	    chomp $line;

	    if ($line =~ m{<title>(.+)</title>}) { $post_title = $1; next; }
	    if ($line =~ m{<!--[ \t]*Tags:[ \t]*(.+)[ \t]*-->}) { $tags = $1; next; }

	    if ($line =~ m{</head><body>}) {
	      $body = 1;
	      ## </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
	      ## The character class below stops at the first letter of the zone
	      ## suffix, so the zone name (CEST above) is not part of the value.
	      if ($line =~ /##Published[ \t]+:[ \t]+([0-9T\-\:]+).+##/) { $published = $1; }
	      print STDERR "Published: $published\n";
	      next;
	    }

	    if ($line =~ m{</body></html>}) { $body = 0; next; }

	    next unless $body;

	    ## Images from pinkaccordions.homelinux.org or with relative URLs
	    ## should be reported; both http:// and https:// count as absolute.
	    if ($line =~ /src[ \t]*=/
	        && ($line =~ /pinkaccordions\.homelinux\.org/ || $line !~ m{https?://})) {
	      $rel_URLs = 1;
	    }

	    ## Whichever of <pre> / </pre> comes last on the line decides the
	    ## state, so a one-line <pre>...</pre> does not leave the flag set.
	    if ($line =~ m{</?pre>}) {
	      $in_pre = rindex($line, '<pre>') > rindex($line, '</pre>') ? 1 : 0;
	    }

	    ## Inside <pre> line breaks are preserved; elsewhere each input line
	    ## is joined with a single space (the separator is essential).
	    $post_content .= $line . ($in_pre ? "\n" : ' ');
	  }

	  close($post_fh);

	  ### ### ###

	  if ($published eq '') {
	    warn "*** something wrong with: $post_file. Not published? Skipping....\n";
	    next;
	  }
	  if ($tags eq '' || $post_title eq '') {
	    die "*** something wrong with: $post_file (tags: $tags/title: $post_title)\n";
	  }
	  if ($rel_URLs) { warn "*** suspicious relative URIs: $post_file\n"; }

	  # The body is HTML carried inside <content type='html'>, so it is
	  # escaped in full, exactly once.
	  $post_content =~ s/\&/&amp;/g;
	  $post_content =~ s/</&lt;/g;
	  $post_content =~ s/>/&gt;/g;

	  print STDERR "Title: $post_title Tags: $tags\n";

	  # Comma-separated tags: strip surrounding whitespace, drop empty items.
	  my @post_kws = ();
	  foreach my $kw (split /,/, $tags) {
	    $kw =~ s/^\s+//;
	    $kw =~ s/\s+$//;
	    push @post_kws, $kw if length $kw;
	  }

	  my $md5sum = md5_hex($post_content);
	  print STDERR "MD5sum: $md5sum\n";

	  print "<entry>"; ## We use MD5sum as post ID
	  print "<id>tag:blogger.com,1999:post-$md5sum</id>";
	  print "<published>$published</published>";
	  print "<updated>$published</updated>";
	  print '<category scheme="http://schemas.google.com/g/2005#kind" term="http://schemas.google.com/blogger/2008/kind#post"/>';

	  ## tags:
	  foreach my $kw (@post_kws) {
	    my $term = escape_xml_markup($kw);
	    print "<category scheme='http://www.blogger.com/atom/ns#' term='$term'/>";
	  }

	  my $safe_title = escape_xml_markup($post_title);
	  print "<title type='text'>$safe_title</title>";

	  print "<content type='html'>$post_content</content></entry>";

	}

	# Close the <feed> element opened by the preamble; no trailing newline is written.
	print STDOUT '</feed>';

	## end