Skip to content

Instantly share code, notes, and snippets.

@cromedome
Last active February 21, 2019 03:07
Show Gist options
  • Save cromedome/1d4546da87ff4f6acb9ead2c514ce7a3 to your computer and use it in GitHub Desktop.
Save cromedome/1d4546da87ff4f6acb9ead2c514ce7a3 to your computer and use it in GitHub Desktop.
Convert Tumblr blog entries to Markdown
#!/usr/bin/env perl
use lib 'lib';
use utf8;
use Modern::Perl;
use DateTime;
use WWW::Mechanize;
use Text::Unidecode;
use Cpanel::JSON::XS;
use HTML::FormatText;
use feature qw( signatures );
no warnings 'experimental::signatures';
# Main script: fetch the blog's posts and write one Markdown file (with a YAML
# front-matter header) per post into the output directory.
say "Fetching posts...";

# NOTE(review): the API key is embedded in this URL, so anyone who can read the
# file can reuse it. Consider loading it from the environment instead.
my $blog_url = "http://api.tumblr.com/v2/blog/crome-plated.tumblr.com/posts?api_key=uTcP0Xdtb0e0CmuLVsOZ8jentkORYVN2LvINs4lNrG50jy1VE8";
my @posts = get_posts();

# All generated files are written below this directory (relative to the
# current working directory); create it on first use.
my $output_dir = 'scraped_content';
if ( !-d $output_dir ) {
    mkdir $output_dir or die "Can't create $output_dir: $!";
}

foreach my $post ( @posts ) {
    my $body     = get_body( $post );
    my $title    = get_title( $post );
    my $date     = get_date( $post );
    my $tags     = get_tags( $post );
    my $filename = get_filename( $title );
    my $path     = "$output_dir/$filename";

    # Create blog file
    print "Creating new post: $title ($date)... ";

    # Tags are not transliterated to ASCII, so write through an explicit UTF-8
    # layer to keep non-ASCII characters intact.
    open my $blog_fh, '>:encoding(UTF-8)', $path
        or die "Can't open $path: $!";

    # NOTE(review): the title is emitted as a plain (unquoted) YAML scalar; a
    # title containing ': ' or ' #' may not parse as intended.
    print {$blog_fh} join "\n",
        '---',
        "title: $title",
        'hero_image: "hero.jpg"',
        "tags: $tags",
        "date: $date",
        '---',
        $body;

    # Buffered write errors only surface at close, so check it.
    close $blog_fh or die "Can't finish writing $path: $!";
    say "done!";
}

my $dt = DateTime->now;
say "Completed at " . $dt->ymd('/') . " " . $dt->hms;

# Reaching this point means every post was written: report success.
exit 0;
# Build the plain-text body for a post.
#
#   $post - hashref for one post; `type` and `body` are read from it.
#
# Only posts whose type is 'text' contribute a body; every other type (or a
# missing body) yields an empty body. Anchors of the exact form
# <a href="URL" target="_blank">TEXT</a> are rewritten as Markdown links, the
# remaining HTML is rendered to text wrapped at column 72, and the result is
# transliterated to ASCII.
#
# Returns the formatted body string.
sub get_body( $post ) {
    my $is_text = ( $post->{ type } // '' ) eq 'text';
    my $body    = $is_text ? ( $post->{ body } // '' ) : '';

    # The URL capture stops at the closing quote of the href attribute, so a
    # match can never start in one anchor and finish in a later one.
    $body =~ s/<a href="([^"\n]+)" target="_blank">(.+?)<\/a>/\[$2\]\($1\)/g;

    $body = HTML::FormatText->new( leftmargin => 0, rightmargin => 72 )->format_string( $body );
    return unidecode( $body );
}
# Return the post's title transliterated to ASCII.
#
#   $post - hashref for one post; `title` is read from it.
#
# A post without a title yields the empty string rather than undef, so callers
# can interpolate the result without "uninitialized value" warnings.
sub get_title( $post ) {
    return unidecode( $post->{ title } // '' );
}
# Reformat the post's `date` field for the front matter.
#
#   $post - hashref for one post; `date` is read from it.
#           Presumably formatted like "YYYY-MM-DD HH:MM:SS GMT" -- verify
#           against the API response.
#
# Returns the transliterated date with its first space replaced by 'T' and a
# trailing " GMT" replaced by "-00:00".
sub get_date( $post ) {
    my $stamp = unidecode( $post->{ date } );

    for ( $stamp ) {
        s/ /T/;                # no /g: only the first space is replaced
        s/ GMT$/\-00:00/;      # swap the zone suffix for a numeric offset
    }

    return $stamp;
}
# Render the post's tags as an inline YAML sequence of double-quoted strings,
# e.g. ["perl", "tumblr"].
#
#   $post - hashref for one post; its `tags` entry, when present, is an
#           arrayref of tag strings.
#
# Returns the sequence as a string; '[]' when the post has no tags.
sub get_tags( $post ) {
    # A missing tags entry means "no tags" instead of dying on an undef
    # dereference.
    my @tags = ( $post->{ tags } // [] )->@*;

    # Backslash-escape backslashes and double quotes so a tag containing
    # either cannot end the quoted YAML scalar early.
    my @quoted = map {
        my $escaped = s/(["\\])/\\$1/gr;
        qq{"$escaped"};
    } @tags;

    return '[' . join( ', ', @quoted ) . ']';
}
# Derive a filesystem-safe Markdown filename from a post title.
#
#   $title - the post title (may be empty or undef).
#
# Spaces become hyphens, every other non-word character is dropped, and the
# result is lower-cased with a ".md" suffix. When nothing usable is left
# (empty, undef, or all-punctuation title) the name falls back to
# "untitled.md" instead of the hidden file ".md".
sub get_filename( $title ) {
    my $slug = $title // '';

    $slug =~ s/ /_/g;     # shield spaces from the \W strip below
    $slug =~ s/\W//g;     # drop punctuation and other non-word characters
    $slug =~ s/_/-/g;     # underscores (shielded spaces included) become hyphens

    $slug = 'untitled' if $slug eq '';

    return lc "${slug}.md";
}
# Fetch every post of the blog addressed by $blog_url.
#
# The API returns posts one page at a time, so the request is repeated with an
# increasing `offset` query parameter until a page comes back empty or the
# reported `total_posts` count has been reached.
#
# Returns the list of post hashrefs (empty when the blog has no posts).
# Dies when the JSON payload cannot be decoded.
# NOTE(review): HTTP failures are expected to die via WWW::Mechanize's default
# autocheck -- confirm for the installed version.
sub get_posts {
    my $mech = WWW::Mechanize->new;

    # $blog_url already carries a query string here, but do not rely on it.
    my $separator = $blog_url =~ /\?/ ? '&' : '?';

    my @posts;
    while ( 1 ) {
        my $offset   = scalar @posts;
        my $response = $mech->get( "${blog_url}${separator}offset=${offset}" );

        # decode_json expects UTF-8 *bytes*: undo any transfer encoding but
        # leave the charset alone.
        my $payload = decode_json( $response->decoded_content( charset => 'none' ) );

        my $result = $payload->{ response };
        my $page   = ref( $result ) eq 'HASH' ? ( $result->{ posts } // [] ) : [];
        last unless @$page;

        push @posts, @$page;

        my $total = $result->{ total_posts };
        last if defined $total && @posts >= $total;
    }

    return @posts;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment