Skip to content

Instantly share code, notes, and snippets.

@nunq
Created May 3, 2024 09:36
Show Gist options
  • Save nunq/a2106eaec7b43cf96fae6fcb12fd34de to your computer and use it in GitHub Desktop.
Save nunq/a2106eaec7b43cf96fae6fcb12fd34de to your computer and use it in GitHub Desktop.
Parse a directory of tumblr backup html posts and output as JSON
#!/usr/bin/perl
#
# tumblrparse.pl
# Parse a directory of tumblr backup html posts and output as JSON.
# Ignoring title and media.
#
use warnings;
use strict;
use JSON::PP;
# Require exactly one argument: the directory holding the exported posts.
if (@ARGV != 1) {
    die "Usage: $0 <path>\n";
}
my $path = $ARGV[0];

# Read the whole listing up front so the directory handle can be released
# immediately instead of staying open for the entire run.
opendir(my $DIR, $path) or die "Can't open $path: $!";
my @files = sort readdir($DIR);
closedir($DIR);

# One encoder shared by all posts. canonical() sorts object keys so the
# output is byte-for-byte reproducible between runs.
my $json = JSON::PP->new->canonical;

# Each element is the JSON text of a single-key object:
#   { "<file name>": { "post": ..., "time": ... } }
my @posts;

# Iterate with for rather than "while (my $file = shift @files)": the latter
# stops at the first false value, so a file named "0" would end the run early.
for my $file (@files) {
    next if $file =~ /^\./;    # skip ".", ".." and hidden entries

    my $full_path = "$path/$file";
    next unless -f $full_path;    # only regular files can contain a post

    open(my $fh, "<", $full_path) or die "Can't open $file: $!";
    my $content = do { local $/; <$fh> };
    close($fh);

    # A failed read yields undef; treat it like an empty document so the
    # matches below do not warn about an uninitialized value.
    $content = '' unless defined $content;

    # Title extraction is intentionally disabled (titles and media are ignored):
    #   m{<h1>(.*?)</h1>}s

    # Assign from the match in list context. "my $x = $1 if ..." is documented
    # as undefined behaviour and can leak the previous file's value into this
    # iteration when the pattern does not match. Here a failed match simply
    # leaves the variable undef, which is encoded as JSON null.
    #
    # The post pattern is deliberately greedy: it spans from the first <p> to
    # the last </p>, so posts with several paragraphs are captured whole.
    my ($post) = $content =~ m{<p>(.*)</p>}s;

    # Non-greedy: stop at the first " </span>" after the timestamp instead of
    # running on to the last one in the document.
    my ($time) = $content =~ m{<span id="timestamp"> (.*?) </span>}s;

    push @posts, $json->encode({
        $file => {
            "time" => $time,
            "post" => $post,
        },
    });
}

# Emit a JSON array with one post object per line.
print "[" . join(",\n", @posts) . "]\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment