Skip to content

Instantly share code, notes, and snippets.

@zengargoyle
Created March 1, 2012 12:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zengargoyle/1949635 to your computer and use it in GitHub Desktop.
Save zengargoyle/1949635 to your computer and use it in GitHub Desktop.
http://metatalk.metafilter.com/21489 - comment dump to bookmarks file
#!/usr/bin/env perl
use 5.010;
use strict;
use warnings;
use utf8::all;
our $VERSION = '0.01';
use Getopt::Long::Descriptive;
use Pod::Usage;
my ($opt, $usage) = describe_options(
'%c %o',
[ 'in|i=s' => 'input file (default: my-mefi-comments.txt)',
{ default => 'my-mefi-comments.txt' } ],
[ 'out|o=s' => 'output file (default: bookmarks.html)',
{ default => 'bookmarks.html' } ],
[],
[ 'stdout' => 'output to STDOUT only' ],
[ 'fancy|f' => 'make bookmarks more fancy' ],
[],
[ 'help|h|?' => 'print usage message and exit' ],
[ 'manual|man' => 'print manua page and exit' ],
);
if ($opt->{help} ) { print $usage->text; exit }
if ($opt->{manual}) { pod2usage({ verbose => 2, exit => 0 }) }
#
# here for demo purposes.
# t title, u uri, b blurb, s strftime, n time, c sub-site
# Emitter variables be overridden to customize output:
# {page,thread,comment}_{open,close}
#
if ($opt->{fancy}) {
$Emitter::thread_open = <<'_EOT_';
<DT><H3 FOLDED ADD_DATE="%n">%c: %t</H3>
<DL><p>
_EOT_
$Emitter::comment_open = <<'_EOT_';
<DT><A HREF="%u" ADD_DATE="%n" LAST_VISIT="%n" LAST_MODIFIED="%n">(%{%Y-%m-%d %T}m) %b</A>
_EOT_
}
my $page = bookmarks_from_file($opt->{in});
if ($opt->{fancy}) {
$page->title('My Fancy Bookmarks');
}
if ( $opt->{stdout} ) {
print Emitter->emit_as_text($page);
}
else {
Emitter->emit_to_file($page, $opt->{out});
}
exit;
#######################################################################
#
# create our Page{ Thread{ Comment+ }+ } structure.
#
sub bookmarks_from_file {
my $filename = shift;
open my $fh, '< :utf8 :crlf', $filename
or die "Can't open '$filename' for reading: $!\n";
return bookmarks_from_fh($fh);
}
sub bookmarks_from_fh {
my $fh = shift;
# TODO pass in a class composed of a base Page class
# with additional roles composited in.
my $Class = shift || 'Page';
local $/ = "\n-----\n";
my %thread_for_comment;
my $page = $Class->new;
while (my $rec = <$fh>) {
chomp $rec;
my ($date, $url, $text) = split /\n/, $rec, 3;
# take a comment post and add it to the thread
# which itself is added to the page (of threads of comments).
my $comment = Comment->new(
timestamp => $date,
uri => $url,
text => $text,
);
my $title = $comment->title;
my $thread = $thread_for_comment{ $title };
unless ( $thread ) {
# clone the post url and remove the #anchor (fragment).
my $thread_uri = $comment->uri->clone;
$thread_uri->fragment(undef);
$thread = Thread->new(
uri => $thread_uri,
);
# add thread to the page and keep track of it for later.
$page->add_entry($thread);
$thread_for_comment{ $title } = $thread;
}
$thread->add_entry($comment);
}
return $page;
}
BEGIN {
#
# classes and roles. Page{ Thread{ Comment+ }+ }
#
package Entries::Role {
use Moo::Role;
has entries => (
is => 'ro',
default => sub { [] },
);
sub add_entry {
my $self = shift;
push @{ $self->entries }, @_;
}
}
package Page {
use Moo;
with 'Entries::Role';
has title => (
is => 'rw',
default => sub { 'Bookmarks' },
);
}
package URI::Role {
use Moo::Role;
use URI;
has uri => (
is => 'ro',
isa => sub {
die "$_[0] is not a URI!"
unless $_[0]->isa('URI');
},
coerce => sub {
URI->new($_[0]);
},
required => 1,
);
has title => (
is => 'ro',
lazy => 1,
builder => '_build_title',
);
sub _build_title {
my $self = shift;
my $t = $self->uri->path; # /blah/the-long-title
$t =~ s!.*/!!; # the-long-title
$t =~ s!-! !g; # the long title
return $t;
}
}
package Thread {
use Moo;
with 'URI::Role';
with 'Entries::Role';
}
package Comment {
use Moo;
use Time::Piece;
use HTML::Strip;
use Text::Wrap;
use utf8;
with 'URI::Role';
has timestamp => (
is => 'ro',
isa => sub {
die "$_[0] is not a Time::Piece!"
unless $_[0]->isa('Time::Piece');
},
coerce => sub {
(my $t = $_[0]) =~ s/\.\d+$//; # remove fractional seconds.
Time::Piece->strptime($t, '%Y-%m-%d %T');
},
required => 1,
handles => {
strftime => 'strftime',
},
);
has text => (
is => 'ro',
required => 1,
);
has blurb => (
is => 'ro',
lazy => 1,
builder => '_build_blurb',
);
sub _build_blurb {
my $self = shift;
# strip HTML
my $p = HTML::Strip->new;
my $b = $p->parse( $self->text );
$p->eof;
# HTML::Strip being lame.
utf8::decode($b);
# do some smart wrapping to ~72 columns.
# then take the first line as our blurb.
$b = wrap('', '', $b);
$b = (split /\n/, $b)[0];
return $b;
}
}
package Emitter {
use utf8::all;
use String::Formatter method_stringf => {
-as => '_stringf',
codes => {
t => 'title',
b => 'blurb',
u => 'uri',
m => 'strftime',
n => sub { time },
c => sub {
my $h = $_[0]->uri->host;
$h =~ s/\..*//;
__PACKAGE__->sitemap($h);
},
},
};
my %sitemap = (
www => 'MetaFilter',
ask => 'AskMeFi',
projects => 'Projects',
music => 'Music',
jobs => 'Jobs',
irl => 'IRL',
metatalk => 'MetaTalk',
);
sub sitemap {
my ($class, $host) = @_;
return $sitemap{$host};
}
our $page_open = <<'_EOT_';
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!--This is an automatically generated file.
It will be read and overwritten.
Do Not Edit! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<Title>%t</Title>
<H1>%t</H1>
<DL>
_EOT_
our $page_close = <<'_EOT_';
</DL>
_EOT_
our $thread_open = <<'_EOT_';
<DT><H3 FOLDED ADD_DATE="%n">%t</H3>
<DL><p>
_EOT_
our $thread_close = <<'_EOT_';
</DL><p>
_EOT_
our $comment_open = <<'_EOT_';
<DT><A HREF="%u" ADD_DATE="%n" LAST_VISIT="%n" LAST_MODIFIED="%n">%b</A>
_EOT_
our $comment_close = '';
sub emit_as_text {
my ($self, $page) = @_;
my $output = "";
open my $fh, '>', \$output or die "Can't open a string!";
$self->emit_to_fh($page, $fh);
# so, the logic goes: when using Perl::IO::scalar to do IO to a
# string, the string is treated as a File, and files are by nature
# a stream of *bytes*. so we need to turn that string of bytes
# into a true unicode thing. otherwise when we print it back out
# to a utf8 filehandle (stdout or a file) Perl will notice any
# non-ASCII (>127) characters and end up double-encoding them
# to utf8 again. the alternative would be to set the output
# filehandle to :std or :raw when outputting this string.
# sigh, unicode can be such a PITA.
utf8::decode($output);
return $output;
}
sub emit_to_file {
my ($self, $page, $filename) = @_;
open my $fh, '> :utf8', $filename
or die "Can't open '$filename' for writing $!";
$self->emit_to_fh($page, $fh);
}
sub emit_to_fh {
my ($self, $page, $fh) = @_;
# open, (open, (open, close)+ close)+, close
print $fh _stringf $page_open, $page;
for my $t ( @{ $page->entries } ) {
print $fh _stringf $thread_open, $t;
for my $c ( @{ $t->entries } ) {
print $fh _stringf $comment_open, $c;
print $fh _stringf $comment_close, $c;
}
print $fh _stringf $thread_close, $t;
}
print $fh _stringf $page_close, $page;
}
}
} # BEGIN
__END__
=head1 NAME
my-mefi-bookmarks - convert Metafilter comment dump to Netscape bookmarks file.
=head1 VERSION
0.01
=head1 SYNOPSIS
my-mefi-bookmarks
my-mefi-bookmarks --fancy --out ~/public_html/mefi-bookmarks.html
=head1 OPTIONS
Some options can be abbreviated.
=over 4
=item B<--in> I<filename>
Specify an alternate input file. Default: F<my-mefi-comments.txt>
=item B<--out> I<filename>
Specify an alternate output file. Default: F<bookmarks.html>
=item B<--stdout>
Output to STDOUT instead of a file.
=item B<--fancy>
Make the output a bit fancier. This is also a little demo of customization possibilities. See the source for ideas.
=item B<--help>
Show a brief help message and exit.
=item B<--manual>
Show this manual page and exit.
=back
=cut
@zengargoyle
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment