Skip to content

Instantly share code, notes, and snippets.

@mmattozzi
Created May 17, 2010 14:25
Show Gist options
  • Save mmattozzi/403814 to your computer and use it in GitHub Desktop.
Save mmattozzi/403814 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use strict;
use warnings;
use LWP::Simple;
use HTML::LinkExtor;
use Data::Dumper;
use DBI;
# Crawl the Lostpedia transcript pages reachable from the "Pilot, Part 1"
# transcript and store every "<p>SPEAKER: text" line in the `lost` table
# (see process_content).

# RaiseError makes every failing DBI call die, so no per-call checks are needed.
my $dbh = DBI->connect("DBI:mysql:db", "user", "password", {RaiseError => 1});
my $url = "http://lostpedia.wikia.com/wiki/Pilot,_Part_1_transcript";

# LWP::Simple::get returns undef on any failure. Without the start page there
# is nothing to crawl, so stop instead of pattern-matching against undef.
my $link_content = get($url);
defined $link_content
    or die "Could not fetch transcript index page $url\n";

# Transcript URL => episode title. The start page is itself a transcript,
# so it is seeded by hand.
my %transcripts = ( $url => "Pilot, Part 1" );

# Pick up every anchor whose href ends in "_transcript", whose title ends in
# " transcript" and whose link text consists only of digits, spaces and
# slashes. The captures are copied only after a successful match, so a
# non-matching link can never reuse stale $1/$2 values from an earlier one.
while ($link_content =~ m{<a href="([^"]+?_transcript)" title="([^"]+?) transcript">[0-9 /]+?</a>}g) {
    my ($path, $title) = ($1, $2);

    # Decode "&amp;" last so that a literal "&amp;#39;" is not decoded twice.
    $title =~ s/&#39;/'/g;
    $title =~ s/&amp;/&/g;

    $transcripts{"http://lostpedia.wikia.com" . $path} = $title;
}

for my $link (keys %transcripts) {
    my $title = $transcripts{$link};
    print $title . " - " . $link . "\n";

    my $ep_transcript = get($link);
    if (defined $ep_transcript) {
        process_content($ep_transcript, $title);
    }
    else {
        # A single failed download should not abort the whole crawl.
        warn "Skipping $title: could not fetch $link\n";
    }

    sleep(1);    # pause one second between requests
}

$dbh->disconnect();
# process_content($content, $title)
#
# Scans the HTML of one transcript page line by line. Every line matching
# "<p>SPEAKER: text" is printed and inserted into the `lost` table through
# the file-level $dbh handle, together with the episode title and a running
# line number that starts at 1 for each call.
#
#   $content - HTML of the transcript page; may be undef when the download
#              failed (LWP::Simple::get returns undef on error)
#   $title   - episode title stored in the `episode` column
#
# Returns the number of rows inserted (0 when $content is undef).
sub process_content {
    my ($content, $title) = @_;

    # Nothing to parse; return before preparing a statement or splitting undef.
    return 0 unless defined $content;

    # NOTE(review): presumably this limit filters out ordinary prose
    # paragraphs that merely contain ": " -- confirm against real pages.
    my $max_person_length = 100;

    my $num = 1;
    my $update = $dbh->prepare("insert into lost (episode, person, line, num) values (?, ?, ?, ?)");

    for my $line (split /\n/, $content) {
        next unless $line =~ /<p>(.*?): (.*)$/;
        my ($person, $the_line) = ($1, $2);

        # Drop a bracketed annotation (and the whitespace before it) from
        # the speaker name.
        $person =~ s/\s*\[.*\]//;
        next unless length($person) < $max_person_length;

        print $person . " - " . $the_line . "\n";
        $update->execute( $title, lc($person), $the_line, $num );
        $num++;
    }

    return $num - 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment