Skip to content

Instantly share code, notes, and snippets.

@evandhoffman
Created March 13, 2012 18:23
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save evandhoffman/2030469 to your computer and use it in GitHub Desktop.
Save evandhoffman/2030469 to your computer and use it in GitHub Desktop.
Perl script to fetch all topics in a phpBB forum and save them to disk
#!/usr/bin/perl
#
# Crawl a phpBB forum by topic id and save every page of every readable
# topic to the current directory.
#
# For each id from 1 to $max_topic_id the script requests viewtopic.php,
# classifies the response (missing topic, unauthorised, login prompt,
# missing forum, or a real topic), saves the first page, and then fetches
# the remaining pages using the "&start=" offset.  A per-outcome tally is
# printed to STDERR when the crawl finishes.
#
# Before running, replace the '<sid goes here>' and '<host>' placeholders
# and point $base_url at the target forum.
use strict;
use warnings;
require LWP::UserAgent;
require HTTP::Message;
use POSIX qw/ceil/;

my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:10.0.2) Gecko/20100101 Firefox/10.0.2');
$ua->timeout(10);
my $sid = '<sid goes here>';
# NOTE: '<host>' is a placeholder; it must be replaced with the forum's host
# name, otherwise every request carries a bogus Host header.
$ua->default_header('Host' => '<host>');
$ua->default_header('Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
$ua->default_header('Accept-Language' => 'en-us,en;q=0.5');
# Advertise only the encodings this Perl installation can actually decode,
# so that every body we receive can be decompressed before it is saved.
my $decodable = HTTP::Message::decodable();
$ua->default_header('Accept-Encoding' => $decodable) if defined $decodable && length $decodable;
$ua->default_header('DNT' => '1');
$ua->default_header('Connection' => 'keep-alive');
$ua->default_header('Cache-Control' => 'max-age=0');

my $max_topic_id = 20000;
my $posts_per_topic_page = 30;
my $topics_per_forum_page = 50;
my $output_file_name_base = 'sitename-';
my $base_url = 'http://hostname.com/forums/viewtopic.php';
my %status = ();

# Return the response body with any Content-Encoding (gzip/deflate) removed
# but the character set left untouched, so the bytes written to disk match
# the charset the page itself declares.  Falls back to the raw content if
# decoding fails.
sub _body_bytes {
    my ($response) = @_;
    my $bytes = $response->decoded_content(charset => 'none');
    return defined $bytes ? $bytes : $response->content;
}

# Write $bytes to $filename verbatim; dies with the filename and OS error
# on any failure (including buffered write errors that surface at close).
sub _save_bytes {
    my ($filename, $bytes) = @_;
    open(my $fh, '>:raw', $filename)
        or die "Cannot open $filename for writing: $!";
    print {$fh} $bytes
        or die "Cannot write to $filename: $!";
    close($fh)
        or die "Cannot close $filename: $!";
    return;
}

for (my $i = 1; $i <= $max_topic_id; $i++) {
    my $url = $base_url . '?sid=' . $sid . '&t=' . $i;
    my $response = $ua->get($url);

    if (!$response->is_success) {
        # Method calls do not interpolate inside strings, so concatenate.
        print STDERR "Error fetching $url: " . $response->status_line . "\n";
        $status{'error:' . $response->status_line}++;
        next;
    }

    # Character-decoded text, used only for pattern matching below.
    my $page_content = $response->decoded_content;
    $page_content = $response->content unless defined $page_content;

    if ($page_content =~ /topic does not exist/) {
        print STDERR "$url: Topic $i does not exist\n";
        $status{'invalid topic'}++;
        next;
    }
    if ($page_content =~ /not authorised/) {
        print STDERR "$url: Not authorized\n";
        $status{'unauthorized'}++;
        next;
    }
    if ($page_content =~ /requires you to be registered and logged in/) {
        print STDERR "$url: Login prompt.\n";
        $status{'login message'}++;
        next;
    }
    if ($page_content =~ /forum you selected does not exist/) {
        print STDERR "$url: Forum does not exist\n";
        $status{'forum dne'}++;
        next;
    }

    $status{'valid post'}++;

    # The page shows the total as e.g. "[ 621 posts ]" or "[ 1,234 posts ]".
    my ($posts_in_topic) = $page_content =~ / ([\d,]+) posts? /;
    $posts_in_topic = '' unless defined $posts_in_topic;
    $posts_in_topic =~ tr/,//d;    # strip every thousands separator
    $posts_in_topic = 0 unless $posts_in_topic =~ /\A\d+\z/;

    # Build a filesystem-friendly title: drop punctuation, join words with "_".
    my ($title) = $page_content =~ /<title>(.+)<\/title>/;
    $title = 'untitled' unless defined $title;
    $title =~ s/[^\w\s]+//g;
    $title =~ s/\s+/_/g;

    # First post date, e.g.
    # <td class="postbottom" align="center">Sat Nov 01, 2008 9:00 pm</td>
    my ($month, $day, $year) =
        $page_content =~ />\w{3} (\w{3}) (\d{2}), (\d{4}) \d+:\d{2} ..<\/td/;
    my $date_str = defined $year ? "$year-$month-$day" : 'unknown-date';

    my $output_filename = sprintf('%stopic-%05d-page-000-%s-%s.html',
        $output_file_name_base, $i, $date_str, $title);
    _save_bytes($output_filename, _body_bytes($response));
    print "Saved $output_filename ($posts_in_topic total posts)\n";

    next unless $posts_in_topic > $posts_per_topic_page;

    # Page 0 is already saved; fetch pages 1 .. total_pages-1 by post offset.
    my $total_pages = ceil($posts_in_topic / $posts_per_topic_page);
    for (my $j = 1; $j < $total_pages; $j++) {
        my $start = $j * $posts_per_topic_page;
        my $page_url = $url . '&start=' . $start;
        my $page_filename = sprintf('%stopic-%05d-page-%03d.html',
            $output_file_name_base, $i, $j);
        my $page_response = $ua->get($page_url);
        if ($page_response->is_success) {
            # Save the decompressed body rather than streaming the raw
            # (possibly gzip-encoded) bytes straight to disk.
            _save_bytes($page_filename, _body_bytes($page_response));
            print "Saved $page_filename\n";
        } else {
            print STDERR "Error fetching $page_url: " . $page_response->status_line . "\n";
        }
    }
}

# Report how many topic ids fell into each outcome bucket.
print STDERR "Summary:\n";
for my $outcome (sort keys %status) {
    print STDERR "  $outcome: $status{$outcome}\n";
}
@slrslr
Copy link

slrslr commented Sep 15, 2016

In my case it always returned only errors like:
Error fetching http://myforumurlhere.com/viewtopic.php?sid=1174568dc0f59879f5cf8610ed3bd82f&t=9: {HTTP::Response=HASH(0x2c44d30)->status_line}
Error fetching http://www.myforumurlhere.com/viewtopic.php?sid=1174568dc0f59879f5cf8610ed3bd82f&t=10: {HTTP::Response=HASH(0x2a7c6b8)->status_line}
even though the topics exist when I open the mentioned URLs in a browser. I tried it on 2 sites and always got errors; I can't see any saved files in the script's directory.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment