Skip to content

Instantly share code, notes, and snippets.

@IcyEyeG
Created October 20, 2019 11:08
Show Gist options
  • Save IcyEyeG/f513d5e69e19104106079844e27c6e33 to your computer and use it in GitHub Desktop.
Save IcyEyeG/f513d5e69e19104106079844e27c6e33 to your computer and use it in GitHub Desktop.
Storing a wayback-machine-downloader website in a git repository
#!/usr/bin/perl
#This is a perl script intended to be used with a JSON file generated by https://github.com/hartator/wayback-machine-downloader and aims to convert an entire website archived with the wayback machine into a git repository with commits that correspond to a modification in a snapshot file.
#Some limitations of wayback-machine-downloader are worked around, which makes this script quite slow:
# - wget is used so files are downloaded with their proper modification timestamps
# - HTML files are stripped of the code and links that the Internet Archive embeds in them
# - duplicates are found and discarded using MD5 comparison
#This is just a proof of concept that only works on Linux, and it uses quite a few hacks to get the job done
#If you want to convert or port this concept into a project, please follow GPLv3 (https://www.gnu.org/licenses/gpl-3.0.html)
use Cwd 'abs_path';
use strict;
use warnings;

# Main script.
# Reads the snapshot list given on the command line, creates a git repository
# in a folder named after the JSON file, then downloads every snapshot in
# chronological order. A snapshot is staged when its file is new or differs
# from the copy already in the repository; unchanged snapshots are only
# mentioned in the next commit message.

if ( !@ARGV || $ARGV[0] eq '--help' || $ARGV[0] eq '' ) {
    die "Usage:\n$0 json_file\n";
}

# Turns an arbitrary string into one single-quoted shell word, so that spaces,
# quotes, '&', '$' and backticks in paths, URLs and commit messages are passed
# through literally instead of being interpreted by the shell.
my $shell_quote = sub {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
};

# Set up the working folder based on the JSON file name and create a git repo there
my $json = $ARGV[0];
my ($folder) = $json =~ m/(.*)\.json$/;
if ( !defined $folder || $folder eq '' ) {
    die "Usage:\n$0 json_file\n(the file name must end in .json)\n";
}
if ( !-d $folder ) {
    mkdir $folder or die "Can't create folder $folder: $!\n";
}
$folder = abs_path($folder);
my $git = 'git -C ' . $shell_quote->($folder);
runcmd("$git init");

# Parse the JSON file, one record per line, into three parallel arrays.
# The trailing comma is optional so the last record of a valid JSON list is not lost.
my ( @file_url, @timestamp, @file_id );
open( my $json_fh, '<', $json ) or die "Can't open $json for reading: $!\n";
while ( my $line = <$json_fh> ) {
    chomp $line;
    if ( $line =~ /\{\"file_url\":\"(.*)\",\"timestamp\":(\d+),\"file_id\":\"(.*)"\},?/ ) {
        push @file_url,  $1;
        push @timestamp, $2;
        push @file_id,   $3;
    }
}
close $json_fh;
die "No snapshot records found in $json\n" unless @timestamp;

# Sort the three arrays together, oldest snapshot first
my @order = sort { $timestamp[$a] <=> $timestamp[$b] } 0 .. $#timestamp;
@file_url  = @file_url[@order];
@timestamp = @timestamp[@order];
@file_id   = @file_id[@order];

# State carried from one snapshot to the next
my $commit_timestamp = $timestamp[0];    # timestamp used as the date of the next commit
my $commit_message   = '';               # text accumulated for the next commit
my $base_url         = $file_url[0];     # URL of the oldest snapshot, stripped from HTML files
$base_url =~ s/:80\//\//g;

# Commits whatever is currently staged, dated with $commit_timestamp,
# then starts a fresh commit message.
my $author        = $shell_quote->('Internet Archive <info@archive.org>');
my $commit_staged = sub {
    my $git_date    = git_date($commit_timestamp);
    my $quoted_date = $shell_quote->($git_date);
    print $git_date . "\n";
    runcmd( "GIT_AUTHOR_DATE=$quoted_date GIT_COMMITTER_DATE=$quoted_date "
          . "$git commit -m "
          . $shell_quote->($commit_message)
          . " --author=$author" );
    $commit_message = '';
};

# Downloads one snapshot to the given path and cleans it up if it is an HTML document.
my $fetch = sub {
    my ( $url, $snapshot_time, $path ) = @_;
    getpage( $url, $snapshot_time, $path );
    my $filetype = runcmdout( 'file ' . $shell_quote->($path) );
    if ( defined $filetype && $filetype =~ /HTML document/ ) {
        clean_html( $path, $base_url );
    }
};

for my $i ( 0 .. $#file_url ) {
    # wayback-machine-downloader converts some special characters in file_url into
    # whitespaces in file_id, so they are converted into underscores
    $file_id[$i] =~ s/ /_/g;

    # Directory URLs must be stored into index.html
    if ( $file_url[$i] =~ /\/$/ ) {
        $file_id[$i] .= "/index.html";
    }

    # Where the file is saved in the repo: the part of file_id after the
    # 14-digit timestamp folder. Records without that prefix are skipped
    # rather than silently reusing the path of an earlier record.
    my ($relative_path) = $file_id[$i] =~ m/\d{14}\/(.*)/;
    if ( !defined $relative_path ) {
        warn "Skipping $file_url[$i]: file_id '$file_id[$i]' has no timestamp prefix\n";
        next;
    }
    my $destination  = "$folder/$relative_path";
    my $snapshot_url = "https://web.archive.org/web/$timestamp[$i]/$file_url[$i]";

    if ( -e $destination ) {
        # The file is already in the repo: download next to it and compare
        my $destination_new = $destination . "." . $timestamp[$i];
        $fetch->( $file_url[$i], $timestamp[$i], $destination_new );
        my $checksum     = read_checksum($destination);
        my $checksum_new = read_checksum($destination_new);
        my $unchanged    = defined $checksum
          && defined $checksum_new
          && $checksum eq $checksum_new;

        if ($unchanged) {
            print "MD5 match! Removing most recent file...\n";
            unlink $destination_new
              or warn "Can't remove $destination_new: $!\n";
            $commit_message .= "Skipped snapshot $snapshot_url due to no changes in file\n";
        }
        else {
            # Commit what has been staged so far before staging the newer
            # version, so each version of a file lands in its own commit.
            $commit_staged->();
            rename( $destination_new, $destination )
              or die "Can't replace $destination with $destination_new: $!\n";
            runcmd( "$git add " . $shell_quote->($destination) );
            $commit_message .= "Updated file from snapshot $snapshot_url File last modified on: " . read_date($destination);
            $commit_timestamp = $timestamp[$i];
        }
    }
    else {
        # First time this file is seen: download it straight into place
        $fetch->( $file_url[$i], $timestamp[$i], $destination );
        runcmd( "$git add " . $shell_quote->($destination) );
        $commit_message .= "Added file from snapshot $snapshot_url File last modified on: " . read_date($destination);
        $commit_timestamp = $timestamp[$i];
    }
}

# Commit whatever is still staged after the last snapshot
$commit_staged->();
# Subroutines:
sub getpage { #wgets a file from archive.org with proper modification timestamps
    # Parameters:
    #   $file_url    - original URL of the archived file
    #   $timestamp   - Internet Archive snapshot timestamp
    #   $destination - local path the file is written to; its parent folder
    #                  is created first if it does not exist
    my ($file_url, $timestamp, $destination) = @_;

    # Single-quote a string for the shell. Archived URLs routinely contain
    # '&', '?', ';' or spaces, which an unquoted command line would treat as
    # shell syntax instead of as part of the URL or path.
    my $quote = sub {
        my ($text) = @_;
        $text =~ s/'/'\\''/g;
        return "'$text'";
    };

    # NOTE(review): the Wayback Machine's raw-content modifier is normally
    # written "id_"; "_id" is kept as is because clean_html expects the
    # rewritten pages this currently returns - confirm before changing.
    $timestamp .= "_id";

    my $quoted_destination = $quote->($destination);
    my $quoted_snapshot    = $quote->("https://web.archive.org/web/$timestamp/$file_url");

    runcmd("mkdir -p \"\$(dirname $quoted_destination)\"");
    runcmd("wget -S -O $quoted_destination $quoted_snapshot");
}
sub clean_html { # strips the code and links embedded by the Internet Archive from an HTML file
    # Parameters:
    #   $file     - path of the HTML file, rewritten in place
    #   $base_url - URL prefix removed from the file wherever it appears
    # The modification timestamp of the file is read before the rewrite and
    # restored afterwards. Dies if the file cannot be read or written.
    my ($file, $base_url) = @_;
    my $date = read_date($file);

    open( my $in, "<", $file ) or die "Can't open $file for reading: $!\n";
    my $contents = do { local $/; <$in> };    # slurp the whole file
    close $in;
    $contents = '' unless defined $contents;

    # Script block inserted right after <head>
    $contents =~ s/<head><script src=\"\/\/archive\.org.*?<\!\-\- End Wayback Rewrite JS Include \-\->\n\n/<head>\n/sg;
    # Trailing "FILE ARCHIVED ON" comment and everything after it
    $contents =~ s/<!--\n FILE ARCHIVED ON.*//sg;
    # Absolute and relative /web/<timestamp>/ link prefixes
    $contents =~ s{https://web\.archive\.org/web/\d{14}.{0,2}_?/}{}g;
    $contents =~ s{/web/\d{14}.{0,2}_?/}{}g;
    # The base URL is matched literally (\Q...\E): it is a URL, not a regex.
    # It is skipped when empty, because an empty pattern makes Perl reuse the
    # last successful pattern instead of matching nothing.
    if ( defined $base_url && length $base_url ) {
        $contents =~ s/\Q$base_url\E//g;
    }
    # Links left empty by the removals above point at the site root
    $contents =~ s/<a href=\"\">/<a href=\"index.html\">/g;

    open( my $out, ">", $file ) or die "Can't create $file: $!\n";
    print {$out} $contents;
    close $out or die "Can't finish writing $file: $!\n";
    write_date($file, $date);
}
sub runcmd { #runs a custom linux command
    # Runs $cmd through the shell and prints whatever it wrote to STDOUT.
    # A command that cannot be started or that exits with a non-zero status
    # is reported on STDERR instead of being ignored; the script carries on.
    my $cmd = shift;
    my $cmdout = `$cmd`;
    if ( $? == -1 ) {
        warn "Command could not be started ($!): $cmd\n";
    }
    elsif ( $? != 0 ) {
        warn "Command failed with exit status " . ( $? >> 8 ) . ": $cmd\n";
    }
    print "$cmdout\n" if defined $cmdout && $cmdout ne "";
    return;
}
sub runcmdout { #runs a custom linux command and returns its STDOUT
    # Runs $cmd through the shell, prints whatever it wrote to STDOUT and
    # returns that output. A command that cannot be started or that exits
    # with a non-zero status is reported on STDERR instead of being ignored.
    my $cmd = shift;
    my $cmdout = `$cmd`;
    if ( $? == -1 ) {
        warn "Command could not be started ($!): $cmd\n";
    }
    elsif ( $? != 0 ) {
        warn "Command failed with exit status " . ( $? >> 8 ) . ": $cmd\n";
    }
    $cmdout = "" unless defined $cmdout;
    print "$cmdout\n" unless $cmdout eq "";
    return $cmdout;
}
sub read_checksum { #reads an MD5 checksum
    # Returns the MD5 checksum of $file as 32 lowercase hexadecimal digits,
    # or undef (with a warning) when $file is not a readable regular file.
    # The digest is computed in-process, so the file name never passes
    # through a shell and a failure cannot return a leftover regex capture.
    my $file = shift;
    require Digest::MD5;

    my $fh;
    if ( !-f $file || !open( $fh, '<', $file ) ) {
        warn "Can't read $file to compute its checksum\n";
        return;
    }
    binmode $fh;
    my $checksum = Digest::MD5->new->addfile($fh)->hexdigest;
    close $fh;
    return $checksum;
}
sub read_date { #reads the modification timestamp of a specified file
    # Returns the output of `stat -c '%y'` for $file, including its trailing
    # newline, or an empty string when stat produces no output.
    # stat is started directly with an argument list (no shell), so quotes,
    # '$' or backticks in the file name cannot alter the command.
    my $file = shift;
    my $stat;
    if ( !open( $stat, '-|', 'stat', '-c', '%y', '--', $file ) ) {
        warn "Can't run stat on $file: $!\n";
        return "";
    }
    my $cmdout = do { local $/; <$stat> };
    close $stat;
    return defined $cmdout ? $cmdout : "";
}
sub write_date { #writes a modification timestamp to a specified file
    # Sets the modification time of $file to $date with `touch -d` and
    # returns whatever touch wrote to STDOUT (an empty string when it wrote nothing).
    # touch is started directly with an argument list (no shell), so quotes,
    # '$' or backticks in the file name or date cannot alter the command.
    my ($file, $date) = @_;
    $date = "" unless defined $date;
    $date =~ s/\s+\z//;    # dates coming from read_date end with a newline
    my $touch;
    if ( !open( $touch, '-|', 'touch', '-d', $date, '--', $file ) ) {
        warn "Can't run touch on $file: $!\n";
        return "";
    }
    my $cmdout = do { local $/; <$touch> };
    close $touch;
    return defined $cmdout ? $cmdout : "";
}
sub git_date { #converts an Internet Archive timestamp into a date format git can understand
    # Takes a 14-digit YYYYMMDDhhmmss timestamp and returns
    # "YYYY-MM-DDThh:mm:ss+0000". The explicit +0000 offset is there because
    # Wayback Machine timestamps are in UTC, while git reads a date without
    # an offset in the local time zone of the machine running the script.
    # Dies when the argument does not contain 14 consecutive digits, instead
    # of building a date from the captures of some earlier regex match.
    my $date = shift;
    my @parts;
    if ( defined $date ) {
        @parts = $date =~ m/(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/;
    }
    if ( @parts != 6 ) {
        die "Invalid Internet Archive timestamp: "
          . ( defined $date ? "'$date'" : 'undef' ) . "\n";
    }
    my $git_date = sprintf( '%s-%s-%sT%s:%s:%s+0000', @parts );
    return $git_date;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment