Created
October 20, 2019 11:08
-
-
Save IcyEyeG/f513d5e69e19104106079844e27c6e33 to your computer and use it in GitHub Desktop.
Storing a wayback-machine-downloader website in a git repository
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#This is a perl script intended to be used with a JSON file generated by https://github.com/hartator/wayback-machine-downloader and aims to convert an entire website archived with the wayback machine into a git repository with commits that correspond to a modification in a snapshot file. | |
#Some limitations of wayback-machine-downloader are dealt with, making this script quite slow: | |
# - wget is used so files are downloaded with proper modification timestamp | |
# - HTML files are stripped of the Internet Archive code and links embedded in them
# - duplications are found and discarded using MD5 comparison | |
#This is just a proof of concept that only works in Linux and it uses quite a few hacks to get it done | |
#If you want to convert or port this concept into a project, please follow GPLv3 (https://www.gnu.org/licenses/gpl-3.0.html) | |
use Cwd 'abs_path'; | |
# Show the usage text and stop unless a first argument other than --help was given.
unless ( ( $ARGV[0] ne "--help" ) && ( $ARGV[0] ne "" ) ) {
    die "Usage:\n$0 json_file\n";
}
# Set up the working folder, named after the JSON file, and create a git repo there.
$json = $ARGV[0];

# The folder name is the JSON file name without its ".json" suffix. Stop early
# when the suffix is missing: without a successful match there is no capture
# to take the folder name from.
$json =~ m/(.*)\.json$/
    or die "Expected a file name ending in .json, got '$json'\n";
$folder = $1;

# Use the mkdir builtin so a name containing spaces or shell metacharacters is
# not re-parsed by a shell. An already existing folder is reused.
( -d $folder ) or mkdir($folder) or die "Can't create folder $folder: $!\n";
$folder = abs_path("$folder");
runcmd("git -C \"$folder\" init");
# Parse the JSON file and collect its entries into three parallel arrays:
# element N of @file_url, @timestamp and @file_id all describe the same snapshot.
@file_url  = ();
@timestamp = ();
@file_id   = ();
open( my $json_fh, "<", $json ) or die "Can't open $json for reading: $!\n";
while ( my $line = <$json_fh> ) {
    chomp $line;

    # One entry per line is expected; lines that do not look like an entry
    # (such as the list brackets) are ignored. The comma after the closing
    # brace is optional because a well-formed JSON list has no comma after
    # its last entry, and that entry must not be dropped.
    if ( $line =~ /\{\"file_url\":\"(.*)\",\"timestamp\":(\d*),\"file_id\":\"(.*)"\},?/ ) {
        push @file_url,  $1;
        push @timestamp, $2;
        push @file_id,   $3;
    }
}
close $json_fh;
# Put the three parallel arrays into ascending timestamp order.
# @index holds the positions of the entries in their new order.
@index = sort { $timestamp[$a] <=> $timestamp[$b] } ( 0 .. $#timestamp );
for my $list ( \@file_url, \@timestamp, \@file_id ) {
    # Copy through a temporary so the array is never read while being rewritten.
    my @reordered = @{$list}[@index];
    @{$list} = @reordered;
}
# Set up some additional variables and solve a few formatting problems.
# Without any parsed entries there is nothing to download or commit.
die "No snapshot entries found in $json\n" unless @file_url;
$commit_timestamp = $timestamp[0];
$commit_message   = "";
$base_url         = $file_url[0];
$base_url =~ s/:80\//\//g;    # drop an explicit ":80" port from the URL

for my $i ( 0 .. $#file_url ) {

    # wayback-machine-downloader converts some special characters in file_url
    # into whitespaces in file_id, so they are converted into underscores.
    $file_id[$i] =~ s/ /_/g;

    # Directory urls must be stored into index.html.
    if ( $file_url[$i] =~ /\/$/ ) {
        $file_id[$i] .= "/index.html";
    }

    # The file is saved in the repo under the part of file_id that follows the
    # 14-digit timestamp. An entry without that prefix is skipped: guessing a
    # path for it could overwrite an unrelated file.
    my ($relative_path) = $file_id[$i] =~ m/\d{14}\/(.*)/;
    if ( !defined $relative_path ) {
        warn "Skipping $file_url[$i]: file_id '$file_id[$i]' has no timestamp prefix\n";
        next;
    }
    my $destination = "$folder/$relative_path";
    my $snapshot    = "https://web.archive.org/web/$timestamp[$i]/$file_url[$i]";

    # Check whether a file is new, an update or a duplicate, and either add it
    # to the repo or skip it.
    if ( -e $destination ) {

        # Download next to the existing copy so both can be compared.
        my $destination_new = $destination . "." . $timestamp[$i];
        getpage( $file_url[$i], $timestamp[$i], $destination_new );
        my $filetype = runcmdout( "file " . _shell_quote($destination_new) );
        if ( $filetype =~ /HTML document/ ) {
            clean_html( $destination_new, $base_url );
        }
        my $checksum     = read_checksum($destination);
        my $checksum_new = read_checksum($destination_new);
        if ( $checksum eq $checksum_new ) {
            print "MD5 match! Removing most recent file...\n";
            unlink($destination_new)
                or warn "Can't remove $destination_new: $!\n";
            $commit_message .= "Skipped snapshot $snapshot due to no changes in file\n";
        }
        else {
            # Commit everything staged so far before replacing the file, then
            # start collecting the next commit with the updated file.
            _commit_pending( $folder, $commit_timestamp, $commit_message );
            $commit_message = "";
            rename( $destination_new, $destination )
                or warn "Can't replace $destination with $destination_new: $!\n";
            runcmd( "git -C " . _shell_quote($folder) . " add " . _shell_quote($destination) );
            $commit_message .= "Updated file from snapshot $snapshot File last modified on: " . read_date($destination);
            $commit_timestamp = $timestamp[$i];
        }
    }
    else {
        getpage( $file_url[$i], $timestamp[$i], $destination );
        my $filetype = runcmdout( "file " . _shell_quote($destination) );
        if ( $filetype =~ /HTML document/ ) {
            clean_html( $destination, $base_url );
        }
        runcmd( "git -C " . _shell_quote($folder) . " add " . _shell_quote($destination) );
        $commit_message .= "Added file from snapshot $snapshot File last modified on: " . read_date($destination);
        $commit_timestamp = $timestamp[$i];
    }
}

# Commit whatever is still staged after the last entry.
_commit_pending( $folder, $commit_timestamp, $commit_message );

# Wraps a string in single quotes so the shell treats it as one literal word.
sub _shell_quote {
    my $text = shift;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Commits the staged changes in $repo, using the given Internet Archive
# timestamp as both author and committer date and $message as commit message.
sub _commit_pending {
    my ( $repo, $snapshot_timestamp, $message ) = @_;
    my $git_date = git_date($snapshot_timestamp);
    print $git_date . "\n";
    runcmd( "GIT_AUTHOR_DATE="
          . _shell_quote($git_date)
          . " GIT_COMMITTER_DATE="
          . _shell_quote($git_date)
          . " git -C "
          . _shell_quote($repo)
          . " commit -m "
          . _shell_quote($message)
          . " --author=\"Internet Archive <info\@archive.org>\"" );
}
# Subroutines: | |
# wgets a file from archive.org with proper modification timestamps
#
# Parameters:
#   $file_url    - original URL of the archived file
#   $timestamp   - Internet Archive snapshot timestamp
#   $destination - path the download is written to; its parent folder is
#                  created when missing
#
# A failed download is reported with a warning and does not stop the script.
sub getpage {
    my ( $file_url, $timestamp, $destination ) = @_;

    # NOTE(review): the Wayback Machine's raw-content modifier is usually
    # written "id_" after the timestamp; confirm "_id" is what is intended.
    $timestamp .= "_id";

    # Both commands are run without a shell, so characters such as "&", "?"
    # or ";" in the URL or the path reach mkdir/wget unchanged instead of
    # being interpreted as shell syntax.
    require File::Basename;
    system( "mkdir", "-p", "--", File::Basename::dirname($destination) );
    my $snapshot_url = "https://web.archive.org/web/$timestamp/$file_url";
    system( "wget", "-S", "-O", $destination, $snapshot_url ) == 0
        or warn "wget could not download $snapshot_url\n";
    return;
}
# Strips an HTML file of its embedded Internet Archive code and links.
#
# Parameters:
#   $file     - path of the HTML file; it is rewritten in place
#   $base_url - URL prefix that is removed from the contents
#
# The modification date is read before the rewrite and written back after it.
# Dies when the file cannot be read or written.
sub clean_html {
    my ( $file, $base_url ) = @_;
    my $date = read_date($file);

    # Read the whole file into one string.
    open( my $in, "<", $file ) or die "Can't open $file for reading: $!\n";
    my $contents = do { local $/; <$in> };
    close $in;

    # Replace everything from the archive.org script at the start of <head>
    # through the "End Wayback Rewrite JS Include" comment with a bare <head>.
    $contents =~ s/<head><script src=\"\/\/archive.org.*<\!\-\- End Wayback Rewrite JS Include \-\->\n\n/<head>\n/sg;

    # Cut the "FILE ARCHIVED ON" comment and everything after it.
    $contents =~ s/<!--\n FILE ARCHIVED ON.*//sg;

    # Remove the archive prefix (absolute and site-relative form) from links.
    $contents =~ s/https:\/\/web.archive.org\/web\/\d{14}.{0,2}_?\///g;
    $contents =~ s/\/web\/\d{14}.{0,2}_?\///g;

    # Remove the base URL as literal text: \Q...\E keeps characters such as
    # "." and "?" from acting as regex operators. An empty base URL is skipped
    # because an empty pattern would reuse the last successful pattern.
    if ( defined $base_url && length $base_url ) {
        $contents =~ s/\Q$base_url\E//g;
    }

    # Links left with an empty target point to index.html.
    $contents =~ s/<a href=\"\">/<a href=\"index.html\">/g;

    open( my $out, ">", $file ) or die "Can't create $file: $!\n";
    print {$out} "$contents";
    close $out or die "Can't finish writing $file: $!\n";
    return write_date( $file, $date );
}
# Executes a shell command line and echoes anything it wrote to STDOUT,
# followed by a newline. Nothing is printed when the command produced no output.
sub runcmd {
    my ($command) = @_;
    my $output = qx{$command};
    $output eq "" or print "$output\n";
}
# Executes a shell command line, echoes anything it wrote to STDOUT (followed
# by a newline) and hands that output back to the caller.
sub runcmdout {
    my ($command) = @_;
    my $output = qx{$command};
    if ( $output ne "" ) {
        print "$output\n";
    }
    return $output;
}
# Reads the MD5 checksum of a file.
#
# Parameters:
#   $file - path of the file to hash
#
# Returns the checksum as 32 lowercase hexadecimal digits, or undef when the
# file cannot be read. The file is hashed in-process with Digest::MD5, so the
# result does not depend on how a shell would parse the file name.
sub read_checksum {
    my $file = shift;
    require Digest::MD5;
    open( my $fh, "<", $file ) or return;
    binmode $fh;

    # addfile dies on a read error; treat that like an unreadable file.
    my $checksum = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    close $fh;
    return $checksum;
}
# Reads the modification timestamp of a file.
#
# Parameters:
#   $file - path of the file to inspect
#
# Returns whatever "stat -c %y" prints for the file, including the trailing
# newline, or an empty string when stat produced no output. stat is started
# without a shell, so quotes, "$" or backticks in the file name are passed
# through literally.
sub read_date {
    my $file = shift;
    open( my $stat, "-|", "stat", "-c", "%y", "--", $file ) or return "";
    my $output = do { local $/; <$stat> };
    close $stat;
    return defined $output ? $output : "";
}
# Writes a modification timestamp to a file.
#
# Parameters:
#   $file - path of the file to update
#   $date - date string handed to "touch -d"
#
# Returns whatever touch wrote to STDOUT (an empty string when it printed
# nothing). touch is started without a shell, so quotes, "$" or backticks in
# the file name or the date are passed through literally.
sub write_date {
    my ( $file, $date ) = @_;
    open( my $touch, "-|", "touch", "-d", $date, "--", $file ) or return "";
    my $output = do { local $/; <$touch> };
    close $touch;
    return defined $output ? $output : "";
}
# Converts an Internet Archive timestamp into a date format git can understand.
#
# Parameters:
#   $date - string containing a 14-digit timestamp (YYYYMMDDhhmmss)
#
# Returns the timestamp as "YYYY-MM-DDThh:mm:ss".
# Dies when $date holds no 14-digit run, instead of building a date from
# whatever an earlier, unrelated match left behind.
#
# NOTE(review): the returned string carries no time zone designator; confirm
# that git reading it in the local zone is what is wanted for these snapshots.
sub git_date {
    my $date = shift;
    my @parts = defined $date
        ? ( $date =~ m/(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/ )
        : ();
    if ( @parts != 6 ) {
        my $shown = defined $date ? $date : "(undefined)";
        die "Not a 14-digit Internet Archive timestamp: $shown\n";
    }
    return sprintf( "%s-%s-%sT%s:%s:%s", @parts );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment