IcyEyeG/wmd2git.pl

## wmd2git.pl
#!/usr/bin/perl

#This is a perl script intended to be used with a JSON file generated by https://github.com/hartator/wayback-machine-downloader and aims to convert an entire website archived with the wayback machine into a git repository with commits that correspond to a modification in a snapshot file.
#Some limitations of wayback-machine-downloader are dealt with, making this script quite slow:
# - wget is used so files are downloaded with proper modification timestamp
# - HTML files are stripped of their embedded Internet Archive code and links
# - duplications are found and discarded using MD5 comparison
#This is just a proof of concept that only works in Linux and it uses quite a few hacks to get it done
#If you want to convert or port this concept into a project, please follow GPLv3 (https://www.gnu.org/licenses/gpl-3.0.html)

use Cwd 'abs_path';

# Main script: reads the snapshot listing, fetches every snapshot in
# chronological order and records each real change as a git commit.

if ( !defined $ARGV[0] || $ARGV[0] eq "--help" || $ARGV[0] eq "" ) {
    die "Usage:\n$0 json_file\n";
}

# Quotes one word for /bin/sh. Paths, URLs and commit messages are built from
# the archive listing and may contain characters such as '&', '?', ';', '$'
# or spaces that the shell would otherwise interpret.
my $shell_quote = sub {
    my $word = defined $_[0] ? $_[0] : "";
    $word =~ s/'/'\\''/g;
    return "'$word'";
};

# Setup working folder based on the JSON file name and create a git repo there

my $json = $ARGV[0];

# Capture in list context so a failed match yields undef instead of a stale $1.
my ($folder) = $json =~ m/(.*)\.json$/;
die "Expected a file name ending in .json, got '$json'\n"
    unless defined $folder && $folder ne "";

# An already existing folder is reused rather than treated as fatal.
mkdir($folder) or -d $folder or die "Can't create folder $folder: $!\n";

$folder = abs_path($folder);

runcmd( "git -C " . $shell_quote->($folder) . " init" );

# Parse the JSON file (one entry per line) and get its contents into arrays

my ( @file_url, @timestamp, @file_id );

open( my $json_fh, "<", $json ) or die "Can't open $json for reading: $!\n";
while ( my $line = <$json_fh> ) {
    chomp $line;

    # The trailing comma is optional: the last element of a valid JSON array
    # has none and would otherwise be silently dropped.
    if ( $line =~ /\{\"file_url\":\"(.*)\",\"timestamp\":(\d*),\"file_id\":\"(.*)"\},?/ ) {
        push @file_url,  $1;
        push @timestamp, $2;
        push @file_id,   $3;
    }
}
close $json_fh;

die "No snapshot entries found in $json\n" unless @file_url;

# Sort the three parallel arrays by timestamp, oldest first

my @index = sort { $timestamp[$a] <=> $timestamp[$b] } 0 .. $#timestamp;

@file_url  = @file_url[@index];
@timestamp = @timestamp[@index];
@file_id   = @file_id[@index];

# Setting up some additional variables and solving a few formatting problems

my $commit_timestamp = $timestamp[0];    # snapshot time used as date of the pending commit
my $commit_message   = "";               # accumulated message of the pending commit
my $base_url         = $file_url[0];     # URL of the oldest entry, removed from HTML links
$base_url =~ s/:80\//\//g;

# Commits whatever is currently staged, dated with the pending snapshot time.
my $commit_pending = sub {
    my $git_date = git_date($commit_timestamp);

    print $git_date . "\n";

    runcmd( "GIT_AUTHOR_DATE=" . $shell_quote->($git_date)
          . " GIT_COMMITTER_DATE=" . $shell_quote->($git_date)
          . " git -C " . $shell_quote->($folder)
          . " commit -m " . $shell_quote->($commit_message)
          . " --author=\"Internet Archive <info\@archive.org>\"" );
};

for my $i ( 0 .. $#file_url ) {

    # wayback-machine-downloader converts some special characters in file_url
    # into whitespaces in file_id so they are converted into underscores
    $file_id[$i] =~ s/ /_/g;

    # Directory urls must be stored into index.html
    if ( $file_url[$i] =~ /\/$/ ) {
        $file_id[$i] .= "/index.html";
    }

    # Setting up where files are actually saved to in the repo: file_id is
    # expected to look like "<14-digit timestamp>/<path>".
    my ($relative_path) = $file_id[$i] =~ m/\d{14}\/(.*)/;
    unless ( defined $relative_path ) {
        warn "Skipping $file_url[$i]: unexpected file_id '$file_id[$i]'\n";
        next;
    }

    my $destination = "$folder/$relative_path";

    # Checks whether a file is new, an update or a duplicate, and either adds
    # it to the repo or skips it

    if ( -e $destination ) {

        # Download next to the existing file so both versions can be compared.
        my $destination_new = $destination . "." . $timestamp[$i];

        getpage( $file_url[$i], $timestamp[$i], $destination_new );

        my $filetype = runcmdout( "file " . $shell_quote->($destination_new) );

        if ( $filetype =~ /HTML document/ ) {
            clean_html( $destination_new, $base_url );
        }

        my $checksum     = read_checksum($destination);
        my $checksum_new = read_checksum($destination_new);

        if ( $checksum eq $checksum_new ) {

            print "MD5 match! Removing most recent file...\n";

            unlink $destination_new
                or warn "Can't remove $destination_new: $!\n";

            $commit_message .= "Skipped snapshot https://web.archive.org/web/$timestamp[$i]/$file_url[$i] due to no changes in file\n";

        }
        else {

            # Flush the changes collected so far before staging the new version.
            $commit_pending->();

            $commit_message = "";

            rename $destination_new, $destination
                or warn "Can't replace $destination with $destination_new: $!\n";

            runcmd( "git -C " . $shell_quote->($folder) . " add " . $shell_quote->($destination) );

            $commit_message .= "Updated file from snapshot https://web.archive.org/web/$timestamp[$i]/$file_url[$i] File last modified on: " . read_date($destination);

            $commit_timestamp = $timestamp[$i];

        }

    }
    else {

        getpage( $file_url[$i], $timestamp[$i], $destination );

        my $filetype = runcmdout( "file " . $shell_quote->($destination) );

        if ( $filetype =~ /HTML document/ ) {
            clean_html( $destination, $base_url );
        }

        runcmd( "git -C " . $shell_quote->($folder) . " add " . $shell_quote->($destination) );

        $commit_message .= "Added file from snapshot https://web.archive.org/web/$timestamp[$i]/$file_url[$i] File last modified on: " . read_date($destination);

        $commit_timestamp = $timestamp[$i];

    }

}

# Commit whatever is still staged after the last snapshot.
$commit_pending->();


# Subroutines:

sub getpage { #wgets a file from archive.org with proper modification timestamps

    # Arguments:
    #   $file_url    - original URL of the archived file
    #   $timestamp   - Wayback Machine snapshot timestamp
    #   $destination - local path the download is written to; missing parent
    #                  directories are created first
    my ($file_url, $timestamp, $destination) = @_;

    # Quotes one word for /bin/sh so that '&', '?', ';', '$' or spaces in
    # archived URLs and paths are passed through literally.
    my $shq = sub {
        my $word = defined $_[0] ? $_[0] : "";
        $word =~ s/'/'\\''/g;
        return "'$word'";
    };

    # NOTE(review): the Wayback Machine modifier for unmodified content is
    # usually written "id_"; confirm that the "_id" suffix is intended.
    $timestamp .= "_id";

    runcmd( "mkdir -p \"\$(dirname " . $shq->($destination) . ")\"" );

    runcmd( "wget -S -O " . $shq->($destination) . " "
          . $shq->("https://web.archive.org/web/$timestamp/$file_url") );

}

sub clean_html { # strips HTML files of their embedded Internet Archive code and links

    # Arguments:
    #   $file     - path of the HTML file, rewritten in place
    #   $base_url - site URL that is removed from the content so links become relative
    # The file's modification time is read before and restored after the rewrite.
    my ($file, $base_url) = @_;
    my $date = read_date($file);

    open( my $in, "<", $file ) or die "Can't open $file for reading: $!\n";
    # Slurp the whole file; $/ is only undefined inside this do-block.
    my $contents = do { local $/; <$in> };
    close $in;

    # Remove the script block injected right after <head>.
    $contents =~ s/<head><script src=\"\/\/archive.org.*<\!\-\- End Wayback Rewrite JS Include \-\->\n\n/<head>\n/sg;
    # Remove the trailing "FILE ARCHIVED ON" comment and everything after it.
    $contents =~ s/<!--\n     FILE ARCHIVED ON.*//sg;
    # Remove absolute and relative Wayback Machine URL prefixes.
    $contents =~ s/https:\/\/web.archive.org\/web\/\d{14}.{0,2}_?\///g;
    $contents =~ s/\/web\/\d{14}.{0,2}_?\///g;
    # Match the base URL literally (\Q...\E) and skip it when empty, because an
    # empty pattern would make Perl reuse the last successful pattern.
    if ( defined $base_url && length $base_url ) {
        $contents =~ s/\Q$base_url\E//g;
    }
    # Links left empty by the removals above point at the site root.
    $contents =~ s/<a href=\"\">/<a href=\"index.html\">/g;

    open( my $out, ">", $file ) or die "Can't create $file: $!\n";
    print {$out} "$contents";
    # Buffered write errors only surface when the handle is closed.
    close $out or die "Can't create $file: $!\n";

    write_date($file, $date);

}

sub runcmd { #runs a custom linux command

    # Runs $cmd through the shell and echoes its captured STDOUT (if any)
    # followed by a newline. Failures are reported on STDERR instead of being
    # ignored, but never abort the script.
    my $cmd = shift;
    my $cmdout = `$cmd`;

    if ( $? == -1 ) {
        warn "Can't run '$cmd': $!\n";
    }
    elsif ( $? != 0 ) {
        warn "Command '$cmd' exited with status " . ( $? >> 8 ) . "\n";
    }

    # A command that could not be started yields undef; treat it as no output.
    $cmdout = "" unless defined $cmdout;
    print "$cmdout\n" unless $cmdout eq "";

}

sub runcmdout { #runs a custom linux command and records STDOUT into a variable

    # Runs $cmd through the shell, echoes its captured STDOUT (if any) followed
    # by a newline and returns the captured text unchanged. Failures are
    # reported on STDERR instead of being ignored, but never abort the script.
    my $cmd = shift;
    my $cmdout = `$cmd`;

    if ( $? == -1 ) {
        warn "Can't run '$cmd': $!\n";
    }
    elsif ( $? != 0 ) {
        warn "Command '$cmd' exited with status " . ( $? >> 8 ) . "\n";
    }

    print "$cmdout\n" unless !defined $cmdout || $cmdout eq "";
    return $cmdout;
}

sub read_checksum { #reads an MD5 checksum

    # Returns the MD5 checksum of $file as 32 lowercase hex digits, or undef
    # (with a warning) when $file is not a readable regular file. The digest is
    # computed in-process, so no file name ever reaches a shell and a failure
    # can no longer return a leftover regex capture.
    my $file = shift;

    require Digest::MD5;

    unless ( defined $file && -f $file ) {
        warn "Can't checksum " . ( defined $file ? $file : "(undef)" ) . ": not a regular file\n";
        return;
    }

    my $fh;
    unless ( open( $fh, "<", $file ) ) {
        warn "Can't open $file for reading: $!\n";
        return;
    }
    binmode $fh;    # hash the raw bytes, exactly like md5sum does
    my $checksum = Digest::MD5->new->addfile($fh)->hexdigest;
    close $fh;

    return $checksum;

}

sub read_date { #reads a modification timestamp of a specified file

    # Returns the raw output of "stat -c '%y'" for $file, including its
    # trailing newline.
    my $file = shift;

    # Single-quote the file name for the shell; inside double quotes '$',
    # backticks and '"' in a file name would still be interpreted.
    ( my $quoted = defined $file ? $file : "" ) =~ s/'/'\\''/g;

    my $cmd = "stat -c '%y' '$quoted'";
    my $cmdout = `$cmd`;

    return $cmdout;

}

sub write_date { #writes a modification timestamp to a specified file

    # Sets the modification time of $file to $date via "touch -d" and returns
    # whatever touch printed on STDOUT.
    my ($file, $date) = @_;

    # Single-quote both words for the shell; inside double quotes '$',
    # backticks and '"' would still be interpreted.
    my $shq = sub {
        my $word = defined $_[0] ? $_[0] : "";
        $word =~ s/'/'\\''/g;
        return "'$word'";
    };

    my $cmd = "touch -d " . $shq->($date) . " " . $shq->($file);
    my $cmdout = `$cmd`;

    return $cmdout;
}

sub git_date { #converts an Internet Archive timestamp into a date format git can understand

    # Turns the first 14 consecutive digits of $date (YYYYMMDDhhmmss) into
    # "YYYY-MM-DDThh:mm:ss". Returns undef (with a warning) when $date holds no
    # such run of digits, instead of assembling a date from leftover captures
    # of an earlier regex match.
    my $date = shift;

    my @parts = defined($date)
        ? $date =~ m/(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/
        : ();

    unless (@parts) {
        warn "Malformed snapshot timestamp: " . ( defined($date) ? "'$date'" : "(undef)" ) . "\n";
        return;
    }

    my $git_date = "$parts[0]-$parts[1]-$parts[2]T$parts[3]:$parts[4]:$parts[5]";

    return $git_date;

}
	#!/usr/bin/perl

	#This is a perl script intended to be used with a JSON file generated by https://github.com/hartator/wayback-machine-downloader and aims to convert an entire website archived with the wayback machine into a git repository with commits that correspond to a modification in a snapshot file.
	#Some limitations of wayback-machine-downloader are dealt with, making this script quite slow:
	# - wget is used so files are downloaded with proper modification timestamp
	# - HTML files are stripped of their embedded Internet Archive code and links
	# - duplications are found and discarded using MD5 comparison
	#This is just a proof of concept that only works in Linux and it uses quite a few hacks to get it done
	#If you want to convert or port this concept into a project, please follow GPLv3 (https://www.gnu.org/licenses/gpl-3.0.html)

	use Cwd 'abs_path';

	# Main script: reads the snapshot listing, fetches every snapshot in
	# chronological order and records each real change as a git commit.

	if ( !defined $ARGV[0] || $ARGV[0] eq "--help" || $ARGV[0] eq "" ) {
	    die "Usage:\n$0 json_file\n";
	}

	# Quotes one word for /bin/sh. Paths, URLs and commit messages are built from
	# the archive listing and may contain characters such as '&', '?', ';', '$'
	# or spaces that the shell would otherwise interpret.
	my $shell_quote = sub {
	    my $word = defined $_[0] ? $_[0] : "";
	    $word =~ s/'/'\\''/g;
	    return "'$word'";
	};

	# Setup working folder based on the JSON file name and create a git repo there

	my $json = $ARGV[0];

	# Capture in list context so a failed match yields undef instead of a stale $1.
	my ($folder) = $json =~ m/(.*)\.json$/;
	die "Expected a file name ending in .json, got '$json'\n"
	    unless defined $folder && $folder ne "";

	# An already existing folder is reused rather than treated as fatal.
	mkdir($folder) or -d $folder or die "Can't create folder $folder: $!\n";

	$folder = abs_path($folder);

	runcmd( "git -C " . $shell_quote->($folder) . " init" );

	# Parse the JSON file (one entry per line) and get its contents into arrays

	my ( @file_url, @timestamp, @file_id );

	open( my $json_fh, "<", $json ) or die "Can't open $json for reading: $!\n";
	while ( my $line = <$json_fh> ) {
	    chomp $line;

	    # The captures need their "*" quantifiers to match whole values. The
	    # trailing comma is optional: the last element of a valid JSON array
	    # has none and would otherwise be silently dropped.
	    if ( $line =~ /\{\"file_url\":\"(.*)\",\"timestamp\":(\d*),\"file_id\":\"(.*)"\},?/ ) {
	        push @file_url,  $1;
	        push @timestamp, $2;
	        push @file_id,   $3;
	    }
	}
	close $json_fh;

	die "No snapshot entries found in $json\n" unless @file_url;

	# Sort the three parallel arrays by timestamp, oldest first

	my @index = sort { $timestamp[$a] <=> $timestamp[$b] } 0 .. $#timestamp;

	@file_url  = @file_url[@index];
	@timestamp = @timestamp[@index];
	@file_id   = @file_id[@index];

	# Setting up some additional variables and solving a few formatting problems

	my $commit_timestamp = $timestamp[0];    # snapshot time used as date of the pending commit
	my $commit_message   = "";               # accumulated message of the pending commit
	my $base_url         = $file_url[0];     # URL of the oldest entry, removed from HTML links
	$base_url =~ s/:80\//\//g;

	# Commits whatever is currently staged, dated with the pending snapshot time.
	my $commit_pending = sub {
	    my $git_date = git_date($commit_timestamp);

	    print $git_date . "\n";

	    runcmd( "GIT_AUTHOR_DATE=" . $shell_quote->($git_date)
	          . " GIT_COMMITTER_DATE=" . $shell_quote->($git_date)
	          . " git -C " . $shell_quote->($folder)
	          . " commit -m " . $shell_quote->($commit_message)
	          . " --author=\"Internet Archive <info\@archive.org>\"" );
	};

	for my $i ( 0 .. $#file_url ) {

	    # wayback-machine-downloader converts some special characters in file_url
	    # into whitespaces in file_id so they are converted into underscores
	    $file_id[$i] =~ s/ /_/g;

	    # Directory urls must be stored into index.html
	    if ( $file_url[$i] =~ /\/$/ ) {
	        $file_id[$i] .= "/index.html";
	    }

	    # Setting up where files are actually saved to in the repo: file_id is
	    # expected to look like "<14-digit timestamp>/<path>".
	    my ($relative_path) = $file_id[$i] =~ m/\d{14}\/(.*)/;
	    unless ( defined $relative_path ) {
	        warn "Skipping $file_url[$i]: unexpected file_id '$file_id[$i]'\n";
	        next;
	    }

	    my $destination = "$folder/$relative_path";

	    # Checks whether a file is new, an update or a duplicate, and either adds
	    # it to the repo or skips it

	    if ( -e $destination ) {

	        # Download next to the existing file so both versions can be compared.
	        my $destination_new = $destination . "." . $timestamp[$i];

	        getpage( $file_url[$i], $timestamp[$i], $destination_new );

	        my $filetype = runcmdout( "file " . $shell_quote->($destination_new) );

	        if ( $filetype =~ /HTML document/ ) {
	            clean_html( $destination_new, $base_url );
	        }

	        my $checksum     = read_checksum($destination);
	        my $checksum_new = read_checksum($destination_new);

	        if ( $checksum eq $checksum_new ) {

	            print "MD5 match! Removing most recent file...\n";

	            unlink $destination_new
	                or warn "Can't remove $destination_new: $!\n";

	            $commit_message .= "Skipped snapshot https://web.archive.org/web/$timestamp[$i]/$file_url[$i] due to no changes in file\n";

	        }
	        else {

	            # Flush the changes collected so far before staging the new version.
	            $commit_pending->();

	            $commit_message = "";

	            rename $destination_new, $destination
	                or warn "Can't replace $destination with $destination_new: $!\n";

	            runcmd( "git -C " . $shell_quote->($folder) . " add " . $shell_quote->($destination) );

	            $commit_message .= "Updated file from snapshot https://web.archive.org/web/$timestamp[$i]/$file_url[$i] File last modified on: " . read_date($destination);

	            $commit_timestamp = $timestamp[$i];

	        }

	    }
	    else {

	        getpage( $file_url[$i], $timestamp[$i], $destination );

	        my $filetype = runcmdout( "file " . $shell_quote->($destination) );

	        if ( $filetype =~ /HTML document/ ) {
	            clean_html( $destination, $base_url );
	        }

	        runcmd( "git -C " . $shell_quote->($folder) . " add " . $shell_quote->($destination) );

	        $commit_message .= "Added file from snapshot https://web.archive.org/web/$timestamp[$i]/$file_url[$i] File last modified on: " . read_date($destination);

	        $commit_timestamp = $timestamp[$i];

	    }

	}

	# Commit whatever is still staged after the last snapshot.
	$commit_pending->();


	# Subroutines:

	sub getpage { #wgets a file from archive.org with proper modification timestamps

	    # Arguments:
	    #   $file_url    - original URL of the archived file
	    #   $timestamp   - Wayback Machine snapshot timestamp
	    #   $destination - local path the download is written to; missing parent
	    #                  directories are created first
	    my ($file_url, $timestamp, $destination) = @_;

	    # Quotes one word for /bin/sh so that '&', '?', ';', '$' or spaces in
	    # archived URLs and paths are passed through literally.
	    my $shq = sub {
	        my $word = defined $_[0] ? $_[0] : "";
	        $word =~ s/'/'\\''/g;
	        return "'$word'";
	    };

	    # NOTE(review): the Wayback Machine modifier for unmodified content is
	    # usually written "id_"; confirm that the "_id" suffix is intended.
	    $timestamp .= "_id";

	    runcmd( "mkdir -p \"\$(dirname " . $shq->($destination) . ")\"" );

	    runcmd( "wget -S -O " . $shq->($destination) . " "
	          . $shq->("https://web.archive.org/web/$timestamp/$file_url") );

	}

	sub clean_html { # strips HTML files of their embedded Internet Archive code and links

	    # Arguments:
	    #   $file     - path of the HTML file, rewritten in place
	    #   $base_url - site URL that is removed from the content so links become relative
	    # The file's modification time is read before and restored after the rewrite.
	    my ($file, $base_url) = @_;
	    my $date = read_date($file);

	    open( my $in, "<", $file ) or die "Can't open $file for reading: $!\n";
	    # Slurp the whole file; $/ is only undefined inside this do-block.
	    my $contents = do { local $/; <$in> };
	    close $in;

	    # Remove the script block injected right after <head>.
	    $contents =~ s/<head><script src=\"\/\/archive.org.*<\!\-\- End Wayback Rewrite JS Include \-\->\n\n/<head>\n/sg;
	    # Remove the trailing "FILE ARCHIVED ON" comment and everything after it.
	    # The five-space indent matches the un-indented copy of this script above.
	    $contents =~ s/<!--\n     FILE ARCHIVED ON.*//sg;
	    # Remove absolute and relative Wayback Machine URL prefixes.
	    $contents =~ s/https:\/\/web.archive.org\/web\/\d{14}.{0,2}_?\///g;
	    $contents =~ s/\/web\/\d{14}.{0,2}_?\///g;
	    # Match the base URL literally (\Q...\E) and skip it when empty, because an
	    # empty pattern would make Perl reuse the last successful pattern.
	    if ( defined $base_url && length $base_url ) {
	        $contents =~ s/\Q$base_url\E//g;
	    }
	    # Links left empty by the removals above point at the site root.
	    $contents =~ s/<a href=\"\">/<a href=\"index.html\">/g;

	    open( my $out, ">", $file ) or die "Can't create $file: $!\n";
	    print {$out} "$contents";
	    # Buffered write errors only surface when the handle is closed.
	    close $out or die "Can't create $file: $!\n";

	    write_date($file, $date);

	}

	sub runcmd { #runs a custom linux command

	    # Runs $cmd through the shell and echoes its captured STDOUT (if any)
	    # followed by a newline. Failures are reported on STDERR instead of being
	    # ignored, but never abort the script.
	    my $cmd = shift;
	    my $cmdout = `$cmd`;

	    if ( $? == -1 ) {
	        warn "Can't run '$cmd': $!\n";
	    }
	    elsif ( $? != 0 ) {
	        warn "Command '$cmd' exited with status " . ( $? >> 8 ) . "\n";
	    }

	    # A command that could not be started yields undef; treat it as no output.
	    $cmdout = "" unless defined $cmdout;
	    print "$cmdout\n" unless $cmdout eq "";

	}

	sub runcmdout { #runs a custom linux command and records STDOUT into a variable

	    # Runs $cmd through the shell, echoes its captured STDOUT (if any) followed
	    # by a newline and returns the captured text unchanged. Failures are
	    # reported on STDERR instead of being ignored, but never abort the script.
	    my $cmd = shift;
	    my $cmdout = `$cmd`;

	    if ( $? == -1 ) {
	        warn "Can't run '$cmd': $!\n";
	    }
	    elsif ( $? != 0 ) {
	        warn "Command '$cmd' exited with status " . ( $? >> 8 ) . "\n";
	    }

	    print "$cmdout\n" unless !defined $cmdout || $cmdout eq "";
	    return $cmdout;
	}

	sub read_checksum { #reads an MD5 checksum

	    # Returns the MD5 checksum of $file as 32 lowercase hex digits, or undef
	    # (with a warning) when $file is not a readable regular file. The digest is
	    # computed in-process, so no file name ever reaches a shell and a failure
	    # can no longer return a leftover regex capture.
	    my $file = shift;

	    require Digest::MD5;

	    unless ( defined $file && -f $file ) {
	        warn "Can't checksum " . ( defined $file ? $file : "(undef)" ) . ": not a regular file\n";
	        return;
	    }

	    my $fh;
	    unless ( open( $fh, "<", $file ) ) {
	        warn "Can't open $file for reading: $!\n";
	        return;
	    }
	    binmode $fh;    # hash the raw bytes, exactly like md5sum does
	    my $checksum = Digest::MD5->new->addfile($fh)->hexdigest;
	    close $fh;

	    return $checksum;

	}

	sub read_date { #reads a modification timestamp of a specified file

	    # Returns the raw output of "stat -c '%y'" for $file, including its
	    # trailing newline.
	    my $file = shift;

	    # Single-quote the file name for the shell; inside double quotes '$',
	    # backticks and '"' in a file name would still be interpreted.
	    ( my $quoted = defined $file ? $file : "" ) =~ s/'/'\\''/g;

	    my $cmd = "stat -c '%y' '$quoted'";
	    my $cmdout = `$cmd`;

	    return $cmdout;

	}

	sub write_date { #writes a modification timestamp to a specified file

	    # Sets the modification time of $file to $date via "touch -d" and returns
	    # whatever touch printed on STDOUT.
	    my ($file, $date) = @_;

	    # Single-quote both words for the shell; inside double quotes '$',
	    # backticks and '"' would still be interpreted.
	    my $shq = sub {
	        my $word = defined $_[0] ? $_[0] : "";
	        $word =~ s/'/'\\''/g;
	        return "'$word'";
	    };

	    my $cmd = "touch -d " . $shq->($date) . " " . $shq->($file);
	    my $cmdout = `$cmd`;

	    return $cmdout;
	}

	sub git_date { #converts an Internet Archive timestamp into a date format git can understand

	    # Turns the first 14 consecutive digits of $date (YYYYMMDDhhmmss) into
	    # "YYYY-MM-DDThh:mm:ss". Returns undef (with a warning) when $date holds no
	    # such run of digits, instead of assembling a date from leftover captures
	    # of an earlier regex match.
	    my $date = shift;

	    my @parts = defined($date)
	        ? $date =~ m/(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/
	        : ();

	    unless (@parts) {
	        warn "Malformed snapshot timestamp: " . ( defined($date) ? "'$date'" : "(undef)" ) . "\n";
	        return;
	    }

	    my $git_date = "$parts[0]-$parts[1]-$parts[2]T$parts[3]:$parts[4]:$parts[5]";

	    return $git_date;

	}