Skip to content

Instantly share code, notes, and snippets.

@mikeytown2
Created December 23, 2020 22:14
Show Gist options
  • Save mikeytown2/cd8f571af70efe3ae9a5394024bc81dc to your computer and use it in GitHub Desktop.
Save mikeytown2/cd8f571af70efe3ae9a5394024bc81dc to your computer and use it in GitHub Desktop.
<?php
// Report every class of PHP error and print it straight into the response.
error_reporting(-1);
ini_set('display_errors', 1);
// The result is dumped as plain text rather than HTML.
header('Content-Type: text/plain');
// Raise the execution time limit to 900 seconds.
set_time_limit(900);
// Wayback Machine snapshot prefix that is prepended to the archived URL.
$date = 'https://web.archive.org/web/20201221160810/';
// Archived directory listing at which the crawl begins.
$starting_point = $date . 'https://file.wikileaks.org/file/';
/**
 * Normalises the percent-encoding of the path of an absolute URL.
 *
 * Each path segment is decoded with rawurldecode() and re-encoded with
 * rawurlencode(), so segments that are already encoded are not encoded twice
 * and raw characters such as spaces become valid. Note that rawurlencode()
 * also encodes ":" inside a segment. A port, query string or fragment is
 * appended exactly as it was given.
 *
 * @param string $url
 *   The URL to normalise.
 *
 * @return string
 *   The URL with an encoded path. $url is returned unchanged when it cannot
 *   be parsed into at least a scheme and a host.
 */
function escapefile_url($url) {
  $parts = parse_url($url);
  // Without a scheme and a host there is nothing sensible to rebuild.
  if ($parts === FALSE || !isset($parts['scheme'], $parts['host'])) {
    return $url;
  }

  $escaped = $parts['scheme'] . '://' . $parts['host'];
  if (isset($parts['port'])) {
    $escaped .= ':' . $parts['port'];
  }
  // A bare "scheme://host" URL has no path component at all.
  if (isset($parts['path'])) {
    $segments = array_map('rawurldecode', explode('/', $parts['path']));
    $escaped .= implode('/', array_map('rawurlencode', $segments));
  }
  if (isset($parts['query'])) {
    $escaped .= '?' . $parts['query'];
  }
  if (isset($parts['fragment'])) {
    $escaped .= '#' . $parts['fragment'];
  }
  return $escaped;
}
/**
 * Downloads an archived directory listing and extracts the links it contains.
 *
 * Only the markup that follows the "END WAYBACK TOOLBAR INSERT" marker is
 * scanned, and the first link found there is discarded (presumably the
 * parent-directory link -- TODO confirm against a real listing).
 *
 * @param string $url
 *   URL of the listing. It is passed through escapefile_url() before the
 *   request is made.
 *
 * @return array
 *   The href values of the remaining links, keyed by match position starting
 *   at 1. An empty array when the download fails, the toolbar marker is
 *   missing, or no link is found.
 */
function get_file_list($url) {
  // Pause for 100ms before every request.
  usleep(100000);
  $context = stream_context_create(array(
    'http' => array(
      'follow_location' => TRUE,
    ),
  ));

  $page = file_get_contents(escapefile_url($url), FALSE, $context);
  if ($page === FALSE) {
    // The download failed, so there is nothing to parse.
    return array();
  }

  // Skip everything up to the end of the toolbar markup.
  $body = stristr($page, '<!-- END WAYBACK TOOLBAR INSERT -->');
  if ($body === FALSE) {
    return array();
  }

  // Ignore whatever follows the closing tag, but keep the whole remainder
  // when the tag is absent instead of throwing every link away.
  $end = stripos($body, '</html>');
  if ($end !== FALSE) {
    $body = substr($body, 0, $end);
  }

  if (!preg_match_all('/<a[^>]+href=([\'"])(?<href>.+?)\1[^>]*>/i', $body, $result)) {
    return array();
  }
  unset($result['href'][0]);
  return $result['href'];
}
/**
 * Recursively builds a flat list of every entry below a directory listing.
 *
 * Links ending in "/" are treated as folders: their listing is fetched with
 * get_file_list() and its entries are added right after the folder itself.
 * Links that start with http:// or https:// are recorded but never crawled.
 *
 * @param array $urls
 *   The href values found in the listing of $current_folder.
 * @param string $current_folder
 *   URL of the listing the links came from; relative links are appended to
 *   it directly, so it is expected to end with "/".
 *
 * @return array
 *   Sequentially indexed list of entry URLs.
 */
function get_subfolders($urls, $current_folder) {
  $list = array();
  foreach ($urls as $url) {
    // rawurldecode() rather than urldecode(): a literal "+" in a path is a
    // plus sign, not an encoded space.
    $url = rawurldecode($url);
    $is_absolute = stripos($url, 'https://') === 0 || stripos($url, 'http://') === 0;

    if (!$is_absolute && strpos($url, '/') === 0) {
      // Root-relative link: resolve it against the scheme and host of the
      // current folder so the recorded URL matches the one that is fetched.
      $parsed = parse_url($current_folder);
      $target = "{$parsed['scheme']}://{$parsed['host']}{$url}";
    }
    else {
      $target = $current_folder . $url;
    }
    $list[] = $target;

    // Only folders are descended into, and never absolute links.
    if ($is_absolute || substr($url, -1) !== '/') {
      continue;
    }

    // array_merge() appends the sub-listing. The "+" array union would drop
    // every entry whose numeric key already exists in $list.
    $list = array_merge($list, get_subfolders(get_file_list($target), $target));
  }
  return $list;
}
/**
 * Deletes every occurrence of $prefix from each entry of $list, in place.
 *
 * @param string $prefix
 *   The text to remove.
 * @param array $list
 *   The entries to clean. Modified by reference; keys are left untouched.
 */
function remove_archive_prefix($prefix, &$list) {
  foreach ($list as $key => $entry) {
    $list[$key] = str_replace($prefix, '', $entry);
  }
}
// Read the links of the starting listing, then walk every sub-folder.
$urls = get_file_list($starting_point);
$list = get_subfolders($urls, $starting_point);
// Strip the snapshot prefix held in $date from every collected URL.
remove_archive_prefix($date, $list);
// Dump the final list.
echo print_r($list, TRUE);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment