Skip to content

Instantly share code, notes, and snippets.

@adactio
Last active January 2, 2024 23:24
Show Gist options
  • Save adactio/3d6983bea9b30c993a65b12537ce930c to your computer and use it in GitHub Desktop.
Save adactio/3d6983bea9b30c993a65b12537ce930c to your computer and use it in GitHub Desktop.
A proxy that tries to redirect bad links to the Internet Archive.
<?php
// Redirect proxy for outbound links.
//
// Query parameters:
//   url  (required) The http(s) URL the visitor wants to reach.
//   date (optional) Passed to the archive.org CDX search as its `from` value,
//                   so the first snapshot from that date onwards is used.
//
// Behaviour:
//   1. Reject requests that were not referred from this site (403) or that
//      lack a usable URL (400).
//   2. Probe the URL with a HEAD-style request limited to one second.
//   3. If the probe fails, or the URL answers 200/304, redirect to the URL.
//   4. If the URL answers with a redirect, run this script again on the target.
//   5. Otherwise look for an archived copy on web.archive.org and redirect to
//      it, falling back to the original URL when there is none.

// Check that the request is coming from my site.
// The host of the Referer is compared (not the whole header), so a foreign
// page cannot pass the check just by putting the server name in its path or
// query string. Subdomains of the server name are accepted as well.
$server_name = isset($_SERVER['SERVER_NAME']) ? strtolower($_SERVER['SERVER_NAME']) : '';
$referer_host = isset($_SERVER['HTTP_REFERER'])
    ? parse_url($_SERVER['HTTP_REFERER'], PHP_URL_HOST)
    : null;
$referer_host = is_string($referer_host) ? strtolower($referer_host) : '';
$subdomain_suffix = '.'.$server_name;
$from_my_site = $server_name !== '' && $referer_host !== '' && (
    $referer_host === $server_name
    || substr($referer_host, -strlen($subdomain_suffix)) === $subdomain_suffix
);
if (!$from_my_site) {
    http_response_code(403);
    exit;
}

// There has to be a URL provided in the query string.
// `?url[]=...` would arrive as an array, so insist on a string.
if (!isset($_GET['url']) || !is_string($_GET['url'])) {
    http_response_code(400);
    exit;
}
$url = $_GET['url'];

// Only http and https URLs are accepted: the URL is handed to curl and echoed
// into a Location header, so other schemes (file:, gopher:, javascript:, ...)
// must never get through.
$scheme = parse_url($url, PHP_URL_SCHEME);
if (!is_string($scheme) || !in_array(strtolower($scheme), array('http', 'https'), true)) {
    http_response_code(400);
    exit;
}

// The optional date is only ever used as a string.
$date = (isset($_GET['date']) && is_string($_GET['date'])) ? $_GET['date'] : null;

// Make a very quick curl request to get the response headers from the URL.
// The time limit is set to 1 second.
$options = array(
    CURLOPT_URL => $url,
    CURLOPT_USERAGENT => $_SERVER['SERVER_NAME'],
    CURLOPT_RETURNTRANSFER => TRUE,
    CURLOPT_HEADER => TRUE,
    CURLOPT_NOBODY => TRUE,
    CURLOPT_TIMEOUT_MS => 1000,
    // Belt and braces: curl itself refuses anything but http(s).
    CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS
);
$curl = curl_init();
curl_setopt_array($curl, $options);
$response = curl_exec($curl);
// The error number is only meaningful after the transfer has been attempted.
$curl_errno = curl_errno($curl);
// Let curl report the status code and the redirect target: it parses the
// status line properly, treats header names case-insensitively and resolves
// a relative Location against the requested URL.
$status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
$location = curl_getinfo($curl, CURLINFO_REDIRECT_URL);
curl_close($curl);

// If there was any error (like a time out), give up and go to the URL.
if ($curl_errno !== 0 || empty($response) || $status === 0) {
    header('Location: '.$url);
    exit;
}

// If the response is OK, go to the URL.
if ($status === 200 || $status === 304) {
    header('Location: '.$url);
    exit;
}

// If the response is a redirect, go around again but this time use the redirect URL.
if ($status >= 300 && $status < 400 && is_string($location) && $location !== '') {
    $redirect = '/redirect?url='.urlencode($location);
    if ($date !== null) {
        $redirect .= '&date='.urlencode($date);
    }
    header('Location: '.$redirect);
    exit;
}

// Construct the archive.org search endpoint.
$wayback = 'https://web.archive.org/cdx/search/cdx?output=json&filter=statuscode:200&url='.urlencode($url);
// If we have a date, provide it. Otherwise ask for the latest snapshot.
if ($date !== null) {
    $wayback .= '&from='.urlencode($date).'&limit=1';
} else {
    $wayback .= '&limit=-1';
}

// Ping that archive.org URL.
// This time there's no time limit; this might take a while.
$options = array(
    CURLOPT_URL => $wayback,
    CURLOPT_USERAGENT => $_SERVER['SERVER_NAME'],
    CURLOPT_RETURNTRANSFER => TRUE,
    CURLOPT_HEADER => FALSE
);
$curl = curl_init();
curl_setopt_array($curl, $options);
$result = curl_exec($curl);
curl_close($curl);

// curl_exec() returns false on failure; only decode an actual response body.
$snapshots = is_string($result) ? json_decode($result, true) : null;

// If there's an archived copy, redirect to that.
// Row 0 of the decoded result is skipped; row 1 is the snapshot, from which
// index 1 and index 2 are used to build the archive URL and index 4 must be
// the status code '200'.
if (
    is_array($snapshots)
    && isset($snapshots[1][1], $snapshots[1][2], $snapshots[1][4])
    && (string) $snapshots[1][4] === '200'
) {
    header('Location: https://web.archive.org/web/'.$snapshots[1][1].'/'.$snapshots[1][2]);
    exit;
}

// There's no archived copy. Give up and go the URL anyway.
header('Location: '.$url);
?>
@adactio
Copy link
Author

adactio commented Oct 31, 2023

Here's the corresponding client-side JavaScript code that passes links to this script:

https://gist.github.com/adactio/8be51468ff8c9591f9c98e0612bfae16

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment