Skip to content

Instantly share code, notes, and snippets.

@jpluimers
Last active August 22, 2016 17:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpluimers/52e12399a2822a4f5687950c46772a1a to your computer and use it in GitHub Desktop.
Save jpluimers/52e12399a2822a4f5687950c46772a1a to your computer and use it in GitHub Desktop.
Convert a url to a wayback web.archive.org html fragment - gets youngest or creates entry
<!DOCTYPE html>
<!--
Convert a url to a wayback web.archive.org html fragment - gets youngest or creates entry
Paste any URL, and it will find the youngest web.archive.org URL or create one if it doesn't exist yet.
Use http://esprima.org/demo/validate.html for finding syntax errors
Use http://jsbin.com/wamavacuco/12/edit?html,console,output for testing
-->
<html>
<body>
<h1>Convert a url to a wayback web.archive.org html fragment - gets youngest or creates entry</h1>
<p>Please enter a URL:</p>
<!-- http://stackoverflow.com/questions/13820477/html5-input-tag-validation-for-url
In practice this validation is not triggered by the Submit button, probably because the input is not part of a form.
-->
<input id="sourceUrl" type="url" pattern="^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?" required />
<button type="button" onclick="handleSourceUrlMethod()">Submit</button>
<div id="handledUrlsCaptionDiv">
<p id="handledUrlsCaption">Below you will see the handled URLs transformed into HTML so you can copy/paste them into for instance the WordPress editor</p>
<p>They will be of the form Page Title [WayBack], where "Page Title" is obtained from the original URL and "WayBack" points to the most recent web.archive.org snapshot of the URL. When no snapshot exists yet, one will be created (this takes a while).</p>
</div>
<ul id="handledUrlsUl">
</ul>
<script>
/**
 * Builds a JSON string holding a fixed set of fields copied from an
 * XMLHttpRequest-like object, for use in diagnostic messages.
 *
 * JSON.stringify drops function-valued and undefined properties, so a
 * field such as an assigned onreadystatechange handler does not appear
 * in the output.
 *
 * @param xmlHttpRequest {object} the request whose fields are copied
 * @return {string} JSON text of the copied fields
 */
function XmlHttpRequestToJsonString(xmlHttpRequest) {
  // Order matters: it determines the key order in the resulting JSON text.
  var fieldNames = [
    "onreadystatechange",
    "readyState",
    "responseText",
    "responseXML",
    "status",
    "statusText"
  ];
  var snapshot = {};
  for (var i = 0; i < fieldNames.length; i += 1) {
    snapshot[fieldNames[i]] = xmlHttpRequest[fieldNames[i]];
  }
  return JSON.stringify(snapshot);
}
////////////////////// from https://github.com/internetarchive/FirefoxNoMore404s/blob/master/src/scripts/background.js
// Version string; presumably carried over together with the code borrowed from the
// project linked above. Nothing in the visible part of this file reads it.
var VERSION = "1.5.5";
/**
 * Asks the Wayback Machine availability endpoint whether a snapshot of
 * the given URL exists and reports the outcome through callbacks.
 *
 * @param url {string} the page URL to look up
 * @param onsuccess {function} called as onsuccess(wayback_url, url) when
 *   getWaybackUrlFromResponse yields a snapshot URL
 * @param onfail {function} optional; called without arguments when no
 *   snapshot URL could be extracted (including an unparsable response)
 *   or when the request itself errors
 */
function wmAvailabilityCheck(url, onsuccess, onfail) {
  window.alert("wmAvailabilityCheck: " + url);
  var xhr = new XMLHttpRequest();
  var requestUrl = "https://archive.org/wayback/available";
  // encodeURIComponent rather than encodeURI: the value is one field of a
  // form-urlencoded body, so "&", "=", "+" and "#" inside the looked-up URL
  // must be escaped or its query string would be split into extra fields.
  var requestParams = "url=" + encodeURIComponent(url);
  xhr.open("POST", requestUrl, true);
  xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
  xhr.setRequestHeader("Wayback-Api-Version", 2);
  xhr.onload = function() {
    window.alert("wmAvailabilityCheck: onload: " + xhr.responseText);
    var wayback_url = null;
    try {
      wayback_url = getWaybackUrlFromResponse(JSON.parse(xhr.responseText));
    } catch (e) {
      // A body that is not valid JSON (or has an unexpected shape) is
      // deliberately treated the same as "no snapshot available".
      wayback_url = null;
    }
    if (wayback_url !== null) {
      onsuccess(wayback_url, url);
    } else if (typeof onfail === "function") {
      onfail();
    }
  };
  // because of http://stackoverflow.com/questions/9181090/is-onload-equal-to-readystate-4-in-xmlhttprequest
  xhr.onerror = function () {
    window.alert("wmAvailabilityCheck: onerror: " + XmlHttpRequestToJsonString(xhr));
    // onfail is optional here just as it is in onload.
    if (typeof onfail === "function") {
      onfail();
    }
  };
  xhr.send(requestParams);
}
/**
 * Extracts the snapshot URL from a parsed availability response.
 *
 * A URL is returned only when the first result has a "closest" snapshot
 * whose `available` flag is exactly true, whose status starts with "2",
 * and whose url passes isValidSnapshotUrl. Any missing piece (including
 * a null/undefined response or a missing status) yields null instead of
 * throwing.
 *
 * @param response {object} parsed JSON response
 * @return {string or null}
 */
function getWaybackUrlFromResponse(response) {
  var firstResult = response && response.results && response.results[0];
  var closest = firstResult &&
    firstResult.archived_snapshots &&
    firstResult.archived_snapshots.closest;
  if (!closest || closest.available !== true) {
    return null;
  }
  // Accept the status as a string ("200") or a number (200); anything else
  // (e.g. a missing status) counts as "not a success".
  var status = closest.status;
  var hasSuccessStatus =
    (typeof status === "string" || typeof status === "number") &&
    String(status).charAt(0) === "2";
  if (!hasSuccessStatus) {
    return null;
  }
  return isValidSnapshotUrl(closest.url) ? closest.url : null;
}
/**
 * Guards against code injection by accepting only string values that
 * begin with the literal prefix "http://" or "https://" (case-sensitive).
 * @param url {string}
 * @return {bool}
 */
function isValidSnapshotUrl(url) {
  if (typeof url !== "string") {
    return false;
  }
  return /^https?:\/\//.test(url);
}
////////////////////// own work:
// http://stackoverflow.com/questions/1303872/trying-to-validate-url-using-javascript
// http://stackoverflow.com/questions/28735459/how-to-validate-youtube-url-in-client-side-in-text-box
/**
 * Tells whether the given text is an absolute URI with both a scheme and
 * a non-empty authority (host) part, e.g. "http://example.com/page".
 *
 * The generic URI pattern used for splitting has only optional parts, so
 * merely testing it accepts every string (even ""); the decision is
 * therefore made on the captured scheme and authority groups.
 *
 * @param uri {string} text to check
 * @return {bool} true when uri has a syntactically valid scheme and a
 *   non-empty authority and contains no whitespace
 */
function isValidUri(uri) {
  if (typeof uri !== "string" || uri === "" || /\s/.test(uri)) {
    return false;
  }
  // ensure to escape back-slashes: http://wiert.me/2017/03/01/javascript-sigh-no-real-regexp-support-sigh-google-search-results-sigh/
  var uriRegExPattern = "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$";
  var parts = new RegExp(uriRegExPattern).exec(uri);
  if (!parts) {
    return false;
  }
  var scheme = parts[2];    // text before the first ":"
  var authority = parts[4]; // text between "//" and the next "/", "?" or "#"
  return Boolean(scheme) &&
    /^[A-Za-z][A-Za-z0-9+.\-]*$/.test(scheme) &&
    Boolean(authority);
}
/**
 * Fetches a page with an asynchronous GET and extracts the text of its
 * first <title> element.
 *
 * adapted from http://stackoverflow.com/questions/8610725/get-the-title-of-a-page-using-a-url-from-within-a-chrome-extension
 *
 * @param url {string} address of the page to fetch
 * @param titleCallback {function} optional; called as titleCallback(title)
 *   only when a non-empty title was found. Runs of whitespace in the title
 *   are collapsed to single spaces and the result is trimmed.
 * @param doneCallback {function} optional; called without arguments once
 *   the request has reached readyState 4, whether or not a title was found
 */
function getTitleOfUrl(url, titleCallback, doneCallback) {
  var xhr = new XMLHttpRequest();
  xhr.open("GET", url, true);
  xhr.onreadystatechange = function() {
    // https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/readyState
    if (xhr.readyState !== 4) { // 4 DONE The operation is complete.
      return;
    }
    var responseText = xhr.responseText;
    if (responseText) {
      // Case-insensitive, tolerates attributes on the tag, and [\s\S]
      // lets the title text span several lines.
      var titleMatches = (/<title(?:\s[^>]*)?>([\s\S]*?)<\/title>/i).exec(responseText);
      var title = titleMatches ? titleMatches[1].replace(/\s+/g, " ").trim() : "";
      if (title && typeof titleCallback === "function") {
        titleCallback(title);
      }
    }
    if (typeof doneCallback === "function") {
      doneCallback();
    }
  };
  xhr.send();
}
/**
 * Issues an asynchronous GET to a Wayback Machine "save" URL and reports
 * the outcome through callbacks.
 *
 * @param url {string} the complete URL to request; the caller in this file
 *   passes "https://web.archive.org/save/" followed by the page URL
 * @param onsuccess {function} optional; called without arguments when the
 *   response arrives with a 2xx HTTP status
 * @param onfail {function} optional; called without arguments when the
 *   response has any other status or the request itself errors
 */
function wmSaveUrl(url, onsuccess, onfail) {
  window.alert("wmSaveUrl: " + url);
  var xhr = new XMLHttpRequest();
  xhr.open("GET", url, true);
  xhr.setRequestHeader("Wayback-Api-Version", 2);
  xhr.onload = function() {
    window.alert("wmSaveUrl: onload: " + xhr.responseText);
    // onload also fires for HTTP error responses, so the status decides
    // which callback runs.
    var succeeded = xhr.status >= 200 && xhr.status < 300;
    if (succeeded) {
      if (typeof onsuccess === "function") {
        onsuccess();
      }
    } else if (typeof onfail === "function") {
      onfail();
    }
  };
  // because of http://stackoverflow.com/questions/9181090/is-onload-equal-to-readystate-4-in-xmlhttprequest
  xhr.onerror = function () {
    window.alert("wmSaveUrl: onerror " + XmlHttpRequestToJsonString(xhr));
    if (typeof onfail === "function") {
      onfail();
    }
  };
  xhr.send();
}
// based on http://www.w3schools.com/js/tryit.asp?filename=tryjs_intro_validate
/**
 * Click handler of the Submit button: reads the URL from the "sourceUrl"
 * input and appends one list item to "handledUrlsUl".
 *
 * For an invalid URL the item holds an error message. For a valid URL the
 * item is rendered immediately as "<title link> [<WayBack link>]", using
 * the URL itself as the title text and the web.archive.org calendar page
 * as the WayBack target. It is then refined in place as the asynchronous
 * lookups report back:
 *  - the title text becomes the page title once getTitleOfUrl delivers one;
 *  - the WayBack link points to the snapshot once wmAvailabilityCheck finds
 *    one; if it finds none, a save is requested through wmSaveUrl.
 *
 * All user-supplied text is written with textContent, never innerHTML.
 */
function handleSourceUrlMethod() {
  // Get the value of the input field with id="sourceUrl"
  var sourceUrl = document.getElementById("sourceUrl").value;
  // for now, just add the output; in the future, only add unique output.
  // Based on http://stackoverflow.com/questions/5519747/how-to-add-anchor-tags-dynamically-to-a-div-in-javascript/5519795#5519795
  var handledUrlsUl = document.getElementById("handledUrlsUl");
  var handledUrlLi = document.createElement("li");

  if (!isValidUri(sourceUrl)) {
    var errorText = "Invalid URL: " + sourceUrl;
    var aDiv = document.createElement("div");
    aDiv.textContent = errorText + " (" + sourceUrl + ")";
    handledUrlLi.appendChild(aDiv);
    handledUrlsUl.appendChild(handledUrlLi);
    return;
  }

  // http://c2.com/cgi/wiki?WaybackMachine
  // https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#collapsing
  var wayBackBrowseCalendarUrl = "https://web.archive.org/web/*/" + sourceUrl;
  var wayBackBrowseSaveUrl = "https://web.archive.org/save/" + sourceUrl;

  var titleTag = document.createElement("a");
  // Only http(s) addresses become clickable, so a value such as
  // "javascript:..." can never end up in an href.
  if (/^https?:\/\//i.test(sourceUrl)) {
    titleTag.setAttribute("href", sourceUrl);
  }
  titleTag.textContent = sourceUrl;

  var wayBackTag = document.createElement("a");
  wayBackTag.setAttribute("href", wayBackBrowseCalendarUrl);
  wayBackTag.textContent = "WayBack";

  handledUrlLi.appendChild(titleTag);
  handledUrlLi.appendChild(document.createTextNode(" ["));
  handledUrlLi.appendChild(wayBackTag);
  handledUrlLi.appendChild(document.createTextNode("]"));
  handledUrlsUl.appendChild(handledUrlLi);

  // inspired by https://github.com/internetarchive/FirefoxNoMore404s/blob/master/src/scripts/background.js
  // and https://github.com/internetarchive/FirefoxNoMore404s/blob/master/src/scripts/client.js
  // ensure this uses callbacks: http://stackoverflow.com/questions/16080655/why-is-this-function-not-waiting-until-it-has-data-from-xhr-request
  // Wayback Machine APIs: https://archive.org/help/wayback_api.php and
  // http://ws-dl.blogspot.nl/2013/07/2013-07-15-wayback-machine-upgrades.html
  wmAvailabilityCheck(sourceUrl, function(wayback_url) {
    wayBackTag.setAttribute("href", wayback_url);
  }, function() {
    // the page is not in the wayback machine: save it like https://web.archive.org/save/http://example.com
    // There is no save-page API, but there is a URL that can be used for it:
    // https://blog.archive.org/2013/10/25/fixing-broken-links/
    wmSaveUrl(wayBackBrowseSaveUrl, function() {
      window.alert("wm saved");
    }, function() {
      window.alert("wm save failed");
    });
  });

  getTitleOfUrl(sourceUrl, function(title) {
    titleTag.textContent = title;
  }, function() {
    // Nothing to finish: the list item is already on the page and is
    // updated in place whenever a title arrives.
  });
}
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment