Last active
August 22, 2016 17:34
-
-
Save jpluimers/52e12399a2822a4f5687950c46772a1a to your computer and use it in GitHub Desktop.
Convert a url to a wayback web.archive.org html fragment - gets youngest or creates entry
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<!-- | |
Convert a url to a wayback web.archive.org html fragment - gets youngest or creates entry | |
Paste any URL, and it will find the youngest web.archive.org URL or create one if it doesn't exist yet. | |
Use http://esprima.org/demo/validate.html for finding syntax errors | |
Use http://jsbin.com/wamavacuco/12/edit?html,console,output for testing | |
--> | |
<html> | |
<body> | |
<h1>Convert a url to a wayback web.archive.org html fragment - gets youngest or creates entry</h1> | |
<p>Please enter a URL:</p> | |
<!-- http://stackoverflow.com/questions/13820477/html5-input-tag-validation-for-url | |
Which in practice doesn't work for the submit button probably because the input is not part of a form. | |
--> | |
<input id="sourceUrl" type="url" pattern="^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?" required /> | |
<button type="button" onclick="handleSourceUrlMethod()">Submit</button> | |
<div id="handledUrlsCaptionDiv"> | |
<p id="handledUrlsCaption">Below you will see the handled URLs transformed into HTML so you can copy/paste them into for instance the WordPress editor</p> | |
<p>They will be of the form Page Title [WayBack], where "Page Title" is obtained from the original URL and "WayBack" points to the most recent web.archive.org snapshot of the URL. When no snapshot exists yet, one will be created (which takes a while).</p>
</div> | |
<ul id="handledUrlsUl"> | |
</ul> | |
<script> | |
/**
 * Builds a JSON string describing the current state of an XMLHttpRequest.
 * Used for diagnostic messages when a request fails.
 *
 * Only a fixed set of fields is copied, always in the same order; anything
 * JSON.stringify cannot represent (functions, undefined) is left out of the
 * resulting string.
 *
 * @param xmlHttpRequest {object} the request (or any object exposing the same fields)
 * @return {string} JSON text of the copied fields
 */
function XmlHttpRequestToJsonString(xmlHttpRequest) {
  var fieldNames = [
    "onreadystatechange",
    "readyState",
    "responseText",
    "responseXML",
    "status",
    "statusText"
  ];
  var snapshot = {};
  for (var i = 0; i < fieldNames.length; i++) {
    var fieldName = fieldNames[i];
    snapshot[fieldName] = xmlHttpRequest[fieldName];
  }
  return JSON.stringify(snapshot);
}
////////////////////// from https://github.com/internetarchive/FirefoxNoMore404s/blob/master/src/scripts/background.js | |
// NOTE(review): presumably the version of the upstream file linked above that the following
// functions were copied from - TODO confirm. Nothing in the visible script reads this value.
var VERSION = "1.5.5"; | |
/**
 * Asks the Wayback Machine availability endpoint whether a snapshot of `url` exists.
 *
 * The request is an asynchronous form-encoded POST; exactly one of the callbacks is
 * invoked once the request finishes.
 *
 * @param url {string} the page URL to look up
 * @param onsuccess {function} called as onsuccess(waybackUrl, url) when
 *        getWaybackUrlFromResponse finds a usable snapshot URL in the response
 * @param onfail {function} optional; called without arguments when no usable snapshot
 *        is reported, when the response body is not valid JSON, or when the request errors
 */
function wmAvailabilityCheck(url, onsuccess, onfail) {
  window.alert("wmAvailabilityCheck: " + url);

  // onfail is optional, so every failure path goes through this guard.
  function reportFailure() {
    if (onfail) {
      onfail();
    }
  }

  var xhr = new XMLHttpRequest();
  var requestUrl = "https://archive.org/wayback/available";
  // encodeURIComponent rather than encodeURI: the value is a single form field, and
  // encodeURI leaves "&", "=", "+" and "#" untouched, which would split or truncate
  // any looked-up URL that carries its own query string or fragment.
  var requestParams = "url=" + encodeURIComponent(url);

  xhr.open("POST", requestUrl, true);
  xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
  xhr.setRequestHeader("Wayback-Api-Version", 2);

  xhr.onload = function () {
    window.alert("wmAvailabilityCheck: onload: " + xhr.responseText);
    var response;
    try {
      response = JSON.parse(xhr.responseText);
    } catch (parseError) {
      // An HTML error page or an empty body is a failed lookup, not a crash.
      reportFailure();
      return;
    }
    var wayback_url = getWaybackUrlFromResponse(response);
    if (wayback_url !== null) {
      onsuccess(wayback_url, url);
    } else {
      reportFailure();
    }
  };

  // onload does not fire for network-level failures, see
  // http://stackoverflow.com/questions/9181090/is-onload-equal-to-readystate-4-in-xmlhttprequest
  xhr.onerror = function () {
    window.alert("wmAvailabilityCheck: onerror: " + XmlHttpRequestToJsonString(xhr));
    reportFailure();
  };

  xhr.send(requestParams);
}
/**
 * Pulls the closest snapshot URL out of a parsed availability response.
 *
 * The snapshot is read from response.results[0].archived_snapshots.closest and is
 * only accepted when it is flagged available (strictly true), its status starts
 * with "2", and its url passes isValidSnapshotUrl. Any missing level of the
 * structure, including a null response, yields null instead of throwing.
 *
 * @param response {object} parsed JSON response; may be null or incomplete
 * @return {string or null} the snapshot URL, or null when there is no usable snapshot
 */
function getWaybackUrlFromResponse(response) {
  var firstResult = response && response.results && response.results[0];
  var closest = firstResult &&
    firstResult.archived_snapshots &&
    firstResult.archived_snapshots.closest;

  if (!closest || closest.available !== true) {
    return null;
  }
  // String() keeps a missing or numeric status from throwing on indexOf.
  if (String(closest.status).indexOf("2") !== 0) {
    return null;
  }
  if (!isValidSnapshotUrl(closest.url)) {
    return null;
  }
  return closest.url;
}
/**
 * Guards against code injection by accepting only string values that begin
 * with an http or https scheme prefix (case-sensitive).
 * @param url {string} candidate snapshot URL taken from a response
 * @return {bool} true when url is a string starting with "http://" or "https://"
 */
function isValidSnapshotUrl(url) {
  if (typeof url !== "string") {
    return false;
  }
  var allowedPrefixes = ["http://", "https://"];
  for (var i = 0; i < allowedPrefixes.length; i++) {
    // lastIndexOf(prefix, 0) can only match at position 0, i.e. a prefix test
    if (url.lastIndexOf(allowedPrefixes[i], 0) === 0) {
      return true;
    }
  }
  return false;
}
////////////////////// own work: | |
// http://stackoverflow.com/questions/1303872/trying-to-validate-url-using-javascript | |
// http://stackoverflow.com/questions/28735459/how-to-validate-youtube-url-in-client-side-in-text-box | |
/**
 * Tells whether `uri` is an absolute URI that names a scheme and an authority (host).
 *
 * The generic URI decomposition pattern used here consists only of optional groups,
 * so on its own it matches every string (even ""). It is therefore used to split the
 * input, and the result is valid only when:
 *   - the input is a non-empty string without whitespace,
 *   - the scheme part is present and well formed (letter, then letters/digits/+/./-),
 *   - the authority part after "//" is non-empty.
 *
 * @param uri {string} text entered by the user
 * @return {bool} true for e.g. "http://example.com/page?x=1", false for "", "example.com", "https://"
 */
function isValidUri(uri) {
  if (typeof uri !== "string" || uri === "" || /\s/.test(uri)) {
    return false;
  }
  // ensure to escape back-slashes: http://wiert.me/2017/03/01/javascript-sigh-no-real-regexp-support-sigh-google-search-results-sigh/
  var uriRegExPattern = "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
  var uriParts = new RegExp(uriRegExPattern).exec(uri);
  if (uriParts === null) {
    return false;
  }
  var scheme = uriParts[2] || "";
  var authority = uriParts[4] || "";
  return (/^[A-Za-z][A-Za-z0-9+.\-]*$/).test(scheme) && authority !== "";
}
/**
 * Fetches `url` and reports the text of its <title> element.
 *
 * Adopted from http://stackoverflow.com/questions/8610725/get-the-title-of-a-page-using-a-url-from-within-a-chrome-extension
 *
 * @param url {string} address of the page to fetch (asynchronous GET)
 * @param titleCallback {function} optional; called as titleCallback(title) only when the
 *        response contains a non-empty title. The title is trimmed but otherwise passed
 *        on as it appears in the markup (HTML entities are not decoded).
 * @param doneCallback {function} optional; called without arguments once the request has
 *        completed, whether or not a title was found
 */
function getTitleOfUrl(url, titleCallback, doneCallback) {
  var xhr = new XMLHttpRequest();
  xhr.open("GET", url, true);
  xhr.onreadystatechange = function () {
    // https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/readyState
    if (xhr.readyState !== 4) { // 4 DONE The operation is complete.
      return;
    }
    var responseText = xhr.responseText;
    if (responseText) {
      // Case-insensitive, tolerates attributes on the tag, and [\s\S] lets the title
      // span several lines ("." does not match line breaks).
      var titleMatches = (/<title[^>]*>([\s\S]*?)<\/title>/i).exec(responseText);
      var title = titleMatches ? titleMatches[1].replace(/^\s+|\s+$/g, "") : "";
      if (title && titleCallback) {
        titleCallback(title);
      }
    }
    if (doneCallback) {
      doneCallback();
    }
  };
  xhr.send();
}
/**
 * Requests a Wayback Machine "save" URL so that a snapshot of a page gets created.
 *
 * @param url {string} the complete URL to request; the caller in this file passes
 *        "https://web.archive.org/save/" followed by the page URL
 * @param onsuccess {function} optional; called without arguments when the request
 *        completes with a 2xx status
 * @param onfail {function} optional; called without arguments when the request
 *        completes with any other status or fails at the network level
 */
function wmSaveUrl(url, onsuccess, onfail) {
  window.alert("wmSaveUrl: " + url);

  // Both callbacks are optional, so failures are reported through this guard.
  function reportFailure() {
    if (onfail) {
      onfail();
    }
  }

  var xhr = new XMLHttpRequest();
  xhr.open("GET", url, true);
  xhr.setRequestHeader("Wayback-Api-Version", 2);

  xhr.onload = function () {
    window.alert("wmSaveUrl: onload: " + xhr.responseText);
    // The response body is not JSON, so only the HTTP status decides the outcome.
    if (xhr.status >= 200 && xhr.status < 300) {
      if (onsuccess) {
        onsuccess();
      }
    } else {
      reportFailure();
    }
  };

  // onload does not fire for network-level failures, see
  // http://stackoverflow.com/questions/9181090/is-onload-equal-to-readystate-4-in-xmlhttprequest
  xhr.onerror = function () {
    window.alert("wmSaveUrl: onerror " + XmlHttpRequestToJsonString(xhr));
    reportFailure();
  };

  xhr.send();
}
// based on http://www.w3schools.com/js/tryit.asp?filename=tryjs_intro_validate
/**
 * Click handler of the Submit button.
 *
 * Reads the URL from the input with id "sourceUrl". For a valid URL it starts the
 * Wayback availability lookup (falling back to a save request when no snapshot is
 * found) and the page-title lookup, then appends a list item linking to the Wayback
 * calendar page of that URL. For an invalid URL it appends a list item with an error
 * message instead. The list item is appended immediately; it does not wait for the
 * asynchronous lookups.
 *
 * NOTE(review): wmUrl, sourceTitle and the two "completed" flags are filled in by the
 * callbacks but not read anywhere yet, so the "Page Title [WayBack]" output described
 * on the page is still to be built on top of them.
 */
function handleSourceUrlMethod() {
  var errorText = "";
  var wayBackBrowseCalendarUrl = "";
  var targetUrl = "";

  // Get the value of the input field with id="sourceUrl"
  var sourceUrl = document.getElementById("sourceUrl").value;

  if (isValidUri(sourceUrl)) {
    // sourceUrl is handed over unescaped; encoding it for the request body is the job of wmAvailabilityCheck.
    // Background on "&" in looked-up URLs:
    // http://stackoverflow.com/questions/36806100/how-to-use-the-wayback-machine-api-with-url-query-string/36806159#36806159
    // http://c2.com/cgi/wiki?WaybackMachine
    // https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#collapsing
    wayBackBrowseCalendarUrl = "https://web.archive.org/web/*/" + sourceUrl;
    // Declared locally; without "var" this assignment created a global variable.
    var wayBackBrowseSaveUrl = "https://web.archive.org/save/" + sourceUrl;

    // inspired by https://github.com/internetarchive/FirefoxNoMore404s/blob/master/src/scripts/background.js
    // and https://github.com/internetarchive/FirefoxNoMore404s/blob/master/src/scripts/client.js
    // ensure this uses callbacks: http://stackoverflow.com/questions/16080655/why-is-this-function-not-waiting-until-it-has-data-from-xhr-request
    var wmAvailabilityCheckCompleted = false;
    var wmUrl = "";
    // The callbacks must set the local flag. Assigning to wmAvailabilityCheck itself
    // would replace the global function with `true` and break every later submit.
    wmAvailabilityCheck(sourceUrl, function (wayback_url, url) {
      wmUrl = wayback_url;
      wmAvailabilityCheckCompleted = true;
    }, function () {
      // the page is not in the wayback machine: save it like https://web.archive.org/save/http://example.com
      wmSaveUrl(wayBackBrowseSaveUrl, function () {
        window.alert("wm saved");
        wmAvailabilityCheckCompleted = true;
      }, function () {
        window.alert("wm save failed");
        wmAvailabilityCheckCompleted = true;
      });
    });

    var sourceTitleCompleted = false;
    var sourceTitle = sourceUrl;
    // wait for multiple XMLHttpRequest calls to complete
    // jQuery: http://stackoverflow.com/questions/2768293/waiting-on-multiple-asynchronous-calls-to-complete-before-continuing/2768306#2768306
    // general: http://stackoverflow.com/questions/7595206/javascript-xmlhttprequest-how-to-send-multiple-simultaneous-requests/7595852#7595852
    getTitleOfUrl(sourceUrl, function (title) {
      sourceTitle = title;
    }, function () {
      sourceTitleCompleted = true;
    });

    targetUrl = wayBackBrowseCalendarUrl;
    // 2. wayback helper: enter a URL and get back a URL of the latest WayBack archive (the UI of web.archive.org is so ... bad)
    // For 2. there are APIs: https://archive.org/help/wayback_api.php and http://ws-dl.blogspot.nl/2013/07/2013-07-15-wayback-machine-upgrades.html
    // There is no save-page API, but there is a URL that can be used for it: https://blog.archive.org/2013/10/25/fixing-broken-links/
  } else {
    errorText = "Invalid URL: " + sourceUrl;
  }

  // for now, just add the output; in the future, only add unique output.
  // Based on http://stackoverflow.com/questions/5519747/how-to-add-anchor-tags-dynamically-to-a-div-in-javascript/5519795#5519795
  var handledUrlsUl = document.getElementById("handledUrlsUl");
  var handledUrlLi = document.createElement('li');
  if (errorText === "") {
    var aTag = document.createElement('a');
    aTag.setAttribute('href', targetUrl);
    // textContent, not innerHTML: the text contains user input and must not be parsed as markup
    aTag.textContent = targetUrl;
    handledUrlLi.appendChild(aTag);
  } else {
    var aDiv = document.createElement('div');
    aDiv.textContent = errorText + " (" + sourceUrl + ")";
    handledUrlLi.appendChild(aDiv);
  }
  handledUrlsUl.appendChild(handledUrlLi);
}
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment