Skip to content

Instantly share code, notes, and snippets.

Created July 10, 2018 16:35
Show Gist options
  • Save ollybritton/64efc0671f9f79cad9a62401db6fbd54 to your computer and use it in GitHub Desktop.
Save ollybritton/64efc0671f9f79cad9a62401db6fbd54 to your computer and use it in GitHub Desktop.
'use strict';
/**
 * @todo Make it so that the `rectifyDocumentURLs` function will not select URLs which do not need to be fixed.
 * @file This file deals with fetching URLs and parsing them into a single, resource-less HTML string.
 * @summary Parses URLs to HTML.
 * @author Olly Britton
 */
/* Import various libraries that will be used for manipulating the document. */
const request = require('request')
const DOMParser = require('xmldom').DOMParser;
const xmlserializer = require('xmlserializer');
const debug = require('debug')('app:parser')
/**
 * @description Regex patterns which match the different shapes of URL this module understands.
 * Every classifying pattern is anchored with `^`/`$` and carries the `gm` flags, so with
 * `String.prototype.match` it yields an array of whole-line matches (or `null`).
 * NOTE: because of the `g` flag these regexes are stateful when used with `.test()`/`.exec()`
 * (`lastIndex` advances between calls); `String.prototype.match` resets it and is safe.
 * @example
 * // Returns ["https://example.com"]
 * "https://example.com".match(url_patterns.full_url)
 */
const url_patterns = {
  /* Attempts to find the HTTP/HTTPS prefix. */
  http_section: /(https?):\/\//,

  /* https://example.com, http://www.example.com/page */
  full_url: /^(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}(?:\/(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)?)$/gm,

  /* //example.com, //www.example.com/page */
  double_slash_url: /^(\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}(?:\/(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)?)$/gm,

  /* example.com, www.example.com/page (the lookahead rejects endings that start like a resource extension) */
  httpless_url: /^((?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.(?!js|css|html|xml|xhtml)[a-z]{2,6}(?:\/(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)?)$/gm,

  /* /script.js, /css.css */
  slash_resource_name: /^\/([-a-zA-Z0-9@:%._\+~#=]{2,256}\.(?:js|css|html|xml|xhtml))$/gm,

  /* script.js, css.css */
  resource_name: /^([-a-zA-Z0-9@:%._\+~#=]{2,256}\.(?:js|css|html|xml|xhtml))$/gm
};

/*
 * Any type of URL. Built from the five classifying patterns above instead of repeating them by
 * hand, so the alternatives can never drift apart. Each alternative is wrapped in a non-capturing
 * group, which keeps the capture groups numbered 1-5 in the order listed here. The result has no
 * flags, exactly like the hand-written literal it replaces.
 */
url_patterns.any = new RegExp(
  ["full_url", "double_slash_url", "httpless_url", "slash_resource_name", "resource_name"]
    .map((name) => `(?:${url_patterns[name].source})`)
    .join("|")
);
/**
 * @description Regex patterns that extract the URLs out of `script` tags and `link` tags.
 * Both patterns are global and expose the URL as capture group 1. Only double-quoted attribute
 * values ending in `.js` / `.css` are recognised.
 * Quantifiers are bounded by `[^>]` / `[^"]` so that one match can never run across several tags
 * that happen to share a line.
 * @example
 * // Returns ["a.js", "b.js"]
 * Array.from('<script src="a.js"></script><script src="b.js"></script>'.matchAll(tag_patterns.script), m => m[1])
 */
const tag_patterns = {
  /* Matches <script ... src="x.js" ...></script> (or self-closed) with the capture group being the URL. */
  script: /<script\b[^>]*?\ssrc="([^"]+\.js)"[^>]*?(?:\/>|>[\s\S]*?<\/script>)/g,

  /*
   * Matches <link ... href="x.css" ...> with the capture group being the HREF. `link` is a void
   * element, so a plain `>` terminator is accepted; `/>` and a trailing `</link>` are consumed
   * too when present.
   */
  link: /<link\b[^>]*?\shref="([^"]+\.css)"[^>]*>(?:\s*<\/link>)?/g
};
* @description Detects the type of a URL and returns a string containing that information.
* @param {string} url The URL which needs to be detected.
* @param {function} callback
* @example
* // Logs "full_url"
* detectURL("", (err, res) => {console.log(res)} )
exports.detectURL = function(url, callback) {
if (url.match(url_patterns.full_url)) {
/* Url is fine, no manipulation is required. */
callback(null, "full_url")
} else if (url.match(url_patterns.double_slash_url)) {
/* The URL lacks a HTTP/HTTPS prefix, yet still has two slashes. For example: "//". */
callback(null, "double_slash_url")
} else if (url.match(url_patterns.httpless_url)) {
/* The URL lacks a HTTP/HTTPS prefix and doesn't have two slashes. For example: "". However this is ambigous with a resource name, so we need to do some futher testing: */
callback(null, "httpless_url")
} else if (url.match(url_patterns.slash_resource_name)) {
/* The URL is just the name of a resource with a slash, and nothing else. For example: "/script.js" */
callback(null, "slash_resource_name")
} else if (url.match(url_patterns.resource_name)) {
/* The URL is a plain resource name, and has no "HTTP://" or root url. */
callback(null, "resource_name")
} else {
/* EEK! We have no idea. */
callback(Error(`Strange URL: ${url}`), null)
* @description This function detects whether the base url is using HTTP or HTTPS and returns the result accordingly.
* @param {string} url
exports.detectProtocol = function(url) {
if (url.includes("https")) {
return "https"
} else {
return "http"
* @description Takes a given resource URL and a requested base URL and will return a resource URL that is a full URL.
* @example
* // Returns ""
* fixUrl("script.js", "")
* @param {string} resource_url The URL of the resource, ie "script.js".
* @param {string} base_url The base url, ie "".
* @param {function} callback The callback.
* @todo Add in IP support.
exports.fixUrl = function(resource_url, base_url, callback) {
if (base_url[base_url.length - 1] != "/") {
base_url += "/"
exports.detectURL(resource_url, function(err, res) {
if (err) {
callback(err, null)
let url_type = res
let http_prefix = "http"
if (url_type === "full_url") {
// Resource URL is fine, no fixing needed.
callback(null, resource_url)
} else if (url_type === "double_slash_url") {
// Missing the "http:" or "https:".
resource_url = `${http_prefix}:` + resource_url
callback(null, resource_url)
} else if (url_type === "httpless_url") {
// Missing the "http://" or "https://".
resource_url = `${http_prefix}://` + resource_url
callback(null, resource_url)
} else if (url_type == "slash_resource_name") {
// A resource name, but with a slash. "/script.js" or "/css.css".
resource_url = base_url + resource_url.slice(1, resource_url.length)
callback(null, resource_url)
} else if (url_type == "resource_name") {
// A resource name, like "script.js" or "styles.css".
resource_url = base_url + resource_url
callback(null, resource_url)
} else {
// Hmmm... The resource is of a URL we do not recognise.
Error(`Strange Resource URL: '${resource_url}'`), null
* @description This function is passed a HTML document represnted as a string, and will return a version of that document where each URL is represented in full. For example, `` becomes ``.
* @param {string} baseUrl The base url of the page, for example: "".
* @param {string} document The document, as a string.
* @param {function} callback The callback function.
exports.rectifyDocumentURLs = function(baseUrl, document, callback) {
let new_document = document.replace(url_patterns.any, match => {
return exports.fixUrl(match, baseUrl, (err, res) => {
return res
* @description This function takes a URL, like "" and returns a version of the HTML which has all the external resources and scripts which are specified in the file. For example, a `<script src="..."></script>` would become `<script>...</script>`.
* @param {string} url The url which has been requested.
* @param {function} callback The callback function.
exports.parser = function(url, callback) {
request(url, (err, res) => {
if (err) {
callback(err, null)
let document = res
exports.rectifyDocumentURLs(url, document, (err, res) => {
if (err) {
callback(err, null)
let fixedDocument = res
exports.rectifyDocumentURLs("", "", (rectifyErr, rectifyRes) => {
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment