Skip to content

Instantly share code, notes, and snippets.

@HenkPoley
Forked from dperini/regex-weburl.js
Last active February 7, 2024 12:06
Show Gist options
  • Star 20 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save HenkPoley/8899766 to your computer and use it in GitHub Desktop.
Save HenkPoley/8899766 to your computer and use it in GitHub Desktop.
//
// Regular Expression for URL validation
//
// Author: Diego Perini
// Updated: 2010/12/05
// License: MIT
//
// Copyright (c) 2010-2013 Diego Perini (http://www.iport.it)
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// the regular expression composed & commented
// could be easily tweaked for RFC compliance,
// it was expressly modified to fit & satisfy
// these tests for a URL shortener:
//
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the utf-8 char class takes the full Unicode range into consideration
// - TLDs have been made mandatory, so single names fail (the one exception
//   is "localhost", which is listed under Changes below)
// - protocols have been restricted to ftp, http and https only as requested
//
// Changes:
//
// - IP address dotted notation validation, range: 1.0.0.0 - 223.255.255.255
// first and last IP address of each class is considered invalid
// (since they are broadcast/network addresses)
//
// - Added exclusion of private, reserved and/or local networks ranges
//
// - Added punycode support in the host/tld/domain part.
// - Added support for dashes in punycoded parts: xn----stqb.tld / к-п.tld
// - Limit punycode to 63 - 'xn--' = 59 bytes, due to max label size in DNS
// the other label sizes are harder to limit properly
// - Some size limits added to the other labels in the DNS domain
// - Added IPv6 support
// - Added a single commonly used reserved domain: localhost
//
// Compressed one-line versions:
//
// Javascript version
//
//
//
// PHP version
//
//
//
//
// re_weburl: anchored, case-insensitive pattern. A string matches only when
// the WHOLE string is an ftp/http/https URL made of:
//   scheme "://" [userinfo "@"] host [":" port] ["/" path-and-rest]
// where host is a public dotted IPv4 address, a bracketed IPv6 address,
// the literal "localhost", or a dotted host name ending in a TLD.
//
// Differences from the previous revision of this pattern:
// - userinfo can no longer contain "@", "/", "?" or "#", so inputs such as
//   "http://a@@a.ch/" and "http://a@a@a.ch/" are rejected
// - the dots between the IPv4 octets embedded in an IPv6 address are now
//   literal dots (they were unescaped and matched any character)
// - host/domain labels are written as "runs joined by single hyphens", which
//   accepts the same shapes but cannot backtrack exponentially on a long
//   label that is not followed by a TLD
//
var re_weburl = new RegExp(
"^" +
// protocol identifier
"(?:(?:https?|ftp)://)" +
// user:pass authentication
// one or more characters up to the "@"; the class excludes whitespace and
// the delimiters "/", "?", "#", "@" so the "@" that ends the userinfo is
// unambiguous (a ":" separating user and password is simply part of the run)
"(?:[^\\s/?#@]+@)?" +
"(?:" +
// IP address exclusion
// private & local networks
// (these lookaheads guard only the dotted IPv4 alternative that follows)
"(?!10(?:\\.\\d{1,3}){3})" +
"(?!127(?:\\.\\d{1,3}){3})" +
"(?!169\\.254(?:\\.\\d{1,3}){2})" +
"(?!192\\.168(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// first octet is limited to 1-223, which excludes 0.x.x.x
// and the reserved space >= 224.0.0.0
// last octet is limited to 1-254, which excludes network & broadcast
// addresses (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// IPv6 RegEx - http://stackoverflow.com/a/17871737/273668
// (the capturing groups are kept as-is so match() results keep their shape)
"\\[(" +
"([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|" + // 1:2:3:4:5:6:7:8
"([0-9a-fA-F]{1,4}:){1,7}:|" + // 1:: 1:2:3:4:5:6:7::
"([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|" + // 1::8 1:2:3:4:5:6::8 1:2:3:4:5:6::8
"([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|" + // 1::7:8 1:2:3:4:5::7:8 1:2:3:4:5::8
"([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|" + // 1::6:7:8 1:2:3:4::6:7:8 1:2:3:4::8
"([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|" + // 1::5:6:7:8 1:2:3::5:6:7:8 1:2:3::8
"([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|" + // 1::4:5:6:7:8 1:2::4:5:6:7:8 1:2::8
"[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|" + // 1::3:4:5:6:7:8 1::3:4:5:6:7:8 1::8
":((:[0-9a-fA-F]{1,4}){1,7}|:)|" + // ::2:3:4:5:6:7:8 ::2:3:4:5:6:7:8 ::8 ::
"fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|" + // fe80::7:8%eth0 fe80::7:8%1 (link-local IPv6 addresses with zone index)
"::(ffff(:0{1,4}){0,1}:){0,1}" +
// "\\." (a literal dot) between octets; a bare "." would match any character
"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}" +
"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|" + // ::255.255.255.255 ::ffff:255.255.255.255 ::ffff:0:255.255.255.255 (IPv4-mapped IPv6 addresses and IPv4-translated addresses)
"([0-9a-fA-F]{1,4}:){1,4}:" +
"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}" +
"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])" + // 2001:db8:3:4::192.0.2.33 64:ff9b::192.0.2.33 (IPv4-Embedded IPv6 Address)
")\\]" +
"|" +
"localhost" +
"|" +
// host name
// either a punycode label ("xn--" plus 1-59 characters) or one or more
// alphanumeric/Unicode runs joined by single hyphens (at most 62 hyphens);
// a label can therefore neither start nor end with "-" nor contain "--"
"(?:xn--[a-z0-9\\-]{1,59}|(?:[a-z\\u00a1-\\uffff0-9]+(?:-[a-z\\u00a1-\\uffff0-9]+){0,62}))" +
// domain name
"(?:\\.(?:xn--[a-z0-9\\-]{1,59}|[a-z\\u00a1-\\uffff0-9]+(?:-[a-z\\u00a1-\\uffff0-9]+){0,62}))*" +
// TLD identifier
"(?:\\.(?:xn--[a-z0-9\\-]{1,59}|(?:[a-z\\u00a1-\\uffff]{2,63})))" +
")" +
// port number
"(?::\\d{2,5})?" +
// resource path
"(?:/[^\\s]*)?" +
"$", "i"
);
<!DOCTYPE html>
<!--
  Browser harness for the URL regex tests.
  The doctype keeps the page in standards mode instead of quirks mode.
  regex-weburl.js is loaded in the head, before tests.js, which is loaded
  inside the body.
-->
<html lang="en">
<head>
<meta charset="utf-8" />
<title>URL Regex Tests</title>
<script src="regex-weburl.js"></script>
</head>
<body>
<script src="tests.js"></script>
</body>
</html>
//Unit tests for web url regular expression
//
// It contains all of the test URLs from http://mathiasbynens.be/demo/url-regex
//
// Test fixtures, grouped by the outcome expected from re_weburl:
//   validUrls   - strings the pattern is expected to match
//   invalidUrls - strings the pattern is expected to reject
var urls = {validUrls: [
"ftp://foo.bar/baz",
"http://1337.net",
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
"http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
"http://142.42.1.1",
"http://142.42.1.1/",
"http://142.42.1.1/foo/bar/baz",
"http://142.42.1.1:8080",
"http://142.42.1.1:8080/",
"http://142.42.1.1:8080/foo/bar/",
"http://223.255.255.254",
// NOTE(review): the next two hosts are bare decimal numbers with no dot.
// re_weburl's host-name branch requires a TLD, so these look unable to match;
// "http://3628126748" is also listed under invalidUrls below, so it cannot
// report success in both lists - confirm which outcome is intended.
"http://2915201185/search?q=hello", // Google
"http://3628126748", // Some server from Coca Cola brazil
"http://3628126748.com",
"http://a.b-c.de",
"http://a.b.c.d.e.f.g.h.i.j.k.l.m.n.o.p.q.r.s.t.u.v.w.x.y.z.com",
"http://code.google.com/events/#&product=browser",
"http://j.mp",
"http://example.com/index.html",
"http://foo-bar.com/baz/quo/",
"http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens",
"http://foo.com/blah_blah",
"http://foo.com/blah_blah/",
"http://foo.com/blah_blah_(wikipedia)",
"http://foo.com/blah_blah_(wikipedia)_(again)",
"http://foo.com/(something)?after=parens",
"http://☺.damowmow.com/",
"http://google.com:80/",
"http://google3.com",
"http://mw1.google.com/mw-earth-vectordb/kml-samples/gp/seattle/gigapxl/$[level]/r$[y]_c$[x].jpg",
"http://nic.xn--unup4y",
"http://nic.游戏",
"http://user:pass@example.com:123/one/two.three?q1=a1&q2=a2#body",
"http://userid:password@example.com:8080",
"http://userid:password@example.com:8080/",
"http://userid@example.com",
"http://userid@example.com/",
"http://userid@example.com:8080",
"http://userid@example.com:8080/",
"http://userid:password@example.com",
"http://userid:password@example.com/",
"http://usern%40me:password@example.com/",
"http://username:password@example.com/",
"http://www.example.com/wpstyle/?p=364",
// punycode TLD whose part after "xn--" is exactly at the 59-character limit
"http://www.example.xn--really-long-punycode-test-string-test-tests-123-tests-tests/",
"http://www.microsoft.xn--comindex-g03d.html.irongeek.com/", // YUCK!
"http://xn--h32b13vza.xn--3e0b707e/",
// NOTE(review): the host below contains an em dash character rather than "--";
// possibly a typographic mangling of a punycode label - confirm.
"http://xn—y3h.tk/",
"http://[1080:0:0:0:8:800:200C:417A]/index.html",
"http://[1080::8:800:200C:417A]/foo",
"http://[2010:836B:4179::836B:4179]",
"http://[3ffe:2a00:100:7031::1]",
"http://[::192.9.5.5]/ipng",
"http://[::FFFF:129.144.52.38]:80/index.html",
"http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html",
"http://مثال.إختبار",
"http://例子.测试",
"http://उदाहरण.परीक्षा",
"http://✪df.ws/123",
"http://➡.ws/䨹",
"http://⌘.ws",
"http://⌘.ws/",
"https://asd@xn----7sbbtkohtqhvkc8j.xn--p1ai",
"https://foo-bar.com",
"https://foo.com",
"https://localhost/",
"https://www.example.com/foo/?bar=baz&inga=42&quux",
"http://ABC.com/%7Esmith/home.html",
],
// Strings the pattern is expected to reject.
invalidUrls: [
"//",
"///",
"///a",
"//a",
":// should fail",
"foo.com",
"ftps://foo.bar/",
"h://test",
"htt://google.com",
"http:// shouldfail.com",
"http://",
"http://#",
"http://##",
"http://##/",
"http://-a.b.co",
"http://-error-.invalid/",
"http://.",
"http://..",
"http://../",
"http://.www.foo.bar./",
"http://.www.foo.bar/",
"http:///a",
"http://0.0.0.0",
"http://1.1.1.1.1",
"http://10.1.1.0",
"http://10.1.1.1",
"http://10.1.1.254",
"http://10.1.1.255",
"http://123.123.123",
"http://142.42.1.1:8080:30/",
"http://224.1.1.1",
"http://362812.34",
// NOTE(review): this same string also appears in validUrls above.
"http://3628126748", // Technically valid though
"http://900.900.900.900/",
"http://?",
"http://??",
"http://??/",
"http://?example.?om/",
"http://@",
"http://@a.ch/",
"http://a.b--c.de/",
"http://a.b-.co",
"http://a@@a.ch/",
"http://a@a@a.ch/",
"http://abc..com/",
"http://foo.bar/foo(bar)baz quux",
"http://foo.bar?q=Spaces should be encoded",
"http://foo_bar.com",
"http://foo_bar.google.com",
"http://go/ogle.com",
"http://google.com/ /",
"http://google\\.com",
"http://www(google.com",
// punycode TLD whose part after "xn--" is one character over the 59 limit
"http://www.example.xn--overly-long-punycode-test-string-test-tests-123-test-test123/",
"http://www.foo.bar./",
"http://www=google.com",
"https://-foo.com",
"https://foo-.com",
"https://foo_bar",
// contains a real newline character (the "\n" escape) inside the path
"https://www.g.com/error\n/bleh/bleh",
"rdar://1234",
]
}
// Runs every string of testUrls against re_weburl and appends one <div> per
// string to resultParent.
//
//   matchExpected - truthy when the strings are supposed to match
//   testUrls      - list of strings to check, read by index
//   resultParent  - node that receives the result rows via appendChild
//
// A row reads "<url> Success" when the actual outcome (matched / not matched)
// agrees with matchExpected, and "<url> FAILED!" otherwise.
var evaluateUrls = function(matchExpected, testUrls, resultParent) {
  var index = 0;
  while (index < testUrls.length) {
    var candidate = testUrls[index];
    // match() yields null when the pattern does not apply
    var didMatch = candidate.match(re_weburl) != null;
    var row = document.createElement("div");
    // the check passes when "did it match" equals "was it meant to match"
    var verdict = (didMatch === !!matchExpected) ? " Success" : " FAILED!";
    row.innerText = candidate + verdict;
    resultParent.appendChild(row);
    index += 1;
  }
};
// Creates an <h1> element whose text is testLabel and returns it; the caller
// is responsible for attaching it to the page.
var testHeader = function(testLabel) {
  var heading = document.createElement("h1");
  heading.innerText = testLabel;
  return heading;
};
// Page assembly: runs once when the script is loaded.
// Section 1 - a container attached to the page, a heading, then one result
// row per entry of urls.validUrls (each is expected to match).
var validDiv = document.createElement("div");
document.body.appendChild(validDiv)
validDiv.appendChild(testHeader("test Valid URLs"));
evaluateUrls(true, urls.validUrls, validDiv)
// Section 2 - same layout for urls.invalidUrls (each is expected NOT to match).
var invalidDiv = document.createElement("div");
document.body.appendChild(invalidDiv)
invalidDiv.appendChild(testHeader("test invalid URLs"));
evaluateUrls(false, urls.invalidUrls, invalidDiv)
@HenkPoley
Copy link
Author

Even though you can't obviously see it from the punycode, the second url is also invalid because it "ends" in a hyphen / dash. The code thinks the second one is valid.

http://к-.tld/ <--> http://xn----stb.tld/

@CMCDragonkai
Copy link

Wow nice. What do you think of creating a github repo of this with multiple languages implementations, that way we can get pull-requests?

@tunnckoCore
Copy link

@HenkPoley
Nice but something goes wrong here - Ubuntu / Chrome latest version / 18 November 2014.

VALID URL TEST says: these are NOT valid

"http://2915201185/search?q=hello", // Google
"http://3628126748", // Some server from Coca Cola brazil

INVALID URL TEST says: these are valid

"http://a@@a.ch/",
"http://a@a@a.ch/",

pretty demo - only styles changed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment