public
Last active

  • Download Gist
regex-weburl.js
JavaScript
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
//
// Regular Expression for URL validation
//
// Author: Diego Perini
// Updated: 2010/12/05
// License: MIT
//
// Copyright (c) 2010-2013 Diego Perini (http://www.iport.it)
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// the regular expression composed & commented
// could be easily tweaked for RFC compliance,
// it was expressly modified to fit & satisfy
// these test for an URL shortener:
//
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - utf-8 char class take in consideration the full Unicode range
// - TLDs have been made mandatory so single names like "localhost" fails
// - protocols have been restricted to ftp, http and https only as requested
//
// Changes:
//
// - IP address dotted notation validation, range: 1.0.0.0 - 223.255.255.255
// first and last IP address of each class is considered invalid
// (since they are broadcast/network addresses)
//
// - Added exclusion of private, reserved and/or local networks ranges
//
// Compressed one-line versions:
//
// Javascript version
//
// /^(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:\/[^\s]*)?$/i
//
// PHP version
//
// _^(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)*(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s]*)?$_iuS
//
var re_weburl = new RegExp(
"^" +
// protocol identifier
"(?:(?:https?|ftp)://)" +
// user:pass authentication
"(?:\\S+(?::\\S*)?@)?" +
"(?:" +
// IP address exclusion
// private & local networks
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broacast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// host name
"(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)" +
// domain name
"(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*" +
// TLD identifier
"(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
")" +
// port number
"(?::\\d{2,5})?" +
// resource path
"(?:/[^\\s]*)?" +
"$", "i"
);

In PHP (for use with preg_match), this becomes:

'%^(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@|\d{1,3}(?:\.\d{1,3}){3}|(?:(?:[a-z\d\x{00a1}-\x{ffff}]+-?)*[a-z\d\x{00a1}-\x{ffff}]+)(?:\.(?:[a-z\d\x{00a1}-\x{ffff}]+-?)*[a-z\d\x{00a1}-\x{ffff}]+)*(?:\.[a-z\x{00a1}-\x{ffff}]{2,6}))(?::\d+)?(?:[^\s]*)?$%iu'

Thanks for the regex Diego, I’ve added it to the test case and it seems to pass all the tests :) Nice job!

I have added simple network ranges validation, the rules I used are:
- valid range 1.0.0.0 - 223.255.255.255, network adresses above and including 224.0.0.0 are reserved addresses
- first and last IP address of each class is excluded since they are used as network broadcast addresses
since I don't think this is worth implementing completely in a regular expression, a following pass should exclude the Intranet address space:
10.0.0.0 - 10.255.255.255
172.16.0.0 - 172.31.255.255
192.168.0.0 - 192.168.255.255
the loopback and the automatic configuration address space:
127.0.0.0 - 127.255.255.255
169.254.0.0 - 169.254.255.255
while the local, multicast and and the reserved address spaces:
0.0.0.0 - 0.255.255.255 (SPECIAL-IPV4-LOCAL-ID-IANA-RESERVED)
224.0.0.0 - 239.255.255 (MCAST-NET)
240.0.0.0 - 255.255.255.255 (SPECIAL-IPV4-FUTURE-USE-IANA-RESERVED)
should already be excluded by the above regular expression.

This a very minimal list of tests to add to your testings:

PASS
"http://10.1.1.1",
"http://10.1.1.254",
"http://223.255.255.254"

FAIL
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
"http://1.1.1.1.1"

Need testing :)

Need to mention I took the idea of validating the possible IP address ranges in the URL while looking at other developers regular expressions I have seen in your tests, especially the one from @scottgonzales. He also sliced up the Unicode ranges :=), that's the reason his one is so long :)

Awesome stuff Diego!!

Added IP address validation tweaking and optimizations suggested by @abozhilov

Added exclusion of private, reserved, auto-configuration and local network ranges as described in the previous message.
Network 0.0.0.0/8 and all networks >= 224.0.0.0/8 are excluded by the second validation block.
The second validation block also takes care of excluding IP address terminating with 0 or 255 (non usable network and broadcast addresses of each class C network).

It is easy to just remove the unwanted parts of the validation to fit different scopes (length, precision) so I will probably add more options like the list of existing TLD (possibly grouped), the list of existing protocols and/or a fall back for a more generic protocol match too.

Hey, just randomly came across this... my JavaScript URI parsing library does strict URI validation as per RFC 3986. It uses a much larger regular expression then this one. Code can be found at: https://github.com/garycourt/uri-js

I changed it a little bit so that it's valid in Ruby. Here it is:

/\A(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!10(?:.\d{1,3}){3})(?!127(?:.\d{1,3}){3})(?!169.254(?:.\d{1,3}){2})(?!192.168(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:\/[^\s])?\z/i

Hi Diego,

Just came across this awesome code. I'd like to use this as a basis, and I'm hoping you can help me with a simple tweak. I'd like to let through URL's without the protocol specified (HTTP(S) or FTP). For some reason I can't seem to get it to work.

Thanks,
NMMM

Hey Diego, Nice work. You make it a bit shorter though:

(?!10(?:\\.\\d{1,3}){3})
(?!127(?:\\.\\d{1,3}){3})
(?!(10|127)(?:\\.\\d{1,3}){3})

Similarly with the 0.0.255.255 subnets

@dperini Can you assign a license to this? MIT or BSD?

+1 for the license information

+1 for the license information from me, too

+infinity on the license Diego

I have added the MIT License to the gist as requested.

Thank you all for the support.

@dperini: Could you add support for url such this?

//dc8hdnsmzapvm.cloudfront.net/assets/styles/application.css

thanks

Is there a Java version of the regex available? That would be great for my android app!

@mparodi Ruby version untouched by markdown

/\A(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:\/[^\s]*)?\z/i

Ruby port:

class Regexp

  PERFECT_URL_PATTERN = %r{
    \A

    # protocol identifier
    (?:(?:https?|ftp)://)

    # user:pass authentication
    (?:\S+(?::\S*)?@)?

    (?:
      # IP address exclusion
      # private & local networks
      (?!10(?:\.\d{1,3}){3})
      (?!127(?:\.\d{1,3}){3})
      (?!169\.254(?:\.\d{1,3}){2})
      (?!192\.168(?:\.\d{1,3}){2})
      (?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})

      # IP address dotted notation octets
      # excludes loopback network 0.0.0.0
      # excludes reserved space >= 224.0.0.0
      # excludes network & broacast addresses
      # (first & last IP address of each class)
      (?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
      (?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
      (?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
    |
      # host name
      (?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)

      # domain name
      (?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*

      # TLD identifier
      (?:\.(?:[a-z\u00a1-\uffff]{2,}))
    )

    # port number
    (?::\d{2,5})?

    # resource path
    (?:/[^\s]*)?

    \z
  }xi

end

And specs:

# encoding: utf-8


require "spec_helper"


describe "Regexp::PERFECT_URL_PATTERN" do

  [
    "http://✪df.ws/123",
    "http://userid:password@example.com:8080",
    "http://userid:password@example.com:8080/",
    "http://userid@example.com",
    "http://userid@example.com/",
    "http://userid@example.com:8080",
    "http://userid@example.com:8080/",
    "http://userid:password@example.com",
    "http://userid:password@example.com/",
    "http://142.42.1.1/",
    "http://142.42.1.1:8080/",
    "http://➡.ws/䨹",
    "http://⌘.ws",
    "http://⌘.ws/",
    "http://foo.com/blah_(wikipedia)#cite-1",
    "http://foo.com/blah_(wikipedia)_blah#cite-1",
    "http://foo.com/unicode_(✪)_in_parens",
    "http://foo.com/(something)?after=parens",
    "http://☺.damowmow.com/",
    "http://code.google.com/events/#&product=browser",
    "http://j.mp",
    "ftp://foo.bar/baz",
    "http://foo.bar/?q=Test%20URL-encoded%20stuff",
    "http://مثال.إختبار",
    "http://例子.测试"
  ].each do |valid_url|
    it "matches #{valid_url}" do
      expect(Regexp::PERFECT_URL_PATTERN =~ valid_url).to eq 0
    end
  end



  [
    "http://",
    "http://.",
    "http://..",
    "http://../",
    "http://?",
    "http://??",
    "http://??/",
    "http://#",
    "http://##",
    "http://##/",
    "http://foo.bar?q=Spaces should be encoded",
    "//",
    "//a",
    "///a",
    "///",
    "http:///a",
    "foo.com",
    "rdar://1234",
    "h://test",
    "http:// shouldfail.com",
    ":// should fail",
    "http://foo.bar/foo(bar)baz quux",
    "ftps://foo.bar/",
    "http://-error-.invalid/",
    "http://a.b--c.de/",
    "http://-a.b.co",
    "http://a.b-.co",
    "http://0.0.0.0",
    "http://10.1.1.0",
    "http://10.1.1.255",
    "http://224.1.1.1",
    "http://1.1.1.1.1",
    "http://123.123.123",
    "http://3628126748",
    "http://.www.foo.bar/",
    "http://www.foo.bar./",
    "http://.www.foo.bar./",
    "http://10.1.1.1",
    "http://10.1.1.254"
  ].each do |invalid_url|
    it "does not match #{invalid_url}" do
      expect(Regexp::PERFECT_URL_PATTERN =~ invalid_url).to be_nil
    end
  end

end

very good, thank you for share

Updated the gist with reductions/shortenings suggested by "jpillora".

Thank you !

raitucarp,

to do that you can change line 65 from:

"(?:(?:https?|ftp)://)" +

to

"(?:(?:(?:https?|ftp):)?//)" +

this way the protocol and colon becomes an optional macth.

You can also just leave the double slash on that line if no URLs have the protocol prefix:

"//" +

Why can't the maximum range for Unicode strings extend to U0010ffff (instead of uffff)?

What about relative URLs?

../
./
/

@stevenvachon relatives wouldn't be URLs they would be paths, which wouldn't need this validation at that point.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.