//
// Regular Expression for URL validation
//
// Author: Diego Perini
// Updated: 2010/12/05
// License: MIT
//
// Copyright (c) 2010-2013 Diego Perini (http://www.iport.it)
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// the regular expression composed & commented
// could be easily tweaked for RFC compliance;
// it was expressly modified to fit & satisfy
// these tests for a URL shortener:
//
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the utf-8 char class takes into consideration the full Unicode range
// - TLDs have been made mandatory, so single names like "localhost" fail
// - protocols have been restricted to ftp, http and https only, as requested
//
// Changes:
//
// - IP address dotted notation validation, range: 1.0.0.0 - 223.255.255.255
//   the first and last IP addresses of each class are considered invalid
//   (since they are broadcast/network addresses)
//
// - Added exclusion of private, reserved and/or local network ranges
//
// - Made starting path slash optional (http://example.com?foo=bar)
//
// - Allow a dot (.) at the end of hostnames (http://example.com.)
//
// Compressed one-line versions:
//
// Javascript version
//
// /^(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$/i
//
// PHP version
//
// _^(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]-*)*[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]-*)*[a-z\x{00a1}-\x{ffff}0-9]+)*(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$_iuS
//
var re_weburl = new RegExp(
  "^" +
  // protocol identifier
  "(?:(?:https?|ftp)://)" +
  // user:pass authentication
  "(?:\\S+(?::\\S*)?@)?" +
  "(?:" +
  // IP address exclusion
  // private & local networks
  "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
  "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
  "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
  // IP address dotted notation octets
  // excludes loopback network 0.0.0.0
  // excludes reserved space >= 224.0.0.0
  // excludes network & broadcast addresses
  // (first & last IP address of each class)
  "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
  "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
  "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
  "|" +
  // host name
  "(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)" +
  // domain name
  "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*" +
  // TLD identifier
  "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
  // TLD may end with dot
  "\\.?" +
  ")" +
  // port number
  "(?::\\d{2,5})?" +
  // resource path
  "(?:[/?#]\\S*)?" +
  "$", "i"
);
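A minimal usage sketch (plain JavaScript; the example URLs are taken from the test cases discussed in the comments below):

// quick sanity checks against re_weburl
console.log(re_weburl.test("http://foo.com/blah_(wikipedia)#cite-1")); // true
console.log(re_weburl.test("http://例子.测试"));                        // true  (UTF-8 hostnames are accepted)
console.log(re_weburl.test("http://10.1.1.1"));                        // false (private network range excluded)
console.log(re_weburl.test("foo.com"));                                // false (protocol is required)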
mathiasbynens
commented
Dec 5, 2010
Thanks for the regex Diego, I’ve added it to the test case and it seems to pass all the tests :) Nice job!
dperini
Dec 6, 2010
I have added simple network ranges validation, the rules I used are:
- valid range 1.0.0.0 - 223.255.255.255; network addresses above and including 224.0.0.0 are reserved addresses
- the first and last IP address of each class is excluded since they are used as network/broadcast addresses
since I don't think this is worth implementing completely in a regular expression, a following pass should exclude the Intranet address space:
10.0.0.0 - 10.255.255.255
172.16.0.0 - 172.31.255.255
192.168.0.0 - 192.168.255.255
the loopback and the automatic configuration address space:
127.0.0.0 - 127.255.255.255
169.254.0.0 - 169.254.255.255
while the local, multicast and the reserved address spaces:
0.0.0.0 - 0.255.255.255 (SPECIAL-IPV4-LOCAL-ID-IANA-RESERVED)
224.0.0.0 - 239.255.255.255 (MCAST-NET)
240.0.0.0 - 255.255.255.255 (SPECIAL-IPV4-FUTURE-USE-IANA-RESERVED)
should already be excluded by the above regular expression.
This is a very minimal list of tests to add to your test suite:
PASS
"http://10.1.1.1",
"http://10.1.1.254",
"http://223.255.255.254"
FAIL
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
"http://1.1.1.1.1"
Need testing :)
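A quick sketch (plain JavaScript, assuming the re_weburl defined at the top of the gist) for running this list through the regex. Note that the expected PASS results for the 10.x addresses reflect the regex as it stood at the time of this comment; the current version of the gist also excludes the 10.0.0.0/8 private range, so those two now fail as well:

// minimal PASS/FAIL list from the comment above
var shouldPass = ["http://10.1.1.1", "http://10.1.1.254", "http://223.255.255.254"];
var shouldFail = ["http://0.0.0.0", "http://10.1.1.0", "http://10.1.1.255", "http://224.1.1.1", "http://1.1.1.1.1"];
shouldPass.concat(shouldFail).forEach(function (url) {
  console.log(url, re_weburl.test(url));
});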
dperini
Dec 6, 2010
Need to mention I took the idea of validating the possible IP address ranges in the URL while looking at other developers' regular expressions I have seen in your tests, especially the one from @scottgonzales. He also sliced up the Unicode ranges :=), that's the reason his is so long :)
jgornick
commented
Dec 6, 2010
Awesome stuff Diego!!
Added IP address validation tweaking and optimizations suggested by @abozhilov
dperini
Dec 9, 2010
Added exclusion of private, reserved, auto-configuration and local network ranges as described in the previous message.
Network 0.0.0.0/8 and all networks >= 224.0.0.0/8 are excluded by the second validation block.
The second validation block also takes care of excluding IP address terminating with 0 or 255 (non usable network and broadcast addresses of each class C network).
It is easy to just remove the unwanted parts of the validation to fit different scopes (length, precision), so I will probably add more options like the list of existing TLDs (possibly grouped), the list of existing protocols and/or a fallback for a more generic protocol match too.
garycourt
Dec 10, 2010
Hey, just randomly came across this... my JavaScript URI parsing library does strict URI validation as per RFC 3986. It uses a much larger regular expression than this one. Code can be found at: https://github.com/garycourt/uri-js
xttam
Feb 8, 2013
I changed it a little bit so that it's valid in Ruby. Here it is:
/\A(?:(?:https?|ftp)://)(?:\S+(?::\S_)?@)?(?:(?!10(?:.\d{1,3}){3})(?!127(?:.\d{1,3}){3})(?!169.254(?:.\d{1,3}){2})(?!192.168(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)_(?:.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/[^\s]*)?\z/i
newmediamadman
Feb 27, 2013
Hi Diego,
Just came across this awesome code. I'd like to use this as a basis, and I'm hoping you can help me with a simple tweak. I'd like to let through URLs without the protocol specified (HTTP(S) or FTP). For some reason I can't seem to get it to work.
Thanks,
NMMM
jpillora
Jun 2, 2013
Hey Diego, nice work. You can make it a bit shorter though:
(?!10(?:\\.\\d{1,3}){3})
(?!127(?:\\.\\d{1,3}){3})
(?!(10|127)(?:\\.\\d{1,3}){3})
Similarly with the 0.0.255.255 subnets
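For reference, the consolidated form (using a non-capturing group, as later adopted in the gist above) behaves the same; a minimal sketch comparing the separate and combined lookaheads on bare dotted addresses:

// separate exclusions vs. the combined negative lookahead
var separate = /^(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})\d{1,3}(?:\.\d{1,3}){3}$/;
var combined = /^(?!(?:10|127)(?:\.\d{1,3}){3})\d{1,3}(?:\.\d{1,3}){3}$/;
console.log(separate.test("10.1.1.1"), combined.test("10.1.1.1")); // false false
console.log(separate.test("8.8.8.8"), combined.test("8.8.8.8"));   // true true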
gauthierm
commented
Sep 17, 2013
@dperini Can you assign a license to this? MIT or BSD?
nghuuphuoc
commented
Oct 2, 2013
+1 for the license information
dkart
commented
Oct 11, 2013
+1 for the license information from me, too
utopiaio
commented
Nov 1, 2013
+infinity on the license Diego
dperini
Nov 5, 2013
I have added the MIT License to the gist as requested.
Thank you all for the support.
raitucarp
commented
Nov 27, 2013
@dperini: Could you add support for URLs such as this? //dc8hdnsmzapvm.cloudfront.net/assets/styles/application.css Thanks
pjacobs
Dec 4, 2013
Is there a Java version of the regex available? That would be great for my android app!
phiyangt
Dec 10, 2013
@mparodi Ruby version untouched by markdown
/\A(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:\/[^\s]*)?\z/i
ixti
Dec 19, 2013
Ruby port:
class Regexp
PERFECT_URL_PATTERN = %r{
\A
# protocol identifier
(?:(?:https?|ftp)://)
# user:pass authentication
(?:\S+(?::\S*)?@)?
(?:
# IP address exclusion
# private & local networks
(?!10(?:\.\d{1,3}){3})
(?!127(?:\.\d{1,3}){3})
(?!169\.254(?:\.\d{1,3}){2})
(?!192\.168(?:\.\d{1,3}){2})
(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
|
# host name
(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)
# domain name
(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*
# TLD identifier
(?:\.(?:[a-z\u00a1-\uffff]{2,}))
)
# port number
(?::\d{2,5})?
# resource path
(?:/[^\s]*)?
\z
}xi
end
And specs:
# encoding: utf-8
require "spec_helper"
describe "Regexp::PERFECT_URL_PATTERN" do
[
"http://✪df.ws/123",
"http://userid:password@example.com:8080",
"http://userid:password@example.com:8080/",
"http://userid@example.com",
"http://userid@example.com/",
"http://userid@example.com:8080",
"http://userid@example.com:8080/",
"http://userid:password@example.com",
"http://userid:password@example.com/",
"http://142.42.1.1/",
"http://142.42.1.1:8080/",
"http://➡.ws/䨹",
"http://⌘.ws",
"http://⌘.ws/",
"http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens",
"http://foo.com/(something)?after=parens",
"http://☺.damowmow.com/",
"http://code.google.com/events/#&product=browser",
"http://j.mp",
"ftp://foo.bar/baz",
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
"http://مثال.إختبار",
"http://例子.测试"
].each do |valid_url|
it "matches #{valid_url}" do
expect(Regexp::PERFECT_URL_PATTERN =~ valid_url).to eq 0
end
end
[
"http://",
"http://.",
"http://..",
"http://../",
"http://?",
"http://??",
"http://??/",
"http://#",
"http://##",
"http://##/",
"http://foo.bar?q=Spaces should be encoded",
"//",
"//a",
"///a",
"///",
"http:///a",
"foo.com",
"rdar://1234",
"h://test",
"http:// shouldfail.com",
":// should fail",
"http://foo.bar/foo(bar)baz quux",
"ftps://foo.bar/",
"http://-error-.invalid/",
"http://a.b--c.de/",
"http://-a.b.co",
"http://a.b-.co",
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
"http://1.1.1.1.1",
"http://123.123.123",
"http://3628126748",
"http://.www.foo.bar/",
"http://www.foo.bar./",
"http://.www.foo.bar./",
"http://10.1.1.1",
"http://10.1.1.254"
].each do |invalid_url|
it "does not match #{invalid_url}" do
expect(Regexp::PERFECT_URL_PATTERN =~ invalid_url).to be_nil
end
end
end
jasonhzy
commented
Dec 30, 2013
Very good, thank you for sharing!
HenkPoley
Feb 9, 2014
I added support for punycoded domain names: https://gist.github.com/HenkPoley/8899766
dperini
Feb 14, 2014
Updated the gist with reductions/shortenings suggested by "jpillora".
Thank you !
dperini
Feb 14, 2014
raitucarp,
to do that you can change line 65 from:
"(?:(?:https?|ftp)://)" +
to
"(?:(?:(?:https?|ftp):)?//)" +
this way the protocol and colon become an optional match.
You can also just leave the double slash on that line if no URLs have the protocol prefix:
"//" +
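A sketch of that change applied to the compressed one-line JavaScript version at the top of the gist (only the protocol group is modified, everything else is copied verbatim), tested against the protocol-relative URL raitucarp asked about:

// scheme (and colon) made optional; the leading "//" is still required
var re_weburl_no_proto = /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$/i;
console.log(re_weburl_no_proto.test("//dc8hdnsmzapvm.cloudfront.net/assets/styles/application.css")); // true
console.log(re_weburl_no_proto.test("http://example.com/"));                                          // still true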
ghost
Mar 11, 2014
Why can't the maximum range for Unicode strings extend to U+10FFFF (instead of U+FFFF)?
stevenvachon
commented
Mar 24, 2014
What about relative URLs?
silentworks
Mar 24, 2014
@stevenvachon relative URLs wouldn't be URLs, they would be paths, which wouldn't need this validation at that point.
jkj
Jun 6, 2014
I recently needed this but have a dumb question. In the very last part for the resource path, why do you use [^\\s] rather than \\S? To my understanding they are equivalent, with the latter being a bit shorter.
dimroc
Jun 9, 2014
For the following Regex and the one pasted by ixti:
URL = /\A(?:(?:https?):\/\/)?(?:\S+(?::\S*)?@)?(?:(?:(?:[a-z0-9][a-z0-9\-]+)*[a-z0-9]+)(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*(?:\.(?:[a-z]{2,})(:\d{1,5})?))(?:\/[^\s]*)?\z/i
You will end up with extremely slow matching, to the point where you suspect an infinite loop, if you have a long subdomain for a URL ending with a period:
ie:
it { should_not match "http://aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.randomstring." }
The longer the subdomain "aaa....", the longer it'll take.
phiyangt
Jun 9, 2014
Fixed the URL Regex to make the subdomain match non-recursive thereby improving performance. Long story short: it passed our existing test suite and improved performance dramatically.
URL = /\A(?:(?:https?):\/\/)?(?:\S+(?::\S*)?@)?(?:(?:([a-z0-9][a-z0-9\-]*)?[a-z0-9]+)(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*(?:\.(?:[a-z]{2,})(:\d{1,5})?))(?:\/[^\s]*)?\z/i
brifordwylie
Jun 22, 2014
Anyone have a python port? My recollection was that the python regexp engine does have some differences.
CMCDragonkai
Jun 23, 2014
@dperini you should add support for 32bit addresses and ipv6 addresses.
https://news.ycombinator.com/item?id=7928990
I vote that this should be turned into a git repository with multi-language ports.
Feendish
Jun 23, 2014
I'm also using the top of the page gist regex in JS and finding it very slow to process long invalid URLs such as:
http://qweqweqweqwesadasdqweeqweqwsd
The more letters added there the slower the response.
It sounds like what @phiyangt is referring to above.
Is there any solution for this for JS?
Thanks.
dperini
Jun 24, 2014
Well, after a few tests I can say the slowdown and eventual browser crash is a Chrome-only problem.
I tried the same in Firefox and everything works correctly with these REGEXP, no problem or slowdown.
I have reduced the original REGEXP to a minimal to be able to show the problem.
Try the following line in Chrome console, it will crash the browser:
/^(?:\w+)(?:.(?:[\w]+-?)[\w]+)(?:.[a-z]{2,})$/i.test('www.isjdfofjasodfjsodifjosadifjsdoiafjaisdjfisdfjs');
So I believe this is just a bug in Chrome RE engine.
Feendish
Jun 25, 2014
Hi Diego,
Yeah I'm on latest stable Chrome (Version 35.0.1916.153 m).
This is the "bad" url I'm checking http://qweqweqweqwesadasdqweeqweqwsdqweqweqweqwesadasdqweeqweqwsd
The original regex I'm using (the one from the Gist on top - 1 liner or full version) locks the browser in Chrome as you say. It also locks up IE11.
In Firefox 29 it gave this error:
InternalError: an error occurred while executing regular expression
I updated to latest Firefox v30. The regex runs and gives false which is correct.
From some research online it appears Chrome does not halt execution when there is catastrophic backtracking in a regex. Safari, Firefox and IE could just report 'no match' after some arbitrary number of backtracks.
I also tried your recent regex above and it doesn't lock any browsers.
However it returns true for 'isjdfofjasodfjsodifjosadifjsdoiafjaisdjfisdfjs' which is invalid.
It also returns false for 'http://isjdfofjasodfjsodifjosadifjsdoiafjaisdjfisdfjs.com' which is incorrect.
Are you sure there isn't a runaway loop in there somewhere?
dperini
Jun 25, 2014
@Feendish
I don't know why copying and pasting the above RE in the Chrome console mangles some characters; as pasted it doesn't actually crash the console of the browser window.
Please try to cut and paste the RE from this tweet:
https://twitter.com/diegoperini/status/481449088270229504
I retested it and it actually crashes the console in that it doesn't answer to commands anymore after running that RE test that you can find in the above tweet.
The fact that the original RE also works on Safari pushes me to believe it's a Chrome problem but I need to do more tests. The "weburl" RE also works in PHP and other environments.
I am testing on the same Chrome Version 35.0.1916.153 under OS X 10.9.3.
Suggestion and help on this matter are welcome !
mathiasbynens
Jun 26, 2014
@dperini This seems to be a V8 issue. Relevant bug ticket: https://code.google.com/p/v8/issues/detail?id=430
Feendish
Jun 26, 2014
@dperini I ran the RE from the tweet in RegexBuddy analyser and it says "Your regular expression leads to "catastrophic backtracking", making it too complex to be run to completion."
It locks up Chrome & Opera but not Firefox. As the ticket @mathiasbynens linked to suggests, certain browsers are more lenient when catastrophic backtracking happens. Chrome V8 seems to not have any fail limit for this and puts the onus on the regex format.
Feendish
commented
Jun 30, 2014
Sure sent it there now. Thanks.
EtaiG
Jul 2, 2014
@dperini, we've found this issue too... looks like there's a highly exponential recursion into infinity on simple strings.
I've managed to reduce this to the way the hostname check is written (since it's followed later (eventually) by TLD).
It's this simple format that will cause the problem:
var regx = new RegExp('^(\\w+)*[^\\w]$');
regx.test('aaaaaaaaaaaaaaaaaaaaaaaaaa'); //chrome will crash
In other words, when you have a repeat of something 1 -> infinity times, and this group is repeated 0 -> infinity times, and the next match is for anything not in the group (obviously... but I put [^\w] just to illustrate), then Chrome will keep recursing to search for a possible group of (1->n) which repeats (0->m) times which has that letter matching.
Of course, internally, the regex should first be run 'greedily' to check if there's a possible match by making sure required letters are there..
Essentially, if I were to write the implementation for a regex, when encountering such a group, I would internally be doing this:
var regx = new RegExp('^(?=\w*[^\w])(?:\w+)*[^\w]$');
regx.test('aaaaaaaaaaaaaaaaaaaaaaaaaa'); //chrome will not crash
because first I'm doing a positive lookahead to check if this is even possible... though the complexity for this rises as the nested groups become more complex
Finally, I think this can be fixed here, by changing the host name from:
(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)
to:
(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)
which is really the same thing, if you think about it.
EtaiG
commented
Jul 2, 2014
@dperini, we've found this issue too... looks like there's a highly exponential recursion into infinity on simple strings. I've managed to reduce this to the way the hostname check is written (since it's followed later (eventually) by TLD).
In other words, when you have a repeat of something 1 -> infinity times, and this group is repeated 0->infinity times, and the next match is for anything not in the group (obviously... but I put [^w] just to illustrate), then chrome will keep recursion to search for a possible group of (1->n) which repeats (0->m) times which has that letter matching. Of course, internally, the regex should first be run 'greedily' to check if there's a possible match by making sure required letters are there.. Essentially, if I were to write the implementation for a regex, when encountering such a group, I would internally be doing this:
because first I'm doing a positive lookahead to check if this is even possible... though the complexity for this rises as the nested groups become more complex Finally, I think this can be fixed here, by changing the host name from:
to:
which is really the same thing, if you think about it. |
This comment has been minimized.
Show comment
Hide comment
This comment has been minimized.
Show comment Hide comment
EtaiG
Jul 2, 2014
In fact, I believe the whole host-domain-TLD identifier is the same as this (but this should be more performant and not crash):
// host name
"(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+" +
// domain name
"(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9])*" +
// TLD identifier
"\\.[a-z\\u00a1-\\uffff]{2,}" +
There's no need to add non-capturing groups if you're not doing anything with the group... if you plan to modify a group with a repeater, lookahead or just use an OR operator in it, then use a group, but otherwise there's really no point (since all you want, is to make sure everything in the group is present... which you don't need to use a group for!)
dperini
Jul 4, 2014
Thank you @EtaiG,
your expression looks good too.
However I have been pushed to "re-read" the specifications thoroughly and was answered on a V8 ticket here: https://code.google.com/p/v8/issues/detail?id=430
In post #21 @erik suggested I consider rewriting the labels matching parts using lookahead.
Since most wanted a Javascript to use as a pattern checking inputs I did tests in Javascript only.
This is the result of following his advice, no ftp protocol no special IP handling, only the minimal:
var re_weburl = new RegExp(
"^" +
// protocol identifier (optional) + //
"(?:(?:https?:)?//)?" +
// user:pass authentication (optional)
"(?:\\S+(?::\\S*)?@)?" +
// host (optional) + domain + tld
"(?:(?!-)[-a-z0-9\\u00a1-\\uffff]*[a-z0-9\\u00a1-\\uffff]+(?!./|\\.$)\\.?){2,}" +
// server port number (optional)
"(?::\\d{2,5})?" +
// resource path (optional)
"(?:/\\S*)?" +
"$", "i"
);
This RE fits in a tweet ! But let's see how it works for you.
I also replaced [^\s] with \S as suggested by @jkj and relaxed the match on protocol identifiers.
Consecutive hyphens are allowed by the specifications, but they must not be found in both the 3rd and 4th positions; those sequences are reserved for "xn--" and similar ASCII Compatible Encodings. If that exclusion were necessary, maybe a simple lookahead (?!..--) would help there too.
EtaiG
Jul 5, 2014
@dperini , thanks for responding.
I read all the specifications too last week (RFC's 5890 - 5894 and RFC 3492, several times), due to this issue. I'm also poster #24 in the google v8 thread.
Please note that I will be analysing this issue in depth below, and if I come off critical - that is not my intent, so I apologize in advance.
I disagree with the negative lookaheads. There are rare cases when they are truly useful.
I believe in minimizing them whenever possible, especially when repeating something up to an 'infinite' amount of times, since they can cause dreadful performance for complicated matches..
I like being more explicit about the regex- which may make it more verbose, but it's very clear what the javascript engine needs to do to match it.
For example, when you have:
// host (optional) + domain + tld
"(?:(?!-)[-a-z0-9\\u00a1-\\uffff]*[a-z0-9\\u00a1-\\uffff]+(?!./|\\.$)\\.?){2,}" +
This part can match long strings in too many different ways, and the regex is too general, so for characters which would match both the first character group and the second (namely, almost anything except for a dot and a hyphen), it can match an exponential number of times.
For example, it can match 'ab' as:
a b | ab
and it can match 'abc' as:
a b c | a bc | abc | ab c
and it can match 'abcd' as:
a b c d | a b cd | a bc d | a bcd | abcd | ab c d | ab cd | abc d
It's easy to see that for a string of length n, it has 2^(n-1) possible matches.
The way a greedy quantifier works is that it will stop as soon as it finds a possible match - otherwise it will try the next possibility in order to continue matching the regular expression.
This means that a sufficiently long string (i.e n = 21) which would result in a non-match, such as:
'aaaaaaaaaaaaaaaaaaaa.' (note the period at the end)
can cause it to take extremely long, and possibly crash (2^20 > 1,000,000)
Ignoring what's actually placed in memory and checked during a regex, by putting this in console, you can see what I mean:
var i=0, len = 2<<20;
console.time('test');
while(i<len){i++}
console.timeEnd('test');
// approximately 8s
You can test out your regex against that string (the one with the period at the end) and you'll see what I mean.
Also, note that 'aaaaaaaaaaaaaaaaaaaaaaaaaa' will match your regex although it's invalid.
This is because of the generalization of the check using greedy quantifiers, enabled by the negative lookahead (?!./|.$) (or by both of them?)
This is why I don't like negative lookaheads and prefer to be more declarative. You're almost forced to be more declarative when you don't use the negative lookaheads... but in the end, you are giving 'better instructions' to the javascript engine.
That's why I liked this better (for the host/domain/tld):
/(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9])*\.[a-z\u00a1-\uffff]{2,}/
Note that this is the same as what I posted above, with the exception of switching out the -? for -* (in both host and domain) to allow for as many hyphens in between letters.
This doesn't take care of the xn-- and 3rd/4th position issue, but unless you're allowing someone to register a domain by you, this is less of an issue (since for most cases, it's for a link, and people only need to link to something that is allowed and exists)... and even then, serverside validation would be necessary.
dperini
Jul 6, 2014
@EtaiG many thanks for the review and the good suggestions.
After trying myself your tweaks I have to completely agree with your points.
I still believe that by moving the dot matching to the end of the RE the host/domain/tld part can be reduced to only two main groups (since the only label we don't want followed by a dot is the TLD):
// host (optional) + domain + tld
"(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+\\.)+" +
"(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+" +
I am not sure I should consider digits as valid in the TLD group (also it is considered a label itself).
Now the tests do not lock up Chrome and it also seems the overall speed of URL validation is faster.
The gist has been corrected/updated so it doesn't lock up Chrome's JavaScript engine.
schbetsy
Jul 7, 2014
I believe the slash before query params is optional. http://www.example.com?a=1&b=2 should pass, but it currently does not.
Changing line 93 to
"(?:/?\\S*)?" +
solves that issue, but might break other query-parameter specifications that aren't covered in the test cases.
dperini
Jul 10, 2014
@schbetsy I am not sure it is optional either.
Anyway your change fixes that if it becomes necessary for some reader.
What I can see is that browsers accept that but then insert a slash into it afterwards.
I am curious to try the effects of this change on my current tests.
Thank you for pointing that out.
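For reference, the version of the gist currently at the top of this page uses "(?:[/?#]\\S*)?" for the resource part (see the "Made starting path slash optional" note in the header), so a query string without a leading slash already validates there; a quick check, assuming that re_weburl:

console.log(re_weburl.test("http://www.example.com?a=1&b=2")); // true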
eluck
commented
Aug 7, 2014
Hey @dperini, Thanks for your great work! Please note that this regex fails on the following url:
dperini
Aug 8, 2014
@eluck,
it is written in the comments: 'TLDs have been made mandatory, so single names like "localhost" fail'.
The regex was built to match URLs having a real domain name (at least 2 labels separated by a dot).
However it will be very easy to add 'localhost' as an acceptable exception.
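One possible way to allow that (a sketch, not part of the gist: it splices a literal "localhost" alternative into the host group of the compiled pattern; re_weburl is the regex defined at the top):

// add "localhost" as an extra alternative right before the hostname branch
var re_weburl_localhost = new RegExp(
  re_weburl.source.replace("(?:(?:[a-z", "localhost|(?:(?:[a-z"),
  "i"
);
console.log(re_weburl_localhost.test("http://localhost:8080/")); // true
console.log(re_weburl_localhost.test("http://example.com/"));    // still true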
lysk88
commented
Sep 3, 2014
Hey! Can you help me make this URI valid: "foo.com"? Thanks ahead!
adamrofer
Sep 8, 2014
PYTHON PORT (cc @brifordwylie):
import re
URL_REGEX = re.compile(
u"^"
# protocol identifier
u"(?:(?:https?|ftp)://)"
# user:pass authentication
u"(?:\S+(?::\S*)?@)?"
u"(?:"
# IP address exclusion
# private & local networks
u"(?!(?:10|127)(?:\.\d{1,3}){3})"
u"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
u"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
u"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
u"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
u"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
u"|"
# host name
u"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
# domain name
u"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
# TLD identifier
u"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
u")"
# port number
u"(?::\d{2,5})?"
# resource path
u"(?:/\S*)?"
u"$"
, re.UNICODE)
I did make one change: the "-*" in both domain and host was (incorrectly) succeeding against "http://a.b--c.de/" so I changed it to "-?" - I'm not sure why that's in the gist above, I'd think it would fail on a JS unit test also.
dperini
Sep 9, 2014
@adamrofer,
it seems the URL "http://a.b--c.de/" you are testing against is actually a valid URL.
As is "http://g--a.com/". Just test it, it exists and resolves correctly to a Georgia State page.
I have been directed to read the relevant specs here:
http://url.spec.whatwg.org/#concept-host-parser
and the validity criteria are here:
http://www.unicode.org/reports/tr46/#Validity_Criteria
Thank you for the Python port !
nghuuphuoc
Sep 13, 2014
@dperini
Can you support international URLs?
For example: http://xn--80aaxitdbjk.xn--p1ai
dperini
Sep 13, 2014
@nghuuphuoc,
the regexp already supports international URLs, just write them using natural UTF-8 encoding.
The following is the UTF-8 version of the URL you typed above:
http://папироска.рф
It would be hard to type or remember IDN URLs like the one you typed; nobody will.
This has been written to validate URLs typed by users and/or found in log files.
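A quick check of that (plain JavaScript, assuming the re_weburl from the top of the gist; the second URL is taken from ixti's test suite above):

console.log(re_weburl.test("http://папироска.рф"));  // true
console.log(re_weburl.test("http://مثال.إختبار"));    // true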
Arkni
commented
Sep 29, 2014
@dperini thanks for sharing
dsgn1graphics
Oct 1, 2014
@dperini,
I'm using the chai.js assert library to write a simple test for a JS object in my Rails app. This is for initial client-side form validation. Some of the URI formats tested in @ixti's spec above are failing to return false; here's the list.
"http://a.b--c.de/",
"http://-a.b.co",
"http://a.b-.co",
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
"http://1.1.1.1.1",
"http://123.123.123",
"http://3628126748",
"http://.www.foo.bar/",
"http://www.foo.bar./",
"http://.www.foo.bar./",
"http://10.1.1.1",
"http://10.1.1.254"
Here's my code
form_validators.coffee
#= require regex-weburl
class @FormValidators
uri: (uri)->
re_weburl.test(uri)
form_validators.js.coffee
#= require ../spec_helper
describe 'FormValidators', ->
describe '#uri', ->
beforeEach ->
@formValidators = new FormValidators()
it 'returns false for invalid urls', ->
assert.notOk @formValidators.uri("http://")
assert.notOk @formValidators.uri("http://.")
assert.notOk @formValidators.uri("http://..")
assert.notOk @formValidators.uri("http://../")
assert.notOk @formValidators.uri("http://?")
assert.notOk @formValidators.uri("http://??")
assert.notOk @formValidators.uri("http://??/")
assert.notOk @formValidators.uri("http://#")
assert.notOk @formValidators.uri("http://##")
assert.notOk @formValidators.uri("http://##/")
assert.notOk @formValidators.uri("http://foo.bar?q=Spaces should be encoded")
assert.notOk @formValidators.uri("//")
assert.notOk @formValidators.uri("//a")
assert.notOk @formValidators.uri("///a")
assert.notOk @formValidators.uri("///")
assert.notOk @formValidators.uri("http:///a")
assert.notOk @formValidators.uri("foo.com")
assert.notOk @formValidators.uri("rdar://1234")
assert.notOk @formValidators.uri("http:// shouldfail.com")
assert.notOk @formValidators.uri(":// should fail")
assert.notOk @formValidators.uri("http://foo.bar/foo(bar)baz quux")
assert.notOk @formValidators.uri("http://-error-.invalid/")
assert.notOk @formValidators.uri("http://a.b--c.de/")
assert.notOk @formValidators.uri("http://-a.b.co")
assert.notOk @formValidators.uri("http://a.b-.co")
assert.notOk @formValidators.uri("http://0.0.0.0")
assert.notOk @formValidators.uri("http://10.1.1.0")
assert.notOk @formValidators.uri("http://10.1.1.255")
assert.notOk @formValidators.uri("http://224.1.1.1")
assert.notOk @formValidators.uri("http://1.1.1.1.1")
assert.notOk @formValidators.uri("http://123.123.123")
assert.notOk @formValidators.uri("http://3628126748")
assert.notOk @formValidators.uri("http://.www.foo.bar/")
assert.notOk @formValidators.uri("http://www.foo.bar./")
assert.notOk @formValidators.uri("http://.www.foo.bar./")
assert.notOk @formValidators.uri("http://10.1.1.1")
assert.notOk @formValidators.uri("http://10.1.1.254")
Just thought I would take the time out to let you know. I'm not sure if something changed recently, or if you are even supporting this script anymore. Good work by the way, saved me a ton of time.
dsgn1graphics
Oct 1, 2014
@adamrofer fix of changing ( -* ) to ( -? ) in the host and domain name section fixed the js unit test for me
dperini
Oct 14, 2014
@dsgn1graphics,
I suggest you check your tests and/or the port of the Regular Expression you are currently using.
In the list of URLs failing validation that you sent above, only the first one is a valid URL ("http://a.b--c.de/"); all the others do not validate against the regex.
I tested them once more within my environment (Javascript) and everything works as expected.
Menelion
Oct 16, 2014
Thanks Diego for your hard work!
mattauckland
Oct 19, 2014
Hi @dperini
I love the expression, but I'm wondering what modification I would need to make to have the pattern ignore a URL if it is preceded by either a " or = or ] or > and followed by either a " or [/ or </
It is so that the following won't be validated:
[link=http://www.google.com]google.com[/link]
and
<a href="http://www.google.com">google.com</a>
The reason is that I currently use a modified version of Gruber's regex as part of a PHP auto-URL function in the following manner, but I would like to use yours instead:
// Regular expression for URLs
// Based on http://daringfireball.net/2010/07/improved_regex_for_matching_urls
// Improved to only pick up links beginning with http https ftp ftps mailto and www
$regex = "_(?i)\b((?:https?|ftps?|mailto|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))_iuS";
// If markup is TRUE, convert URLs to html markup
if ($markup == TRUE) $string = preg_replace_callback($regex, array(&$this, 'auto_url'), $string);
Thanks, Matt
mattauckland
Oct 19, 2014
Additionally, my thinking behind this question is to be able to allow the manual coding of links, using HTML or BBCode.
dperini
Nov 19, 2014
Matt,
just saw this ... as a quick suggestion you can try something like:
(?:\x22|\x3d|\x5d|\x3e)(?:regex-weburl)(?:\x22|\x5b\x2f|\x3c\x2f)
haven't tried it, not sure it does exactly what you asked/depicted.
It's a start anyway
dperini
Nov 19, 2014
Matt,
a better approach to match corresponding open/close brackets and quotes would require more work:
(?:\x5d(?:regex-weburl)\x5b\x2f)|
(?:\x3e(?:regex-weburl)\x3c\x2f)|
(?:\x22(?:regex-weburl)\x22)|
(?:\x3d(?:regex-weburl))
again, I haven't tested it.
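For illustration, a rough sketch of that alternation assembled in JavaScript, with a simplified placeholder for the full web-URL pattern (the weburl string below is hypothetical, not the gist regex), matching URLs that are already wrapped in markup so they can be skipped:
var weburl = "https?:\\/\\/\\S+?";                  // placeholder pattern, NOT the full gist regex
var wrapped = new RegExp(
  "(?:\\](" + weburl + ")\\[\\/)" +                 // [link=...]...[/link] style
  "|(?:>(" + weburl + ")<\\/)" +                    // <a href="...">...</a> style
  "|(?:\"(" + weburl + ")\")" +                     // quoted attribute value
  "|(?:=(" + weburl + "))"                          // bare attribute assignment
);
wrapped.test('<a href="http://www.google.com">google.com</a>'); // true (already marked up)
wrapped.test('plain text with http://www.google.com');          // false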
dperini
Nov 19, 2014
Oire,
yes I believe it would be a good idea to move this to a Git repo.
However, I disagree about having patterns that will never be typed by users, like "IPV6" and "PunyCode". I am most likely inclined to also remove IPV4 validation from the base regex; nobody remembers these numbers and they will most likely change over time.
Nobody will type/remember "PunyCode" URLs and the regex already supports international UTF-8 URLs.
The above is also true for decimal notations, various forms of IPV6 URLs and other "non-human" URLs.
MarQuisKnox
Jan 10, 2015
Thanks for sharing, Diego.
I put this in a repo: https://github.com/MarQuisKnox/regex-weburl.js
sanbor
Jan 16, 2015
Thanks @MarQuisKnox, @dperini and @mathiasbynens, it is really helpful!
Fleshgrinder
Jan 23, 2015
Hey guys, here is my extended version https://github.com/Fleshgrinder/php-url-validator
It builds upon your regular expression @dperini but has support for more features:
- IPv6 addresses (actual validation via filter_var).
- Punycode support.
- URLs which are not in NFC form are invalid.
- URLs with a dash on the third and fourth position are invalid.
Would you mind if I release my code with the Unlicense license? I used MIT because you used MIT, but I'm more into total freedom.
halloamt
Feb 4, 2015
Hi,
http://example.com./ is a valid URL, but the last dot is usually not written, by convention. See http://tools.ietf.org/html/rfc1035, paragraph 3.1.
http://en.wikipedia.org./wiki/Domain_name#Domain_name_syntax works in Firefox and IE
pbuyle
Feb 5, 2015
What's wrong with http://php.net/manual/en/function.parse-url.php ?
Veers01
Feb 6, 2015
Just a small comment about broadcast and network addresses: these addresses can be valid under CIDR. For example, if a provider has two classful networks like 205.151.128.0/24 and 205.151.129.0/24, they can combine the two into a classless network: 205.151.128.0/23. In that network, 205.151.128.255 and 205.151.129.0 are both valid and usable addresses.
ngduc
Feb 6, 2015
Is there a regex that can extract URLs from the cases below?
"http://google.com" (string contains double quotes)
'http://google.com' (string contains single quote)
[http://google.com] (string contains brackets)
http://google.com</br> (string contains html tags)
puzrin
Feb 12, 2015
http://markdown-it.github.io/linkify-it/ here is JS demo with full unicode support, including astral characters.
The final regexp is ~6K and generated automatically. The source is here: https://github.com/markdown-it/linkify-it/blob/master/lib/re.js. Since astral characters take 2 positions, a negated [^...] character class is impossible; negative lookahead is used instead.
NOTE: that package does fuzzy search, not strict validation. For strict validation, anchoring (^...$) is required.
johnjaylward
Mar 6, 2015
I changed the last block for the resource path to look like this:
(?:[/?#]\\S*)?
This will allow URLs like http://test.com#MyAnchor or http://test.com/whatever or http://test.com?some=query
While they may not technically be valid, they are something I could see a user typing, and most browsers will fix them. If a user copies such a URL out of and back into a browser, they may not notice anything wrong with it on visual inspection.
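A rough sketch of the effect, using a simplified host part (hypothetical stand-in patterns, not the full gist regex):
var pathSlashOnly = /^https?:\/\/[a-z0-9.-]+(?:\/\S*)?$/i;    // old style: path must start with "/"
var pathQueryHash = /^https?:\/\/[a-z0-9.-]+(?:[/?#]\S*)?$/i; // changed block: "/", "?" or "#" may start it
pathSlashOnly.test("http://test.com#MyAnchor");    // false
pathQueryHash.test("http://test.com#MyAnchor");    // true
pathQueryHash.test("http://test.com?some=query");  // true
pathQueryHash.test("http://test.com/whatever");    // true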
drjohn-97
Apr 7, 2015
This is exactly what I've been looking for.
Thank you. The only pattern it won't match for me (Using it in a Java Regex) is where the IP address is '0'(ZERO) padded, like:
http://096.004.012.125/index.html
Which I get as input from other tools.
Thanks again for the GREAT regex!!
barrygleeson
May 7, 2015
Anyone have a VB.NET port?
barrygleeson
May 9, 2015
'VB Port that handles domains with or without a hostname
Public Sub MatchUrl(url As String)
Dim rxs As String = ""
'protocol identifier
rxs = rxs + "(?:(?:https?)://)"
' user:pass authentication
rxs = rxs + "(?:\S+(?::\S*)?@)?"
rxs = rxs + "(?:"
'IP address exclusion
'private & local networks
rxs = rxs + "(?!(?:10|127)(?:\.\d{1,3}){3})"
rxs = rxs + "(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
rxs = rxs + "(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
'IP address dotted notation octets
'excludes loopback network 0.0.0.0
'excludes reserved space >= 224.0.0.0
'excludes network & broadcast addresses
'(first & last IP address of each class)
rxs = rxs + "(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
rxs = rxs + "(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
rxs = rxs + "(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
rxs = rxs + "|"
'host name
rxs = rxs + "(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
'domain name
rxs = rxs + "(?:(?:\.[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
' TLD identifier
rxs = rxs + "(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
rxs = rxs + ")"
' port number
rxs = rxs + "(?::\d{2,5})?"
' resource path
rxs = rxs + "(?:/\S*)?"
Dim rx As Regex = New Regex(rxs, RegexOptions.IgnoreCase)
Dim match As Match = rx.Match(url)
If match.Success Then
Console.WriteLine(match.Value.ToString)
Else
Console.WriteLine("not a match")
End If
End Sub
Mickael-van-der-Beek
May 18, 2015
I also discovered that underscores are not valid if you follow this RegExp.
e.g. the URL http://a_b.c.com will fail.
Here's a link to a relevant StackOverflow question:
http://stackoverflow.com/questions/2180465/can-hostname-subdomains-have-an-underscore-in-it
bluemoehre
May 21, 2015
This is my PHP port. I added (?=\s|$) to the end to prevent matches like http://foo.bar?param=meter (no path slash), and I added (?<=^|\s) at the beginning so it can be used within text. Additionally I reordered the hostname parts to get it working with preg_replace_callback (I had some backtrack-limit-exceeded errors):
[a-z\x{00a1}-\x{ffff}0-9]+(?:-[a-z\x{00a1}-\x{ffff}0-9]+)*
The full expression:
const RX_LINK_ALL = '#
(?<=^|\s)
(?:(?:https?|ftp)://)?
(?:\S+(?::\S*)?@)?
(?:
(?!10(?:\.\d{1,3}){3})
(?!127(?:\.\d{1,3}){3})
(?!169\.254(?:\.\d{1,3}){2})
(?!192\.168(?:\.\d{1,3}){2})
(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
|
(?:[a-z\x{00a1}-\x{ffff}0-9]+(?:-[a-z\x{00a1}-\x{ffff}0-9]+)*)
(?:\.[a-z\x{00a1}-\x{ffff}0-9]+(?:-[a-z\x{00a1}-\x{ffff}0-9]+)*)*
(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,}))
)
(?::\d{2,5})?
(?:/\S*)?
(?=\s|$)
#ux';
jnovack
May 23, 2015
10.1.1.255 is a VALID HOST IP for a host within a 10.1.0.0/22 subnet or larger.
- First IP: 10.1.0.1
- Last IP: 10.1.3.254
http://www.adminsub.net/ipv4-subnet-calculator/10.1.0.0/22
At a minimum, there are only two always-invalid IPs in the 10. subnet. I suggest only testing the following:
- 10.0.0.0 - subnet address in 10.0.0.0/8 (largest possible 10. subnet)
- 10.255.255.255 - broadcast address in 10.0.0.0/8 (largest possible 10. subnet)
- 10.1.1.256 - for validation testing.
danyboy85
Jun 1, 2015
Hi (I'm French),
I don't understand why it doesn't match my string, using the JavaScript version of the regex:
function fTest() {
var str = "aaa bbb ccc http://www.google.fr aaa bbb eee";
var res = str.match("/^(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:\/\S*)?$/i");
alert(res);
}
--> res is empty
Could anybody explain to me why it doesn't work?
Thanks!
Mickael-van-der-Beek
Jun 2, 2015
@danyboy85 This is because the RegExp is designed to validate whole strings, not to match URLs inside a string. The ^ at the start of the RegExp means the string must start with the URL protocol, and the $ at the end means the string must end with the URL pathname.
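A minimal sketch of the difference (simplified patterns, not the gist regex; note also that str.match should be given a RegExp literal or object rather than a quoted "/.../i" string):
var reValidate = /^https?:\/\/\S+$/i;  // anchored: the WHOLE string must be one URL
var reExtract  = /https?:\/\/\S+/gi;   // unanchored + global: find URLs inside longer text
reValidate.test("aaa bbb ccc http://www.google.fr aaa bbb eee");  // false
"aaa bbb ccc http://www.google.fr aaa bbb eee".match(reExtract);  // ["http://www.google.fr"]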
bmuessig
Jun 4, 2015
I am not sure if anybody mentioned it before, but some of the "invalid" URLs are in fact valid!
Here is an example:
http://example.org./ must parse as valid. http://example.org/ could parse as valid.
If you disagree, read the actual specifications. The domain should actually be suffixed by a dot.
peter-fu
Jun 8, 2015
Shouldn't this be valid?
peter-fu
Jun 8, 2015
Just noted the workaround provided by @johnjaylward worked.
skeller88
Jun 16, 2015
This regex and everyone's comments have been really informative! Thanks for writing this.
I'm confused about this regex's handling of UTF-8 characters. The RFC spec does not allow "" characters, so why does the regex use "" to match UTF-8 characters? From the spec:
" URI producing applications must not use percent-encoding in host unless it is used
to represent a UTF-8 character sequence. When a non-ASCII registered
name represents an internationalized domain name intended for
resolution via the DNS, the name must be transformed to the IDNA
encoding [RFC3490] prior to name lookup. URI producers should
provide these registered names in the IDNA encoding, rather than a
percent-encoding, if they wish to maximize interoperability with
legacy URI resolvers."
So, UTF-8 characters other than alphanumeric characters should be represented using % encoding and IDNA encoding. I'll post the regex I have in mind later on.
EDIT:
I answered my own question. Browsers reduce UTF-8 in URIs to punycode now, so from the perspective of the RFC spec, the URI actually sent over the wire will be valid.
dperini
Jun 23, 2015
Many thanks to everybody for the comments and the suggestions.
I have updated the gist:
- Made starting path slash optional (http://example.com?foo=bar)
- Allow a dot (.) at the end of hostnames (http://example.com.)
dperini
Jun 25, 2015
This is an answer to @halloamt & @muessigb questions.
They are related to having/allowing a trailing dot at the end of the hostname.
I answered this question previously on Twitter; here is an interesting link with additional info:
http://saynt2day.blogspot.it/2013/03/danger-of-trailing-dot-in-domain-name.html
The title of the article says it all: "The danger of the trailing dot in the domain name".
As you can see from the previous message I recently allowed it in my regular expression.
So be careful if you use a trailing dot at the end of the domain name, it may not work in all situations.
dmose
Jul 6, 2015
Looks like the "allowed a trailing dot" clause is missing a backslash in front of the dot, so it in fact allows a trailing character of any type, including whitespace, since that is the semantics of the . character in a RegExp.
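A small sketch of the escaping issue inside the string passed to new RegExp() (hypothetical minimal host pattern, just to show the difference):
var loose  = new RegExp("^example\\.com.?$");   // ".?"   matches any single trailing character
var strict = new RegExp("^example\\.com\\.?$"); // "\\.?" matches only an optional literal dot
loose.test("example.com ");  // true  (a trailing space slips through)
strict.test("example.com "); // false
strict.test("example.com."); // true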
You are correct @dmose, thank you for noticing that.
dperini
Jul 14, 2015
I added the following example URLs to my tests:
http://www.example.com.
http://www.example.com/
http://www.example.com?
http://www.example.com#
http://www.example.com./?#
All the above URLs are now passing the tests correctly!
sethnewton
Jul 16, 2015
@dperini: I don't believe your JavaScript one-liner will match the period in front of the TLD without two backslashes. I found this out the hard way when I put a question mark after the protocol match, making it optional, and discovered it was passing any word, e.g. "sethnewton".
I forked and made the change here: https://gist.github.com/sethnewton/9fe949bbc8edfe429232 ... hopefully it's of some use to you.
dperini
Jul 24, 2015
@sethnewton,
I did a cut&paste of the one liner in your gist inside my tests and most of the tests fail.
It seems you have added the double backslash in the wrong place (not after the TLD block).
If you look at the one-liner regular expression, there is no place where a backslash needs to be escaped.
It is only inside the new RegExp() constructor that it is necessary to double the backslashes (escape them).
nhahtdh
Jul 30, 2015
There is a subtle inefficiency in this construct:
(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+
On a string without any -, the regex degenerates to [a-z\\u00a1-\\uffff0-9]*[a-z\\u00a1-\\uffff0-9]+, which is of the form A*A*. It will cause quadratic complexity in the worst case. The effect is not very visible until the length of the non-matching string goes up to a few thousand to tens of thousands of characters.
This is my suggested fix:
[a-z\\u00a1-\\uffff0-9]+(?:-+[a-z\\u00a1-\\uffff0-9]+)*
It can only start and end with [a-z\\u00a1-\\uffff0-9], and any stretch of - or [a-z\\u00a1-\\uffff0-9] is still allowed. Likewise, the minimum matching length is still 1.
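A rough sketch of the difference with a simplified character class (the real gist class also includes \u00a1-\uffff; the input below is made up to force backtracking):
var slowLabel = /^(?:[a-z0-9]-*)*[a-z0-9]+$/;  // degenerates to A*A+ when there is no "-"
var fastLabel = /^[a-z0-9]+(?:-+[a-z0-9]+)*$/; // suggested rewrite
var bad = "a".repeat(20000) + "!";             // long non-matching input
slowLabel.test(bad);  // false, but with quadratic backtracking it is noticeably slow
fastLabel.test(bad);  // false almost immediately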
jbardu
Aug 5, 2015
Control-F Perl .. nothing.
A Perl version is the one-line JavaScript version with \x{00a1}-\x{ffff} instead of \u00a1-\uffff.
Tested against the test-case list and passed.
gburtini
Aug 7, 2015
This doesn't seem to allow http://3628126748
It is a decimal address which resolves to an IP owned by The Coca Cola Corp (not an internal IP).
bazzargh
Aug 7, 2015
The patterns for username/password are overly lax and allow you to put in almost anything as a url, if you finish with something that looks like @domain.name. eg re_weburl.test("http://127.0.0.1/@example.com"), or re_weburl.test("http://???/@example.com")
Mickael-van-der-Beek
Aug 17, 2015
@gburtini Actually although browsers allow and resolve URLs with IP addresses that are in hexadecimal, octal or without a dot-notation, these formats are made invalid in a URL by RFC 3986:
https://www.ietf.org/rfc/rfc3986.txt section 7.4 Rare IP Address Formats
yang7229693
Aug 27, 2015
'www.google.com' check failed
lucasvrm
Sep 1, 2015
Thanks for the great regex! I am trying to use it within a custom validation rule in Laravel.
But it's not validating anything at all...
My code below:
$regex = '_^(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]-)[a-z\x{00a1}-\x{ffff}0-9]+)(?:.(?:[a-z\x{00a1}-\x{ffff}0-9]-)[a-z\x{00a1}-\x{ffff}0-9]+)(?:.(?:[a-z\x{00a1}-\x{ffff}]{2,})).?)(?::\d{2,5})?(?:[/?#]\S)?$_iuS';
$rules = array('hewit' => array('required', 'regex:'. $regex));
Am I doing something wrong?
SarthakM9
Sep 14, 2015
Hello Diego,
Awesome work!!!
In reference to the link: https://mathiasbynens.be/demo/url-regex
These 2 test cases (should return false) are returning true:
http://a.b--c.de/
http://www.foo.bar./
worenga
Sep 18, 2015
Hi @dperini, very nice RegEx!
I ran the regex on thousands of user-input text samples, and it seems that URLs like http://google.de/)? are recognized by the regex; I'm not sure whether this is intended or not.
tomchentw
Oct 20, 2015
Thanks for the great work.
I also found that someone created a npm package for this gist: https://www.npmjs.com/package/url-regex
YanaSavchenko
Oct 20, 2015
Hi @dperini !
I have a question.
Your regexp for JS:
var urlReStr = '^(?:(?:https?|ftp)://)(?:\S+(?::\S_)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]-)[a-z\u00a1-\uffff0-9]+)_(?:.(?:[a-z\u00a1-\uffff]{2,})).?)(?::\d{2,5})?(?:[/?#]\S*)?$';
var urlRe = new RegExp(urlReStr, 'i');
urlRe.test(value)
It works perfectly for some tricky cases, but doesn't fail for such a simple case as http://dddddddddddddd.
Maybe I'm doing something wrong. Please give me advice.
sircharleswatson
Nov 16, 2015
Unfortunately... www.google.com - Fails
goa
Nov 23, 2015
https://www.youtube.com/watch?v=LrHx6Q_-tLU - Fails, although it is a valid YouTube url.
hassanila97
Dec 4, 2015
http://localhost fails; it's a crucial one.
shimondoodkin
Dec 27, 2015
@diegoperini's version converted to javascript:
var match_url_re=/^(?:(?:https?|ftp)://)(?:\S+(?::\S_)?@)?(?:(?!10(?:.\d{1,3}){3})(?!127(?:.\d{1,3}){3})(?!169.254(?:.\d{1,3}){2})(?!192.168(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)_(?:.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/[^\s]*)?$/i;
ivanderos
Dec 30, 2015
Great,
megamos
Jan 5, 2016
Thanks for this lovely gist!
umutm
Feb 18, 2016
Wouldn't it be better if we update the host name validation part from
(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)
into
(?:(?:[a-z\\u00a1-\\uffff0-9-_]-*)*[a-z\\u00a1-\\uffff0-9-_]+)
so that the sub-domains allow underscore character?
Edit: this update fails on http://test_domain.com but not on http://www.test_domain.com, and requires some improvement. Any suggestions?
derekshull
Mar 25, 2016
Right now http://www.google shows to be true and http://www.google_hello.com shows to be false.
klimashkin
May 4, 2016
http://xn--j1ail.xn--p1ai/ - fails, but it is a valid URL
mockdeep
May 12, 2016
@derekshull google_hello.com isn't a valid URL, since underscores aren't valid in domain names.
mockdeep
May 12, 2016
nm, I stand corrected.
jcannon98188
May 16, 2016
@mockdeep you were right, an underscore is not valid in a domain name, but it is valid in URIs. So www.google_test.com is not valid, but www.google.com/test_page is valid.
krowe-hsc
May 17, 2016
For anyone trying to get this to work with local host names (those without any '.'), replacing the final '*' with a '?' on line 90 did the trick for me.
tillkruss
Jul 5, 2016
The regexp sadly doesn't match Twitter's short links:
dperini
Jul 9, 2016
I have seen that some users (like @yang7229693 and @sircharleswatson) are trying to validate host names like:
www.google.com
without the required protocol identifier (scheme), so this will fail since these are not Web URLs.
In case you need to make the "protocol identifier" optional, change the scheme-related line at the beginning from:
// protocol identifier
"(?:(?:https?|ftp)://)" +
to the following (added a question mark to the end, before the closing double quote):
// protocol identifier
"(?:(?:https?|ftp)://)?" +
dperini
Jul 9, 2016
@umutm & @derekshull
as already said by user @mockdeep, the underscore character is not allowed by the specifications!
It is not supported either in the domain or in the sub-domain parts (even if you can configure your DNS to accept it).
dperini
Jul 9, 2016
@klimashkin
it works if you write it as plain UTF-8 instead of puny-encoded, I mean written like: http://кто.рф/
It has already been explained in previous comments: nobody will remember and type puny-encoded URLs.
Try to validate the http://кто.рф/ as is, you will see it passes the validation (see my last comment on 19 Nov 2014).
amogil
Jul 18, 2016
Made a gem for Ruby. Thanks, @dperini and @mathiasbynens!
kofifus
Jul 25, 2016
Validating without a protocol is tricky: just by adding the question mark above, you'll end up with any 'word.word' as a valid URL, which is usually not what you want.
I came up with the following, which is not perfect but works. It will approve http://www.google.bla, http://google.bla, www.google.bla and google.com, but not google.bla:
function isUrl(s) {
if (!isUrl.rx_url) {
// taken from https://gist.github.com/dperini/729294
isUrl.rx_url=/^(?:(?:https?|ftp):\/\/)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$/i;
// valid prefixes
isUrl.prefixes=['http:\/\/', 'https:\/\/', 'ftp:\/\/', 'www.'];
// taken from https://w3techs.com/technologies/overview/top_level_domain/all
isUrl.domains=['com','ru','net','org','de','jp','uk','br','pl','in','it','fr','au','info','nl','ir','cn','es','cz','kr','ua','ca','eu','biz','za','gr','co','ro','se','tw','mx','vn','tr','ch','hu','at','be','dk','tv','me','ar','no','us','sk','xyz','fi','id','cl','by','nz','il','ie','pt','kz','io','my','lt','hk','cc','sg','edu','pk','su','bg','th','top','lv','hr','pe','club','rs','ae','az','si','ph','pro','ng','tk','ee','asia','mobi'];
}
if (!isUrl.rx_url.test(s)) return false;
for (let i=0; i<isUrl.prefixes.length; i++) if (s.startsWith(isUrl.prefixes[i])) return true;
for (let i=0; i<isUrl.domains.length; i++) if (s.endsWith('.'+isUrl.domains[i]) || s.includes('.'+isUrl.domains[i]+'\/') ||s.includes('.'+isUrl.domains[i]+'?')) return true;
return false;
}
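A short usage sketch of the function above (expected results, assuming the prefix and TLD lists shown):
isUrl("http://www.google.bla"); // true  (known prefix "http://")
isUrl("www.google.bla");        // true  (known prefix "www.")
isUrl("google.com");            // true  (".com" is in the TLD list)
isUrl("google.bla");            // false (no known prefix and ".bla" is not in the list)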
amogil
Jul 26, 2016
In that case you have to have an up-to-date list of TLDs. It can be tricky. https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
luckydonald
Sep 21, 2016
The up-to-date official TLD list can be found at http://www.iana.org/domains/root/db, or at https://data.iana.org/TLD/tlds-alpha-by-domain.txt in machine-parsable form.
kewlcoder
Oct 15, 2016
I am using the expression mentioned above - the JS version - at the below tool and it says "Pattern Error" at the first character -
https://regex101.com/r/6hdF29/2
When I remove the 1st character i.e. '/', the error goes away, but no match is found for the following inputs -
http://face.com
https://face.com
https://www.face.com/
face.com
www.face.com
192.168.202.97
92.18.2.97
http://92.18.2.97
Please help !
marcopompili
Oct 16, 2016
@kewlcoder You have to remove the '/' delimiters; you should copy only the content between them when using the regex with regex101. Also, this regex is for matching one URL, not a list of them; you have to use the correct flags (img) for multiple matching: https://regex101.com/r/JmI8qG/1
stevenvachon
Dec 27, 2016
For those here that don't yet know, there is an implementation of the URL spec available via whatwg-url and in Node.js v7 (experimental). These discussions should probably move to one of these places.
spinus
Jan 4, 2017
@dperini, you excluded localhost and a few private networks from the "valid" set. If the reason for it is to discourage people from accessing "local" resources, I would add "localhost.localdomain" as well.
gajus
Jan 25, 2017
It might be worth stating that the current JavaScript regex version does not pass the https://mathiasbynens.be/demo/url-regex test.
spence
Feb 12, 2017
Published an Elixir library for this. Thanks @dperini and everyone here!
ValidUrl.validate("https://www.example.com")
anchev
Mar 1, 2017
Thanks. You should also add ftps and sftp. BTW, this test seems not quite up to date, because when testing in https://regex101.com/ these URLs show as valid, and according to Mathias they should not be:
http://a.b--c.de/
http://www.foo.bar./
and this one is invalid (according to Mathias it should be):
steelliberty
Mar 4, 2017
Hi. I am using the Python validators package and validators.url, which is based on your URL validator. When I use a URL with double dashes after the first word, for example http://www.word1--word2.com, it fails; however, there are many valid URLs out there with double dashes. Just wondering if you have any knowledge of that failure? Thank you.
evecalm
Mar 24, 2017
This regexp is incorrect. According to the specification RFC 3986, there are many more valid characters in the query string. A valid URL (like http://192.168.2.79:1300/onlineReader/Reader.jsp?url=http://192.168.2.79:1300/attach/downloadAttach?mn=1467617896) cannot pass through your regexp.
IMM0rtalis
Mar 27, 2017
I am trying to make this regex work within an XSD file. So far I am just disappointed with the restrictions of XSD. Even when I left out the negative lookahead for the IPs, I am still struggling with the unicode ranges inside the character classes.
What I have so far:
((https?|ftp)://)(\S+(:\S*)?@)?
(
([1-9]\d?|1\d\d|2[01]\d|22[0-3])(\.(1?\d{1,2}|2[0-4]\d|25[0-5])){2}(\.([1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
|
(([a-zA-Z\u00a1-\uffff0-9]-?)*[a-zA-Z\u00a1-\uffff0-9]+)(\.([a-zA-Z\u00a1-\uffff0-9]-?)*[a-zA-Z\u00a1-\uffff0-9]+)*(\.([a-zA-Z\u00a1-\uffff]{2,}))
)
(:\d{2,5})?(/\S*)?
The problem at the moment is line 5: [a-zA-Z\u00a1-\uffff0-9]. Does anybody have an idea?
gorurs
Jun 5, 2017
Hi Diego, is there a version of this regex for Java? Thanks,
pdalfarr
Jun 19, 2017
Hi,
about the https://mathiasbynens.be/demo/url-regex page:
the latest regex "@diegoperini (502 chars)" does not completely match the last valid example.
Indeed, there is a match, but it's
http://223.255.255.25
without the trailing '4'...?
Am I missing something? Why is that? Below is the Java Pattern I am using.
Thanks.
@ gorurs : here is Diego's regex as Java pattern:
Pattern.compile("(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s])?");
rkyoku
Jul 12, 2017
Shouldn't the regex only validate matching parentheses? I mean, syntactically it is valid to have an orphan parenthesis inside the URL, but most of the time that does not happen, whereas having a URL inside parentheses is pretty common:
This is a test (with url inside parenthesis: http://br.io). Done!
With this regex, the matched URL will be http://br.io), which is wrong. It should only validate matching parentheses, because that is the most common use case.
Any way around that? I am a reaaaaal beginner at regexes. Don't even know how to adapt this regex in order to achieve this.
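One rough way around it is a small post-processing step on the matched URL instead of changing the regex itself (hypothetical helper, not part of the gist):
function trimDanglingParen(url) {
  // drop a single trailing ")" when the URL contains no "(" to pair it with
  if (url.endsWith(")") && url.indexOf("(") === -1) {
    return url.slice(0, -1);
  }
  return url;
}
trimDanglingParen("http://br.io)");            // "http://br.io"
trimDanglingParen("http://example.com/(foo)"); // unchanged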
BlaM
Aug 14, 2017
@renaudparis I don't think you can make that assumption. There is no reason why you could not end up with a closing ")" in an ID (for example) - which does not make the URL invalid: http://example.com/?id=j378)cqv
rokoroku
Aug 17, 2017
Hmm, this regex does not match localhost URIs like
Az0res
Aug 17, 2017
Thanks for your work! Maybe you could add support for urls such as www.example.com (with no protocol defined, but www in front)?
cguillemette
Aug 21, 2017
@rokoroku: it does not consider localhost valid, per the documentation in the gist's comments.
See "TLDs have been made mandatory so single names like "localhost" fails".
dinbrca
Sep 11, 2017
Hello, thank you for the hard work. I would like to note that https://raaya_karas.carbonmade.com/ doesn't pass validation. Can you fix it? Thanks.
kaijMueller
Oct 14, 2017
@rokoroku: you can add a '?' to the end of line 92 so the TLD is optional:
"(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))?"
@dinbrca add the underscore to the host and domain regex:
"(?:(?:[a-z\u00a1-\uffff0-9](?:_|-)*)*[a-z\u00a1-\uffff0-9]+)"
"(?:\\.(?:[a-z\\u00a1-\\uffff0-9](?:_|-)*)*[a-z\\u00a1-\\uffff0-9]+)*"
According to https://www.w3.org/Addressing/URL/5_BNF.html a host or domain can contain some special characters like -_& but not so many like \u00a1-\uffff. Also, a host can end with a '-', which is not possible with this regex.
idanen
Oct 17, 2017
@dinbrca, I also had the same problem.
dperini
Nov 29, 2017
@gaius @anchev and @steelliberty
Two consecutive hyphens (http://g--a.com) are considered valid by the specification, see here:
https://stackoverflow.com/questions/16468309/can-domain-name-have-two-continuous-hyphens
This has been said repeatedly in this discussion, and a dot at the end of the domain is also perfectly valid:
http://www.google.com./
estebannn
Dec 11, 2017
Hi,
Trying to validate the following url:
http://my-test.dom.fr/the-content/lang/ext/?uri=ID:3200AA(01)
using
http://www.regexplanet.com/advanced/java/index.html
with the regex:
(?i)\b(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]-)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff]{2,})).?)(?::\d{2,5})?(?:[/?#]\S)?\b
I got only the group:
http://my-test.dom.fr/the-content/lang/ext/?uri=ID:3200AA(01, WITHOUT the closing ")".
Any idea? Thanks in advance!
Synchro
Dec 14, 2017
This is considered valid, even though it contains incorrectly URL-encoded elements:
http://example.com?a=%T=
I assume this is considered valid because the URL scheme itself doesn't care about higher-level concerns, however, the standard PHP http extension will fail to parse a URL containing such encoding errors, which is exactly the kind of thing I'm likely to be checking a URL for before trying to request it:
$req = new http\Client\Request('HEAD', 'http://example.com?a=%T=');
PHP Warning: http\Client\Request::__construct(): Failed to parse query; invalid percent encoding at pos 2 in 'a=%T='
Synchro
Dec 14, 2017
Some valid URL schemes are marked as invalid:
alpeshp
Dec 18, 2017
Hello sir,
I want to allow the user to enter the following URL schemes:
http://
https://
ftp://
ftps://
file://
market://
linkedin://
fb://
geo:
maps://
Does your RE accept URLs like geo:37.786971,-122.399677 and all the others?
Please let me know how I can use your RE.
Thanks and regards
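For what it's worth, a sketch of one way to adapt it (an assumption, not something the gist supports as-is): the schemes that use "://" can be added by widening the protocol group, but geo: has no "//" and no host at all, so the host-oriented regex here cannot validate it and a separate pattern is needed.

// in the re_weburl builder, the protocol identifier line
//   "(?:(?:https?|ftp)://)" +
// could be widened to something like
//   "(?:(?:https?|ftps?|file|market|linkedin|fb|maps)://)" +
// (scheme names taken from the question above)

// geo: URIs need their own check, e.g. a minimal latitude,longitude pattern:
var re_geo = /^geo:-?\d{1,2}(?:\.\d+)?,-?\d{1,3}(?:\.\d+)?$/;
console.log(re_geo.test("geo:37.786971,-122.399677")); // true

Note also that app schemes such as market:// or fb:// usually do not point at a dotted host with a TLD, so even with a wider scheme list the rest of re_weburl will reject most of those URLs.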
mnogueron
Jan 4, 2018
@pdalfarr Same problem for me, http://1.1.1.253
is not recognised even though it's a valid URL. In fact as soon as the last component of the IP has more than 2 digits, the last digit is not captured.
It seems that this part of the regex: (?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
should be reordered so that it captures the whole 253.
Here is my solution: (?:\.(?:25[0-4]|1\d\d|2[0-4]\d|[1-9]\d?))
Hope it helps other people! :)
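The reordering matters mainly when the pattern is used without the trailing $ anchor (extraction or partial matching); with the anchor in place the engine backtracks into the longer alternative anyway. A minimal sketch of the difference on just the last-octet fragment:

var lastOctetOld = /^(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))/;
var lastOctetNew = /^(?:\.(?:25[0-4]|1\d\d|2[0-4]\d|[1-9]\d?))/;
console.log(".253".match(lastOctetOld)[0]); // ".25"  (first, shorter alternative wins)
console.log(".253".match(lastOctetNew)[0]); // ".253" (longest alternative tried first)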
gajus
Jan 17, 2018
@Mickael-van-der-Beek regarding:
I also discovered that underscores are not valid if you follow this RegExp.
Just add _ to the 17th matching group, i.e.
- ^((https?|ftp):\/\/)(\S+(:\S*)?@)?((?!10(\.\d{1,3}){3})(?!127(\.\d{1,3}){3})(?!169\.254(\.\d{1,3}){2})(?!192\.168(\.\d{1,3}){2})(?!172\.(1[6-9]|2\d|3[0-1])(\.\d{1,3}){2})([1-9]\d?|1\d\d|2[01]\d|22[0-3])(\.(1?\d{1,2}|2[0-4]\d|25[0-5])){2}(\.([1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(([a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)(\.([a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)*(\.([a-z\u{00a1}-\u{ffff}]{2,})))(:\d{2,5})?(\/[^\s]*)?$
+ ^((https?|ftp):\/\/)(\S+(:\S*)?@)?((?!10(\.\d{1,3}){3})(?!127(\.\d{1,3}){3})(?!169\.254(\.\d{1,3}){2})(?!192\.168(\.\d{1,3}){2})(?!172\.(1[6-9]|2\d|3[0-1])(\.\d{1,3}){2})([1-9]\d?|1\d\d|2[01]\d|22[0-3])(\.(1?\d{1,2}|2[0-4]\d|25[0-5])){2}(\.([1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(([_a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)(\.([a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)*(\.([a-z\u{00a1}-\u{ffff}]{2,})))(:\d{2,5})?(\/[^\s]*)?$
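Small caveat on the diff above (an observation about JavaScript regex syntax, not about the change itself): the \u{00a1}-style escapes in those character classes only denote code points when the regex is compiled with the u flag.

// the "u" flag is required for \u{...} escapes to mean code points
var hostChar = /[_a-z\u{00a1}-\u{ffff}0-9]/u;
console.log(hostChar.test("_")); // true
console.log(hostChar.test("ü")); // true (U+00FC falls inside \u{00a1}-\u{ffff})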
eftychiaira
commented
Feb 13, 2018
How can I add another kind of URL to this regex?
wrone
Feb 21, 2018
Has anyone been able to use it in Salesforce Apex?
Update:
I removed the "\u00a1-\uffff" ranges and was able to execute it; code example:
String urlRegex = '^(?:(?:https?|ftp)://)(?:\\'+'S+(?::\\'+'S*)?@)?(?:(?!(?:10|127)(?:\\'+'.\\'+'d{1,3}){3})(?!(?:169\\'+'.254|192\\'+'.168)(?:\\'+'.\\'+'d{1,3}){2})(?!172\\'+'.(?:1[6-9]|2\\'+'d|3[0-1])(?:\\'+'.\\'+'d{1,3}){2})(?:[1-9]\\'+'d?|1\\'+'d\\'+'d|2[01]\\'+'d|22[0-3])(?:\\'+'.(?:1?\\'+'d{1,2}|2[0-4]\\'+'d|25[0-5])){2}(?:\\'+'.(?:[1-9]\\'+'d?|1\\'+'d\\'+'d|2[0-4]\\'+'d|25[0-4]))|(?:(?:[a-zA-Z0-9]-*)*[a-zA-Z0-9]+)(?:\\'+'.(?:[a-zA-Z0-9]-*)*[a-zA-Z0-9]+)*(?:\\'+'.(?:[a-zA-Z]{2,}))\\'+'.?)(?::\\'+'d{2,5})?(?:[/?#]\\'+'S*)?';
"\u00a1-\uffff" - these, as far as i understood, are some special characters. I believe they are rarely used, actually i have never seen urls with non-alphabetic character. Will try to use my version, should be enough I think
djowinz
Mar 6, 2018
It appears this still locks Chrome up when used from JavaScript.
Chrome Version: 64.0.3282.186 (Official Build) (64-bit)
code executed:
var re_weburl = new RegExp(
"^" +
// protocol identifier
"(?:(?:https?|ftp)://)" +
// user:pass authentication
"(?:\\S+(?::\\S*)?@)?" +
"(?:" +
// IP address exclusion
// private & local networks
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broacast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// host name
"(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)" +
// domain name
"(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*" +
// TLD identifier
"(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
// TLD may end with dot
"\\.?" +
")" +
// port number
"(?::\\d{2,5})?" +
// resource path
"(?:[/?#]\\S*)?" +
"$", "i"
);
re_weburl.test('http://www.Y6jp2mNKXucTNw8vkiu.75QF0dlTT9EgkqGt7Tr.eeZXNZLZqlHWJg0ewf9.p0H6nnxidoxp8Gkrpln.5Rl9eN5ZA0UYMSXfLeL.oRBQyrj3Pw8YM5pbcIO.ph7UoP01AsuKFiIYJSl');
I'm not sure what's causing it to break, but this should theoretically return false if I understand the regex correctly. Nonetheless, when using this regex someone could completely freeze their own application by supplying a URL made of enough period-separated groups; increasing the size of each group means fewer groups are needed to freeze the regex. It's a super edge case, but it can happen, and it "could" happen in the real world with bizarre sub-domains.
If I add a .com to the end of the string I provided, this works as expected, but if I keep increasing the number of groups the performance degrades again.
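The blow-up comes from the ambiguous quantifiers in the host-name and domain-name parts ("(?:[a-z...0-9]-*)*" followed by "[a-z...0-9]+"): each label can be split between the two pieces in many ways, and once the TLD group fails the engine retries every combination of splits across all labels. A sketch of one mitigation (an alternative label pattern, not the author's fix): write each label in the unambiguous letter-digit-hyphen form, which removes the ambiguity. ASCII-only here for brevity; the \u00a1-\uffff range could be re-added to each class.

var label = "[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?";
var re_host = new RegExp("^(?:" + label + "\\.)+[a-z]{2,}\\.?$", "i");

// fails fast instead of hanging on the pathological input above:
console.log(re_host.test(
  "www.Y6jp2mNKXucTNw8vkiu.75QF0dlTT9EgkqGt7Tr.eeZXNZLZqlHWJg0ewf9." +
  "p0H6nnxidoxp8Gkrpln.5Rl9eN5ZA0UYMSXfLeL.oRBQyrj3Pw8YM5pbcIO.ph7UoP01AsuKFiIYJSl"
)); // false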
athe0i
Mar 14, 2018
How about mine: https://regex101.com/r/yvPwkL/8/ ? The idea is to keep it somewhat short yet good enough, so it's not ideal (especially the IP-like parts).
Boheminsan
Apr 16, 2018
I adapted your code to my C# project. I'm grateful to you, Mr Perini. Thank you so much.
In PHP (for use with preg_match), this becomes:
Thanks for the regex Diego, I’ve added it to the test case and it seems to pass all the tests :) Nice job!