//
// Regular Expression for URL validation
//
// Author: Diego Perini
// Created: 2010/12/05
// Updated: 2018/09/12
// License: MIT
//
// Copyright (c) 2010-2018 Diego Perini (http://www.iport.it)
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// the regular expression composed & commented
// could be easily tweaked for RFC compliance,
// it was expressly modified to fit & satisfy
// these tests for a URL shortener:
//
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the UTF-8 char class takes into consideration the full Unicode range
// - TLDs have been made mandatory, so single names like "localhost" fail
// - protocols have been restricted to ftp, http and https only, as requested
//
// Changes:
//
// - IP address dotted notation validation, range: 1.0.0.0 - 223.255.255.255
//   first and last IP address of each class is considered invalid
//   (since they are broadcast/network addresses)
//
// - Added exclusion of private, reserved and/or local network ranges
// - Made starting path slash optional (http://example.com?foo=bar)
// - Allow a dot (.) at the end of hostnames (http://example.com.)
// - Allow an underscore (_) character in host/domain names
// - Check dot-delimited parts length and total length
// - Made protocol optional, allowed short syntax //
//
// Compressed one-line versions:
//
// Javascript regex version
//
// /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$/i
//
// PHP version (uses % symbol as delimiter)
//
// %^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\x{00a1}-\x{ffff}][a-z0-9\x{00a1}-\x{ffff}_-]{0,62})?[a-z0-9\x{00a1}-\x{ffff}]\.)+(?:[a-z\x{00a1}-\x{ffff}]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$%iuS
//
var re_weburl = new RegExp( | |
"^" + | |
// protocol identifier (optional) | |
// short syntax // still required | |
"(?:(?:(?:https?|ftp):)?\\/\\/)" + | |
// user:pass BasicAuth (optional) | |
"(?:\\S+(?::\\S*)?@)?" + | |
"(?:" + | |
// IP address exclusion | |
// private & local networks | |
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" + | |
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" + | |
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" + | |
// IP address dotted notation octets | |
// excludes loopback network 0.0.0.0 | |
// excludes reserved space >= 224.0.0.0 | |
// excludes network & broadcast addresses | |
// (first & last IP address of each class) | |
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" + | |
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" + | |
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + | |
"|" + | |
// host & domain names, may end with dot | |
// can be replaced by a shortest alternative | |
// (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+ | |
"(?:" + | |
"(?:" + | |
"[a-z0-9\\u00a1-\\uffff]" + | |
"[a-z0-9\\u00a1-\\uffff_-]{0,62}" + | |
")?" + | |
"[a-z0-9\\u00a1-\\uffff]\\." + | |
")+" + | |
// TLD identifier name, may end with dot | |
"(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" + | |
")" + | |
// port number (optional) | |
"(?::\\d{2,5})?" + | |
// resource path (optional) | |
"(?:[/?#]\\S*)?" + | |
"$", "i" | |
);
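A minimal usage sketch (not part of the original gist): the sample URLs below and their expected results are illustrative assumptions, checked only against re_weburl as defined above.

var samples = [
  "http://foo.com/blah_blah",                  // true
  "http://userid:password@example.com:8080",   // true
  "http://142.42.1.1:8080/",                   // true
  "//cdn.example.com/lib.js",                  // true  (protocol-relative short syntax)
  "http://10.1.1.1",                           // false (private network excluded)
  "http://localhost",                          // false (TLD is mandatory)
  "http://-error-.invalid/"                    // false (label cannot start with a hyphen)
];
samples.forEach(function (url) {
  console.log(re_weburl.test(url) + "\t" + url);
});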
I have added simple network ranges validation, the rules I used are: This is a very minimal list of tests to add to your testing: PASS FAIL Need testing :)
Need to mention I took the idea of validating the possible IP address ranges in the URL while looking at other developers' regular expressions I have seen in your tests, especially the one from @scottgonzales. He also sliced up the Unicode ranges :=), that's the reason his one is so long :)
Awesome stuff Diego!! |
Added IP address validation tweaking and optimizations suggested by @abozhilov |
Added exclusion of private, reserved, auto-configuration and local network ranges as described in the previous message. It is easy to just remove the unwanted parts of the validation to fit different scopes (length, precision) so I will probably add more options like the list of existing TLD (possibly grouped), the list of existing protocols and/or a fall back for a more generic protocol match too. |
Hey, just randomly came across this... my JavaScript URI parsing library does strict URI validation as per RFC 3986. It uses a much larger regular expression than this one. Code can be found at: https://github.com/garycourt/uri-js
I changed it a little bit so that it's valid in Ruby. Here it is: /\A(?:(?:https?|ftp)://)(?:\S+(?::\S_)?@)?(?:(?!10(?:.\d{1,3}){3})(?!127(?:.\d{1,3}){3})(?!169.254(?:.\d{1,3}){2})(?!192.168(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)_(?:.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/[^\s]*)?\z/i |
Hi Diego, Just came across this awesome code. I'd like to use this as a basis, and I'm hoping you can help me with a simple tweak. I'd like to let through URL's without the protocol specified (HTTP(S) or FTP). For some reason I can't seem to get it to work. Thanks, |
Hey Diego, nice work. You can make it a bit shorter, though:
Similarly with the 0.0.255.255 subnets
@dperini Can you assign a license to this? MIT or BSD? |
+1 for the license information |
+1 for the license information from me, too |
+infinity on the license Diego |
I have added the MIT License to the gist as requested. Thank you all for the support. |
@dperini: Could you add support for URLs such as this? //dc8hdnsmzapvm.cloudfront.net/assets/styles/application.css Thanks
Is there a Java version of the regex available? That would be great for my android app! |
@mparodi Ruby version untouched by markdown
|
Ruby port:
class Regexp
PERFECT_URL_PATTERN = %r{
\A
# protocol identifier
(?:(?:https?|ftp)://)
# user:pass authentication
(?:\S+(?::\S*)?@)?
(?:
# IP address exclusion
# private & local networks
(?!10(?:\.\d{1,3}){3})
(?!127(?:\.\d{1,3}){3})
(?!169\.254(?:\.\d{1,3}){2})
(?!192\.168(?:\.\d{1,3}){2})
(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
|
# host name
(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)
# domain name
(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*
# TLD identifier
(?:\.(?:[a-z\u00a1-\uffff]{2,}))
)
# port number
(?::\d{2,5})?
# resource path
(?:/[^\s]*)?
\z
}xi
end

And specs:
# encoding: utf-8
require "spec_helper"
describe "Regexp::PERFECT_URL_PATTERN" do
[
"http://✪df.ws/123",
"http://userid:password@example.com:8080",
"http://userid:password@example.com:8080/",
"http://userid@example.com",
"http://userid@example.com/",
"http://userid@example.com:8080",
"http://userid@example.com:8080/",
"http://userid:password@example.com",
"http://userid:password@example.com/",
"http://142.42.1.1/",
"http://142.42.1.1:8080/",
"http://➡.ws/䨹",
"http://⌘.ws",
"http://⌘.ws/",
"http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens",
"http://foo.com/(something)?after=parens",
"http://☺.damowmow.com/",
"http://code.google.com/events/#&product=browser",
"http://j.mp",
"ftp://foo.bar/baz",
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
"http://مثال.إختبار",
"http://例子.测试"
].each do |valid_url|
it "matches #{valid_url}" do
expect(Regexp::PERFECT_URL_PATTERN =~ valid_url).to eq 0
end
end
[
"http://",
"http://.",
"http://..",
"http://../",
"http://?",
"http://??",
"http://??/",
"http://#",
"http://##",
"http://##/",
"http://foo.bar?q=Spaces should be encoded",
"//",
"//a",
"///a",
"///",
"http:///a",
"foo.com",
"rdar://1234",
"h://test",
"http:// shouldfail.com",
":// should fail",
"http://foo.bar/foo(bar)baz quux",
"ftps://foo.bar/",
"http://-error-.invalid/",
"http://a.b--c.de/",
"http://-a.b.co",
"http://a.b-.co",
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
"http://1.1.1.1.1",
"http://123.123.123",
"http://3628126748",
"http://.www.foo.bar/",
"http://www.foo.bar./",
"http://.www.foo.bar./",
"http://10.1.1.1",
"http://10.1.1.254"
].each do |invalid_url|
it "does not match #{invalid_url}" do
expect(Regexp::PERFECT_URL_PATTERN =~ invalid_url).to be_nil
end
end
end |
Very good, thank you for sharing.
I added support for punycoded domain names: https://gist.github.com/HenkPoley/8899766 |
Updated the gist with reductions/shortenings suggested by "jpillora". Thank you ! |
raitucarp, to do that you can change line 65 from:
to
this way the protocol and colon become an optional match. You can also just leave the double slash on that line if no URLs have the protocol prefix:
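The exact before/after snippets referenced in the comment above were lost, so the following is a reconstruction (an assumption, not the author's literal diff). It only illustrates the difference between a required and an optional scheme, on simplified prefix patterns.

// Scheme mandatory (older behaviour described above):
var schemeRequired = new RegExp("^(?:(?:https?|ftp):\\/\\/)");
// Scheme optional, protocol-relative "//" still required (as in the current gist):
var schemeOptional = new RegExp("^(?:(?:(?:https?|ftp):)?\\/\\/)");

console.log(schemeRequired.test("//cdn.example.com/app.js")); // false
console.log(schemeOptional.test("//cdn.example.com/app.js")); // true
console.log(schemeOptional.test("cdn.example.com/app.js"));   // false, "//" is still mandatory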
Why can't the maximum range for Unicode strings extend to U0010ffff (instead of uffff)? |
What about relative URLs?
|
@stevenvachon relatives wouldn't be URLs they would be paths, which wouldn't need this validation at that point. |
I recently needed this but have a dumb question. In the very last part for the resource path, why do you use |
For the following Regex and the one pasted by ixti: URL = /\A(?:(?:https?):\/\/)?(?:\S+(?::\S*)?@)?(?:(?:(?:[a-z0-9][a-z0-9\-]+)*[a-z0-9]+)(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*(?:\.(?:[a-z]{2,})(:\d{1,5})?))(?:\/[^\s]*)?\z/i You will end up with extremely slow matching, to the point where you suspect an infinite loop, if you have a long subdomain for a URL ending with a period: ie: it { should_not match "http://aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.randomstring." } The longer the subdomain "aaa....", the longer it'll take. |
Fixed the URL Regex to make the subdomain match non-recursive thereby improving performance. Long story short: it passed our existing test suite and improved performance dramatically. URL = /\A(?:(?:https?):\/\/)?(?:\S+(?::\S*)?@)?(?:(?:([a-z0-9][a-z0-9\-]*)?[a-z0-9]+)(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*(?:\.(?:[a-z]{2,})(:\d{1,5})?))(?:\/[^\s]*)?\z/i |
Anyone have a python port? My recollection was that the python regexp engine does have some differences. |
@dperini you should add support for 32bit addresses and ipv6 addresses. https://news.ycombinator.com/item?id=7928990 I vote that this should be turned into a git repository with multi-language ports. |
I'm also using the top of the page gist regex in JS and finding it very slow to process long invalid URLs such as: The more letters added there the slower the response. It sounds like what @phiyangt is referring to above. Is there any solution for this for JS? Thanks. |
Well, after a few tests I can say the slowdown and subsequent browser crash is a Chrome-only problem. I have reduced the original REGEXP to a minimal version to be able to show the problem:
/^(?:\w+)(?:.(?:[\w]+-?)[\w]+)(?:.[a-z]{2,})$/i.test('www.isjdfofjasodfjsodifjosadifjsdoiafjaisdjfisdfjs');
So I believe this is just a bug in Chrome's RE engine.
Hi Diego, Yeah I'm on latest stable Chrome (Version 35.0.1916.153 m). This is the "bad" url I'm checking http://qweqweqweqwesadasdqweeqweqwsdqweqweqweqwesadasdqweeqweqwsd The original regex I'm using (the one from the Gist on top - 1 liner or full version) locks the browser in Chrome as you say. It also locks up IE11. In Firefox 29 it gave this error: I updated to latest Firefox v30. The regex runs and gives false which is correct. From some research online it appears Chrome does not halt execution when there is catastrophic backtracking in a regex. Safari, Firefox and IE could just report 'no match' after some arbitrary number of backtracks. I also tried your recent regex above and it doesn't lock any browsers. However it returns true for 'isjdfofjasodfjsodifjosadifjsdoiafjaisdjfisdfjs' which is invalid. Are you sure there isn't a runaway loop in there somewhere? |
@Feendish Please try to cut and paste the RE from this tweet: I retested it and it actually crashes the console in that it doesn't answer to commands anymore after running that RE test that you can find in the above tweet. The fact that the original RE also works on Safari pushes me to believe it's a Chrome problem but I need to do more tests. The "weburl" RE also work in PHP and other environments. I am testing on the same Chrome Version 35.0.1916.153 under OS X 10.9.3. Suggestion and help on this matter are welcome ! |
@dperini This seems to be a V8 issue. Relevant bug ticket: https://code.google.com/p/v8/issues/detail?id=430 |
@dperini I ran the RE from the tweet in RegexBuddy analyser and it says "Your regular expression leads to "catastrophic backtracking", making it too complex to be run to completion." It locks up Chrome & Opera but not Firefox. As the ticket @mathiasbynens linked to suggests, certain browsers are more lenient when catastrophic backtracking happens. Chrome V8 seems to not have any fail limit for this and puts the onus on the regex format. |
@Feendish |
Sure sent it there now. Thanks. |
@dperini, we've found this issue too... looks like there's a highly exponential recursion into infinity on simple strings. I've managed to reduce this to the way the hostname check is written (since it's followed later (eventually) by TLD).
In other words, when you have a repeat of something 1 -> infinity times, and this group is repeated 0->infinity times, and the next match is for anything not in the group (obviously... but I put [^w] just to illustrate), then chrome will keep recursion to search for a possible group of (1->n) which repeats (0->m) times which has that letter matching. Of course, internally, the regex should first be run 'greedily' to check if there's a possible match by making sure required letters are there.. Essentially, if I were to write the implementation for a regex, when encountering such a group, I would internally be doing this:
because first I'm doing a positive lookahead to check if this is even possible... though the complexity for this rises as the nested groups become more complex Finally, I think this can be fixed here, by changing the host name from:
to:
which is really the same thing, if you think about it. |
In fact, I believe the whole host-domain-TLD identifier is the same as this (but this should be more performant and not crash):
There's no need to add non-capturing groups if you're not doing anything with the group... if you plan to modify a group with a repeater, lookahead or just use an OR operator in it, then use a group, but otherwise there's really no point (since all you want, is to make sure everything in the group is present... which you don't need to use a group for!) |
Thank you @EtaiG. However, I have been pushed to "re-read" the specifications thoroughly and was answered on a V8 ticket here: https://code.google.com/p/v8/issues/detail?id=430 Since most wanted a JavaScript pattern to use for checking inputs, I did tests in JavaScript only. This is the result of following his advice: no ftp protocol, no special IP handling, only the minimal:
This RE fits in a tweet! But let's see how it works for you. I also changed [^\s] to \S as suggested by @jkj and relaxed the match on protocol identifiers. Consecutive hyphens are allowed by specifications but they must not be found in both 3rd and 4th positions; those sequences are reserved for "xn--" and similar ASCII Compatible Encodings. If that exclusion were necessary, maybe a simple lookahead (?|..--) would help there too.
@dperini , thanks for responding. Please note that I will be analysing this issue in depth below, and if I come off critical - that is not my intent, so I apologize in advance. I disagree with the negative lookaheads. There are rare cases when they are truly useful. I like being more explicit about the regex- which may make it more verbose, but it's very clear what the javascript engine needs to do to match it. For example, when you have:
This part can match long strings in too many different ways, and the regex is too general, so for characters which would match both the first character group and the second (namely, almost anything except for a dot and a hyphen), it can match an exponential number of times. For example, it can match 'ab' as: It's easy to see that for a string of length n, it has 2^(n-1) possible matches. The way a greedy quantifier works is that it will stop as soon as it finds a possible match - otherwise it will try the next possibility in order to continue matching the regular expression.
You can test out your regex against that string (the one with the period at the end) and you'll see what I mean. Also, note that 'aaaaaaaaaaaaaaaaaaaaaaaaaa' will match your regex although it's invalid. This is because of the generalization of the check using greedy quantifiers, enabled by the negative lookahead (?!./|.$) (or by both of them?) This is why I don't like negative lookaheads and prefer to be more declarative. You're almost forced to be more declarative when you don't use the negative lookaheads... but in the end, you are giving 'better instructions' to the javascript engine. That's why I liked this better (for the host/domain/tld):
Note that this is the same as what I posted above, with the exception of switching out the -? for -* (in both host and domain) to allow for as many hyphens in between letters. This doesn't take care of the xn-- and 3rd/4th position issue, but unless you're allowing someone to register a domain by you, this is less of an issue (since for most cases, it's for a link, and people only need to link to something that is allowed and exists)... and even then, serverside validation would be necessary. |
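A small illustrative sketch of the ambiguity described above (simplified patterns, not the exact ones from the gist; absolute timings will vary by engine). The first host pattern can split a run of letters between its inner "+" and outer "*" in exponentially many ways, so a failing input forces massive backtracking; the second matches the same labels but leaves only one way to parse them.

var ambiguous   = /^(?:(?:[a-z0-9]+-?)*[a-z0-9]+)\.[a-z]{2,}$/i;
var unambiguous = /^[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.[a-z]{2,}$/i;

// A failing input: one long label, a dot, and no TLD after it.
var evil = "aaaaaaaaaaaaaaaaaaaaaa."; // 22 "a"s; each extra one roughly doubles the ambiguous time

var t0 = Date.now();
console.log(unambiguous.test(evil), (Date.now() - t0) + "ms"); // false, effectively instant

var t1 = Date.now();
console.log(ambiguous.test(evil), (Date.now() - t1) + "ms");   // false, but noticeably slower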
@EtaiG many thanks for the review and the good suggestions.
I am not sure I should consider digits as valid in the TLD group (also it is considered a label itself). Now the tests do not lock up Chrome and it also seem the overall speed for URL validation is faster. |
The gist has been corrected/updated so it doesn't lock up Chrome's JavaScript engine.
I believe the slash before query params is optional. Changing line 93 to "(?:/?\\S*)?" + solves that issue, but might break other query-parameter specifications that aren't covered in the test cases. |
@schbetsy I am not sure it is optional either. |
Hey @dperini, Thanks for your great work! Please note that this regex fails on the following url: |
@eluck, |
Hey! can you help me make this URI valid "foo.com" thanks ahead! |
PYTHON PORT (cc @brifordwylie): import re
URL_REGEX = re.compile(
u"^"
# protocol identifier
u"(?:(?:https?|ftp)://)"
# user:pass authentication
u"(?:\S+(?::\S*)?@)?"
u"(?:"
# IP address exclusion
# private & local networks
u"(?!(?:10|127)(?:\.\d{1,3}){3})"
u"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
u"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
u"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
u"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
u"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
u"|"
# host name
u"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
# domain name
u"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
# TLD identifier
u"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
u")"
# port number
u"(?::\d{2,5})?"
# resource path
u"(?:/\S*)?"
u"$"
, re.UNICODE) I did make one change: the "-*" in both domain and host was (incorrectly) succeeding against "http://a.b--c.de/" so I changed it to "-?" - I'm not sure why that's in the gist above, I'd think it would fail on a JS unit test also. |
@adamrofer, |
@dperini |
@nghuuphuoc, |
@dperini thanks for sharing |
@dperini,
Here's my code: form_validators.coffee
form_validators.js.coffee
Just thought I would take the time out to let you know. I'm not sure if something changed recently, or if you are even still supporting this script. Good work by the way; it saved me a ton of time.
@adamrofer's fix of changing ( -* ) to ( -? ) in the host and domain name section fixed the JS unit test for me.
@dsgn1graphics, I tested them once more within my environment (Javascript) and everything works as expected. |
Thanks Diego for your hard work! |
Hi @dperini, I love the expression, but I'm wondering what modification I would need to make to have the pattern ignore a URL if it is preceded by either a " or = or ] or > and followed by either a " or [/ or </ It is so that the following won't be validated:
and
The reason is that I currently use a modified version of Gruber's regex as part of a PHP auto-URL function in the following manner, but I would like to use yours instead:
Thanks, Matt
Additionally, my thinking behind this question is to be able to allow the manual coding of links, using HTML or BBCode.
Matt, (?:\x22|\x3d|\x5d|\x3e)(?:regex-weburl)(?:\x22|\x5b\x2f|\x3c\x2f) haven't tried it, not sure it does exactly what you asked/depicted. |
Matt,
again, I haven't tested it. |
Oire, however I disagree about having patterns that will never be typed by users, like "IPV6" and "PunyCode". I am most likely inclined to also remove IPV4 validation from the base regex; nobody remembers these numbers and they will most likely change in time. Nobody will type/remember "PunyCode" URLs, and the regex already supports international UTF-8 URLs.
Thanks for sharing, Diego. |
Thanks @MarQuisKnox, @dperini and @mathiasbynens, it is really helpful! |
Hey guys, here is my extended version https://github.com/Fleshgrinder/php-url-validator
Would you mind if I release my code with the Unlicense license? I used MIT because you used MIT, but I'm more into total freedom. |
Hi, |
What's wrong with http://php.net/manual/en/function.parse-url.php ? |
Just a small comment about broadcast and network addresses: these addresses can be valid in a CIDR (classless) block. Ex: if a provider has two classful blocks like 205.151.128.0/24 and 205.151.129.0/24, they can combine the two into a classless network: 205.151.128.0/23. In that network, 205.151.128.255 and 205.151.129.0 are two valid and usable addresses.
Can any regex extract URLs from the cases below? "http://google.com" (string contains double quotes)
http://markdown-it.github.io/linkify-it/ is a JS demo with full Unicode support, including astral characters. The final regexp is ~6K and generated automatically. Src is here: https://github.com/markdown-it/linkify-it/blob/master/lib/re.js . Since astral characters take 2 positions, a [^negative] class is impossible; a negative lookahead is used instead. NOTE that the package does fuzzy search, not strict validation. For strict validation, (^...$) is required.
I changed the last block for the resource path to look like this:
This will allow URLs like http://test.com#MyAnchor or http://test.com/whatever or http://test.com?some=query. While they may not technically be valid, it is something I could see a user typing, and most browsers will fix it for them if they copy it out and back into a browser, so they may not know what's wrong with it upon visual inspection.
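The modified block quoted in the comment above was lost, but the version currently at the top of the gist already uses "(?:[/?#]\\S*)?" for the resource part, so a slash, query string or fragment may directly follow the host. A quick check (expectations are assumptions, verified only against re_weburl as published):

["http://test.com#MyAnchor",
 "http://test.com?some=query",
 "http://test.com/whatever"].forEach(function (url) {
  console.log(re_weburl.test(url), url); // all three log true
});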
This is exactly what I've been looking for. http://096.004.012.125/index.html Which I get as input from other tools. Thanks again for the GREAT regex!! |
anyone have a vb.net port? |
'VB Port that handles domains with or without a hostname
|
I also discovered that underscores are not valid if you follow this RegExp. The URL
will fail. Here's a link to a relevant StackOverflow question: http://stackoverflow.com/questions/2180465/can-hostname-subdomains-have-an-underscore-in-it |
This is my PHP port... I added I added Additionally I reordered the hostname parts to get it working with preg_replace_callback (I had some BACKTRACK LIMIT EXCEEDED errors).
The full expression:
http://www.adminsub.net/ipv4-subnet-calculator/10.1.0.0/22 At a minimum, there are only two always-invalid IPs in the
|
Hi,
function fTest() {
}
--> res is empty. Could anybody explain to me why it doesn't work? Thx!
@danyboy85 This is because the RegExp is conceived to validate strings and not to match URLs in a string. The
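A small sketch of that distinction, assuming re_weburl from the top of the gist is in scope (the derived re_findurl name is hypothetical, not part of the gist): to extract URLs from free text instead of validating a whole string, drop the "^"/"$" anchors and compile with the global flag.

var re_findurl = new RegExp(
  re_weburl.source.replace(/^\^/, "").replace(/\$$/, ""),
  "gi"
);
var text = "See https://example.com/docs and http://142.42.1.1:8080/ for details.";
console.log(text.match(re_findurl));
// -> [ "https://example.com/docs", "http://142.42.1.1:8080/" ]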
I am not sure if anybody mentioned it before, but some of the "invalid" URLs are in fact valid!
Shouldn't this be valid? |
Just noted the workaround provided by @johnjaylward worked. |
This regex and everyone's comments have been really informative! Thanks for writing this. I'm confused about this regex's handling of UTF-8 characters. The RFC spec does not allow "" characters, so why does the regex use "" to match UTF-8 characters? From the spec: " URI producing applications must not use percent-encoding in host unless it is used So, UTF-8 characters other than alphanumeric characters should be represented using % encoding and IDNA encoding. I'll post the regex I have in mind later on. _EDIT_ |
Many thanks to everybody for the comments and the suggestions. I have updated the gist:
|
This is an answer to the @halloamt & @muessigb questions.
The title of the article says it all: "The danger of the trailing dot in the domain name".
Looks like the "allowed a trailing dot" clause is missing a backslash in front of the dot, so it in fact allows a trailing character of any type, including whitespace, since that is the semantics of the . character in a RegExp. |
You are correct @dmose, thank you for noticing that. |
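A minimal sketch of why that backslash matters, using simplified standalone patterns rather than the real lines from the gist: an unescaped "." matches any character, so even a trailing space slips through, while "\\." only accepts a literal dot.

var tldUnescaped = new RegExp("^[a-z]{2,}.?$", "i");   // "."   == any character
var tldEscaped   = new RegExp("^[a-z]{2,}\\.?$", "i"); // "\\." == literal dot only

console.log(tldUnescaped.test("com ")); // true, the trailing space is accepted
console.log(tldEscaped.test("com "));   // false
console.log(tldEscaped.test("com."));   // true, the trailing dot is still allowed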
I added the following example URLs to my tests:
all the above URLs are now passing the tests correctly ! |
@dperini: I don't believe your javascript one liner will match against the period in front of the TLD without two backslashes. I found this out the hard way when I put a question mark after the protocol match, making it optional.... and discovered it was passing any word ex: sethnewton I forked and made the change here: https://gist.github.com/sethnewton/9fe949bbc8edfe429232 ... hopefully it's of some use to you. |
@sethnewton, |
There is a subtle inefficiency in this construct:
On a string without any This is my suggested fix:
It can only start and end with
Control-F Perl... nothing. A Perl version is the one-line JavaScript version with \x{00a1}-\x{ffff} instead of \u00a1-\uffff.
This doesn't seem to allow http://3628126748 It is a decimal address which resolves to an IP owned by The Coca Cola Corp (not an internal IP). |
The patterns for username/password are overly lax and allow you to put in almost anything as a url, if you finish with something that looks like @domain.name. eg re_weburl.test("http://127.0.0.1/@example.com"), or re_weburl.test("http://???/@example.com") |
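A sketch of one possible tightening, assuming re_weburl from the top of the gist is in scope; the stricter userinfo subpattern and the re_weburl_strict name are assumptions, not a fix adopted by the gist. The idea is to forbid "/", "?", "#" and "@" inside the user and password tokens so the userinfo group can no longer swallow "127.0.0.1/" up to the "@".

console.log(re_weburl.test("http://127.0.0.1/@example.com")); // true with the published pattern

var strictUserinfo = "(?:[^\\s/?#@]+(?::[^\\s/?#@]*)?@)?";
var re_weburl_strict = new RegExp(
  re_weburl.source.replace("(?:\\S+(?::\\S*)?@)?", strictUserinfo),
  "i"
);
console.log(re_weburl_strict.test("http://127.0.0.1/@example.com")); // false
console.log(re_weburl_strict.test("http://user:pass@example.com/")); // still true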
@gburtini Actually although browsers allow and resolve URLs with IP addresses that are in hexadecimal, octal or without a dot-notation, these formats are made invalid in a URL by RFC 3986: https://www.ietf.org/rfc/rfc3986.txt section |
'www.google.com' check failed |
Thanks for the great regex! I am trying to use it within a custom validation rule in Laravel. But it's not validating anything at all... My code below:
Am I doing something wrong?
Hello Diego, In reference to the link: https://mathiasbynens.be/demo/url-regex |
Hi @dperini, very nice RegEx! |
Thanks for the great work. |
Hi @dperini!
var urlRe = new RegExp(urlReStr, 'i');
It works perfectly for some tricky cases, but doesn't fail for such a simple case as http://dddddddddddddd.
Unfortunately... www.google.com - Fails |
https://www.youtube.com/watch?v=LrHx6Q_-tLU - Fails, although it is a valid YouTube url. |
http://localhost fails; it's a crucial one.
@diegoperini's version converted to javascript: var match_url_re=/^(?:(?:https?|ftp)://)(?:\S+(?::\S_)?@)?(?:(?!10(?:.\d{1,3}){3})(?!127(?:.\d{1,3}){3})(?!169.254(?:.\d{1,3}){2})(?!192.168(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)(?:.(?:[a-z\u00a1-\uffff0-9]+-?)[a-z\u00a1-\uffff0-9]+)_(?:.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/[^\s]*)?$/i; |
Great, |
Thanks for this lovely gist!
Wouldn't it be better if we update the host name validation part from Edit: this update fails on http://test_domain.com but not on http://www.test_domain.com and requires some improvement. Any suggestions?
Right now http://www.google shows to be true and http://www.google_hello.com shows to be false. |
http://xn--j1ail.xn--p1ai/ fails, but it is a valid URL.
@derekshull |
nm, I stand corrected. |
@mockdeep you were right, an underscore is not valid in a domain name, but it is valid in URIs. So www.google_test.com is not valid, but www.google.com/test_page is valid.
For anyone trying to get this to work with local host names (those without any '.'), replacing the final '*' with a '?' on line 90 did the trick for me. |
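A sketch of one way to accept single-label hosts such as localhost without touching the published pattern, assuming re_weburl from the top of the gist is in scope; re_singlelabel and isWebUrl are hypothetical names, and the gist itself deliberately keeps the TLD mandatory.

var re_singlelabel = /^(?:(?:https?|ftp):)?\/\/[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?(?::\d{2,5})?(?:[/?#]\S*)?$/i;

function isWebUrl(url) {
  // Strict check first, then fall back to the single-label form.
  return re_weburl.test(url) || re_singlelabel.test(url);
}

console.log(isWebUrl("http://localhost:8080/admin")); // true, via the fallback
console.log(isWebUrl("http://example.com/"));         // true, via re_weburl
console.log(isWebUrl("http//broken"));                // false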
The regexp sadly doesn't match Twitter's short links:
|
I have seen that some users (like @yang7229693 and @sircharleswatson) are trying to validate host names like:
without a needed protocol identifier (schema) so this will fail since these are not Web URLs. In case you need to make the "protocol identifier" optional change the schema related line at the beginning from:
to the following (added a question mark to the end, before the closing double quote):
|
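The literal before/after lines were lost from the comment above, and they refer to an older revision of the code; against the current version at the top of the gist, the equivalent change is to append a question mark to the whole protocol group. The sketch below (an assumption) reconstructs that effect by patching the source string. Note that with the prefix fully optional, any word.word string will also validate, which may not be what you want.

var re_weburl_optional = new RegExp(
  re_weburl.source.replace(
    "(?:(?:(?:https?|ftp):)?\\/\\/)",   // current prefix: "//" still required
    "(?:(?:(?:https?|ftp):)?\\/\\/)?"   // variant: prefix entirely optional
  ),
  "i"
);

console.log(re_weburl.test("www.google.com"));          // false, a scheme or "//" is required
console.log(re_weburl_optional.test("www.google.com")); // true
console.log(re_weburl_optional.test("google"));         // still false, a TLD is still required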
@klimashkin |
Made a gem for ruby. Thanks, @dperini and @mathiasbynens! |
validating without a protocol is tricky .. just by adding the question mark above you'll end up with any 'word.word' as a valid url which is usually not what you want. I came up with the following, which is not perfect but works.. it will approve http://www.google.bla http://google.bla www.google.bla and google.com but not google.bla
|
In that case you have to have an up-to-date list of TLDs. It can be tricky. https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
An up-to-date official TLD list can be found at http://www.iana.org/domains/root/db, or in machine-parsable form at https://data.iana.org/TLD/tlds-alpha-by-domain.txt.
I am using the expression mentioned above - the JS version - at the below tool and it says "Pattern Error" at the first character - Please help ! |
@kewlcoder You have to remove '/', you should copy only the content between '/' if you are using a regex with regex101. Also this regex is for matching one URL not a list of them, you have to use the correct flags (img) for multiple matching: https://regex101.com/r/JmI8qG/1 |
For those here that don't yet know, there is an implementation of the URL spec available via whatwg-url and in Node.js v7 (experimental). These discussions should probably move to one of these places. |
@dperini, you excluded localhost and a few private network ranges from the "valid" set. If the reason for it is to discourage people from accessing "local" resources, I would add "localhost.localdomain" as well.
It might be worth stating that the current JavaScript regex version does not pass the https://mathiasbynens.be/demo/url-regex test. |
Published an Elixir library for this. Thanks @dperini and everyone here! ValidUrl.validate("https://www.example.com")
Thanks. You also need to add ftps and sftp. BTW this test does not seem quite up to date, because when testing in https://regex101.com/ these URLs show as valid although according to Mathias they should not be: http://a.b--c.de/ and this one is invalid (according to Mathias it should be):
Hi. I am using Python validators and validators.url, which is based on your URL validator. When I use a URL with double dashes after the first word, for example http://www.word1--word2.com, it fails; however, there are many valid URLs out there with double dashes. Just wondering if you are aware of that failure? Thank you.
This regexp is incorrect. According to the RFC 3986 specification, there are many more valid chars in the query string. A valid URL (like http://192.168.2.79:1300/onlineReader/Reader.jsp?url=http://192.168.2.79:1300/attach/downloadAttach?mn=1467617896) could not pass through your regexp.
I am trying to make this regex work with an XSD file. So far I am just disappointed with the restrictions of XSD. Even when I left out the negative lookaheads for the IPs, I am still struggling with the Unicode ranges inside the character classes. What I have so far:
The problem is line 5 atm: [a-zA-Z\u00a1-\uffff0-9] Does somebody have any idea?
Hi Diego, Is there a version of this regex for Java? Thanks, |
Hi, About https://mathiasbynens.be/demo/url-regex page, indeed, there is a match, but it's
without the trailing '4'...? Am I missing something? Below the Java Pattern I am using. Thanks. Why is that? @ gorurs : here is Diego's regex as Java pattern: Pattern.compile("(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s])?"); |
Shouldn't the regex only validate matching parentheses? I mean, syntactically it is valid to have orphan parentheses inside the URL, but most of the time it does not happen, whereas having a URL inside parentheses is pretty common: With this regex, the matched URL will be Any way around that? I am a reaaaaal beginner at regexes. Don't even know how to adapt this regex in order to achieve this.
@RenaudParis I don't think you can make that assumption. There is no reason why you could not end up with a closing ")" in an ID (for example) - which does not make the URL invalid: http://example.com/?id=j378)cqv |
Hmm this regex does not match localhost uri like |
Thanks for your work! Maybe you could add support for urls such as www.example.com (with no protocol defined, but www in front)? |
@rokoroku: it does not consider localhost valid, per the documentation in the gist's comments.
Hello, thank you for the hard work. I would like to note that https://raaya_karas.carbonmade.com/ doesn't pass validation. Can you fix it? Thanks
@rokoroku: you can add a '?' to the end of line 92 so the TLD is optional: According to https://www.w3.org/Addressing/URL/5_BNF.html, a host or domain can contain some special characters like -_& but not so much like
@dinbrca, I also had the same problem. |
@gaius @AnChEv and @steelliberty
this has been said repeatedly in this discussion, and also a dot at the end of the domain is perfectly valid:
Hi, Trying to validate the following url: with the regex: I got only the group : http://my-test.dom.fr/the-content/lang/ext/?uri=ID:3200AA(01 WITHOUT ")" ????? Any idea ? Thanks in advance ! |
This is considered valid, even though it contains incorrectly URL-encoded elements:
I assume this is considered valid because the URL scheme itself doesn't care about higher-level concerns, however, the standard PHP http extension will fail to parse a URL containing such encoding errors, which is exactly the kind of thing I'm likely to be checking a URL for before trying to request it:
|
Some valid URL schemes are marked as invalid:
|
Hello sir, I want to allow the user to enter the following URL schema: http:// Does your RE accept URLs like geo:37.786971,-122.399677 and all the others? Please let me know how I can use your RE. Thanks and regards.
@pdalfarr Same problem for me, It seems that this part of the regex: Hope it helps other people! :) |
@Mickael-van-der-Beek regarding:
Just add - ^((https?|ftp):\/\/)(\S+(:\S*)?@)?((?!10(\.\d{1,3}){3})(?!127(\.\d{1,3}){3})(?!169\.254(\.\d{1,3}){2})(?!192\.168(\.\d{1,3}){2})(?!172\.(1[6-9]|2\d|3[0-1])(\.\d{1,3}){2})([1-9]\d?|1\d\d|2[01]\d|22[0-3])(\.(1?\d{1,2}|2[0-4]\d|25[0-5])){2}(\.([1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(([a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)(\.([a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)*(\.([a-z\u{00a1}-\u{ffff}]{2,})))(:\d{2,5})?(\/[^\s]*)?$
+ ^((https?|ftp):\/\/)(\S+(:\S*)?@)?((?!10(\.\d{1,3}){3})(?!127(\.\d{1,3}){3})(?!169\.254(\.\d{1,3}){2})(?!192\.168(\.\d{1,3}){2})(?!172\.(1[6-9]|2\d|3[0-1])(\.\d{1,3}){2})([1-9]\d?|1\d\d|2[01]\d|22[0-3])(\.(1?\d{1,2}|2[0-4]\d|25[0-5])){2}(\.([1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(([_a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)(\.([a-z\u{00a1}-\u{ffff}0-9]+-?)*[a-z\u{00a1}-\u{ffff}0-9]+)*(\.([a-z\u{00a1}-\u{ffff}]{2,})))(:\d{2,5})?(\/[^\s]*)?$
|
How do I add another kind of URL to this regex?
Has anyone been able to use it in Salesforce Apex? upd: "\u00a1-\uffff" - these, as far as I understood, are some special characters. I believe they are rarely used; actually, I have never seen URLs with non-alphabetic characters. Will try to use my version; it should be enough, I think.
It appears this still locks Chrome up when used from JavaScript. Chrome Version: 64.0.3282.186 (Official Build) (64-bit). Code executed:
var re_weburl = new RegExp(
"^" +
// protocol identifier
"(?:(?:https?|ftp)://)" +
// user:pass authentication
"(?:\\S+(?::\\S*)?@)?" +
"(?:" +
// IP address exclusion
// private & local networks
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broadcast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// host name
"(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)" +
// domain name
"(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*" +
// TLD identifier
"(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
// TLD may end with dot
"\\.?" +
")" +
// port number
"(?::\\d{2,5})?" +
// resource path
"(?:[/?#]\\S*)?" +
"$", "i"
);
re_weburl.test('http://www.Y6jp2mNKXucTNw8vkiu.75QF0dlTT9EgkqGt7Tr.eeZXNZLZqlHWJg0ewf9.p0H6nnxidoxp8Gkrpln.5Rl9eN5ZA0UYMSXfLeL.oRBQyrj3Pw8YM5pbcIO.ph7UoP01AsuKFiIYJSl');
I'm not sure what's causing it to break, but this should theoretically return false if I understand the regex properly. Nonetheless, when using this regex someone could completely freeze their own application by supplying a URL with groups broken up by periods and having enough groups. Increasing the size of the groups requires fewer groups to freeze the regex. It's a super edge case, but it can happen, and it "could" happen in the real world with bizarre sub-domains. If I add a
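For comparison, the snippet above is an earlier revision; its host-name part uses the ambiguous nesting discussed earlier in this thread. A quick check (an assumption: re_weburl here refers to the current version at the top of the gist) suggests the rewritten pattern rejects the same input without the exponential backtracking:

var nasty = "http://www.Y6jp2mNKXucTNw8vkiu.75QF0dlTT9EgkqGt7Tr." +
            "eeZXNZLZqlHWJg0ewf9.p0H6nnxidoxp8Gkrpln.5Rl9eN5ZA0UYMSXfLeL." +
            "oRBQyrj3Pw8YM5pbcIO.ph7UoP01AsuKFiIYJSl";
var start = Date.now();
console.log(re_weburl.test(nasty), (Date.now() - start) + "ms"); // false, a few ms at most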
How about mine: https://regex101.com/r/yvPwkL/8/ . The concept there is to keep it somewhat short, yet good enough, so it's not ideal (especially the IP-like stuff).
I adapted your code to my C# project. I'm grateful to you, Mr Perini. Thank you so much.
Underscores are allowed in host and domain names (RFC 2181).
SMTP is the only protocol that specifically forbids it (RFC 2821).
just packaged on npm https://www.npmjs.com/package/regex-weburl Grazie Diego! |
To linkify text, excluding some trailing punctuation:
var re_weburl = new RegExp(
// protocol identifier
"(?:(?:https?|ftp)://)" +
// user:pass authentication
"(?:\\S+(?::\\S*)?@)?" +
"(?:" +
// IP address exclusion
// private & local networks
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broadcast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// host name
"(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)" +
// domain name
"(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*" +
// TLD identifier
"(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
// sorry, ignore TLD ending with dot
// "\\.?" +
")" +
// port number
"(?::\\d{2,5})?" +
// resource path, excluding a trailing punctuation mark
"(?:[/?#](?:\\S*[^\\s!\"'()*,-.:;<>?\\[\\]_`{|}~]|))?"
, "gi"
);
var text = "http://google.com/ google http://www.bing.com/search, bing http://www.duckduckgo.com?search=a,b,c#foo.";
var replaced = text.replace(re_weburl, '<a href="$&">$&</a>'); |
Any idea why Edit: I note that @bazzargh has already raised this point. |
In PHP (for use with preg_match), this becomes:
Thanks for the regex Diego, I’ve added it to the test case and it seems to pass all the tests :) Nice job!