Skip to content

Instantly share code, notes, and snippets.

@originell
Created April 22, 2011 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save originell/936744 to your computer and use it in GitHub Desktop.
Save originell/936744 to your computer and use it in GitHub Desktop.
Trying to build a regex validating the given URIs
#!/usr/bin/env python
# encoding: utf-8
"""
Build URL Validation Regex.
See the corresponding *_urls lists for a definition of what we
want to match and what not.
Note that according to RFC 2616 (HTTP 1.1, Section 3.2.1) there is
no such thing as a maximum URI length, since servers should
be able to handle URIs of unbounded length.
However in practice (see http://www.boutell.com/newfaq/misc/urllength.html)
it seems that a good limit is 2,000 characters.
The DNS protocol has the following limits, which apply to
the domain name only, not to the complete URI (remember that GET parameters
and hashbangs are part of the URI):
* Maximum level of subdivisions (read: subdomains) is 127
* Each label may consist of up to 63 characters
* The full domain name may not exceed 253 characters
in its external dotted-label representation.
* DNS names adhere to a subset of the ASCII charset.
Umlauts and other linguistically special characters get resolved
by punycode. Therefore the LDH rule applies (letters, digits, hyphen),
meaning only a-z, A-Z, 0-9 and the hyphen itself are legal characters.
Should we ever need to use punycode, I recommend implementing this
via a webservice API since Python has a punycode implementation integrated.
Furthermore we do not need to recognize 100% specific locations.
So dealing with filenames or ports should not be necessary.
By assuring this, we can also avoid pointers to abusive images.
Attention:
This regular expression is built to be compatible with JavaScript 1.2 (ECMA-262).
To see the differences visit http://www.regular-expressions.info/javascript.html
and http://www.regular-expressions.info/python.html
"""
import re
# This is django's regex. Here for reference.
#regex = re.compile(
# r'^(?:http|ftp)s?://' # http:// or https://
# r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
# r'localhost|' #localhost...
# r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
# r'(?::\d+)?' # optional port
# r'(?:/?|[/?]\S+)$', re.IGNORECASE)
#
# Second try
# regex = re.compile(
# r'^('
# r'((http)s?://)?' # match http:// or https:// - OPTIONAL
# r'(www\.)?' # match www. - OPTIONAL
# r')?' # the http/https/www part is completely... OPTIONAL
# r'[A-Z0-9][-A-Z0-9]{0,61}[A-Z0-9]\.' # domainname not allowed to start
# # with a hyphen or end with it.
# # max length: 63 chars
# r'[A-Z]{2,6}/?' # tld (longest I found was .museum)
# r'[A-Z0-9][-A-Z0-9]{0,61}[A-Z0-9]\.'
# r'$'
# , re.IGNORECASE) #|re.DEBUG
# Third try. Trying to understand django's domain regex :b
# URL validation pattern (case-insensitive), anchored at both ends.
#
# Accepts: an optional http:// or https:// scheme, an optional "www.",
# a dotted host name, and an optional path, query string or fragment.
# Rejects: ports, dotless hosts, and any path whose last segment contains
# a dot (i.e. looks like a filename such as "index.php" or "foobar.jpg").
#
# The capture groups of the prefix are kept as they were:
#   1 = whole prefix, 2 = scheme incl. "://", 3 = "http", 4 = "www."
# No lookaround is used, so the pattern stays portable to JavaScript.
regex = re.compile(
    r'^('
    r'((http)s?://)?'  # match http:// or https:// - OPTIONAL
    r'(www\.)?'  # match www. - OPTIONAL
    r')?'  # the http/https/www part is completely... OPTIONAL
    # One or more dot-terminated labels (1-63 chars, never starting or
    # ending with a hyphen) followed by the final label.
    r'(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)'
    # Optional path: any number of "segment/" parts (dots allowed in
    # directories), then a last segment that must NOT contain a dot.
    r'(?:/(?:[^\s/?#]*/)*[^\s/?#.]*)?'
    r'(?:[?#]\S*)?'  # optional query string or fragment
    r'$',
    re.IGNORECASE)  # |re.DEBUG
# These are the URLs we want to match
# URLs the pattern must accept.
#
# The list is four groups of 26 entries, one group per path depth
# (no path, /path, /path/secondlevel, /path/secondlevel/thirdlevel).
# Within a group the order is:
#   1. the eight scheme-prefixed hosts with a trailing slash,
#   2. the same eight hosts without a trailing slash,
#   3. five further hosts, each as a (with slash, without slash) pair.
_hosts_listed_in_runs = (
    'http://www.example.com',
    'https://www.example.com',
    'http://subdomain.example.com',
    'https://subdomain.example.com',
    'http://example.com',
    'https://example.com',
    'http://www.exa-mple.com',
    'https://www.exa-mple.com',
)
_hosts_listed_in_pairs = (
    'www.example.com',
    'subdomain.example.com',
    'subdomain.sub2.example.com',
    'http://sub-domain.example.com',
    'sub-domain.example.com',
)
match_urls = []
for _path in ('', '/path', '/path/secondlevel', '/path/secondlevel/thirdlevel'):
    match_urls.extend(_run_host + _path + '/' for _run_host in _hosts_listed_in_runs)
    match_urls.extend(_run_host + _path for _run_host in _hosts_listed_in_runs)
    for _pair_host in _hosts_listed_in_pairs:
        match_urls.append(_pair_host + _path + '/')
        match_urls.append(_pair_host + _path)
# Drop the construction helpers so only match_urls remains at module level.
del _hosts_listed_in_runs, _hosts_listed_in_pairs, _path, _pair_host
# URLs the pattern must reject.
dont_match_urls = [
    # FTP scheme with user info
    'ftp://foobar@example.com/',
    'ftp://foobar@example.com',
    'ftp://foobar@example.com/path/',
    'ftp://foobar@example.com/path',
    # IP address as host
    'http://127.0.0.1/',
    'https://127.0.0.1/',
    'http://127.0.0.1',
    # Was a second copy of 'https://127.0.0.1/'; the slash-less variant
    # completes the with/without-slash pattern used everywhere else.
    'https://127.0.0.1',
    # Explicit port
    'http://example.com:80/',
    'https://example.com:80/',
    'http://example.com:80',
    'https://example.com:80',
    'http://example.com:80/path/',
    'https://example.com:80/path/',
    'http://example.com:80/path',
    'https://example.com:80/path',
    'www.example.com:80/',
    'www.example.com:80',
    'example.com:80/',
    'example.com:80',
    # Incomplete fragments
    'http://',
    'https://',
    'www.',
    'example',
    '.com',
    'com',
    '//',
    # Scheme-relative IP and dotless host
    '//192.168.1.1/',
    'http://localhost/',
    # Label starting or ending with a hyphen
    'http://-example.com/',
    # The comma after the next entry was missing, which silently glued it
    # to 'http://example.com/index.php' via implicit string concatenation.
    'http://example-.com/',
    # Paths ending in a filename
    'http://example.com/index.php',
    'http://example.com/path/index.php',
    'http://example.com/foobar.jpg',
    'http://example.com/path/foobar.jpg',
    # Non-numeric port
    'http://example.com:foobar/',
]
# Self-test: run the pattern over both URL lists and report every URL it
# classifies wrongly. Nothing is printed when all URLs behave as expected.
#
# print() is always called with a single pre-formatted string, so the output
# is the same under Python 2 (print statement) and Python 3 (print function).
false_matches = []
for match_url in match_urls:
    does_match = regex.match(match_url)
    if does_match is None:
        # Expected a match but got none.
        print("%s %s" % (match_url, does_match))
        false_matches.append(match_url)
for dont_match_url in dont_match_urls:
    does_not_match = regex.match(dont_match_url)
    if does_not_match is not None:
        # Expected no match but the pattern accepted the URL.
        print("%s %s" % (dont_match_url, does_not_match))
        false_matches.append(dont_match_url)
if false_matches:
    print("==========================================")
    print("%i/%i urls failed" % (len(false_matches), len(match_urls) + len(dont_match_urls)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment