Created
June 19, 2012 20:37
-
-
Save pmarreck/2956396 to your computer and use it in GitHub Desktop.
A URI detector/parser regex that is a better, but slower, auto_link regex for Rails (and a line that patches it in)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################
# A more comprehensive drop-in replacement for Rails' internal link matcher regex,
# which I invented from scratch... Oh yeah, Adderall is awesome.
# Unfortunately, it takes about 40% longer to run than the stock auto_link regex in our test suite :(
# but it accounts for pretty much EVERY linkable URI/URL, and is (in my opinion) far more
# readable and maintainable, due to its use of named groups/tokens.
# References used to build this:
# http://www.faqs.org/rfcs/rfc1738.html
# http://tools.ietf.org/html/rfc3986
# http://en.wikipedia.org/wiki/URI_scheme << primary ref, was awesome, great diagram
# http://en.wikipedia.org/wiki/Uniform_resource_identifier
# http://en.wikipedia.org/wiki/Domain_name
# http://www.icann.org/en/resources/registries/tlds
# https://wiki.mozilla.org/TLD_List
# http://en.wikipedia.org/wiki/CcSLD
# http://en.wikipedia.org/wiki/Fragment_identifier
############################################################################
# Notes on Regex Named-Group Elements Below
# -----------------------------------------
# (For some reason, even though this regex has the 'x' option, I could not comment in-line without throwing errors,
# so I've put the notes up here.)
# tld: Note that in 2013, hundreds of new TLDs may be introduced, which will force a revision here.
# name: names can contain letters, numbers, or hyphens but cannot start or end with a hyphen,
# hence the fancy look-ahead/look-behind syntax.
# hostname: port optional.
# username: This is not well defined; this is my stab at a reasonable character set after googling.
# scheme separator: optional slash. Let's make the 2nd slash optional to be nice too.
# fragment: the actual allowed fragment characters are slightly different, but I got lazy. See http://stackoverflow.com/questions/2849756/list-of-valid-characters-for-the-fragment-identifier-in-an-url
# scheme: protocol names and atom terms from http://en.wikipedia.org/wiki/URI_scheme
# URI: I allow a lack of protocol specifier, but only if the leftmost subdomain implies a protocol (example: www.domain.com or irc.freenode.net).
# path_segment_char: Note that Amazon has = in its URL paths (sigh) and some other sites use colons. I've added those characters.
# disallowed_encoded: Low-order ASCII should not be in a URL, even encoded.
# I have replaced most +'s with {1,limit} and *'s with {0,limit} to avoid runaway regex parsing of bad input.
# Detects URIs inside free text.
#
# The pattern is assembled from named subexpressions. Each one is declared as
# (?<name> ... ){0}, so it consumes nothing where it is written, and is invoked
# by name with \g<name>. The only expression actually applied is the final
# \g<URI>, which accepts either
#   * an explicit scheme followed by an authority (userinfo, host, port), or
#   * a host whose leftmost label implies a protocol (www, mail, ftp, irc, ...),
#     with the scheme itself optional,
# followed by an optional path, query string and fragment.
#
# Ordering rule for the <scheme> and <tld> lists: both live inside atomic
# groups (?> ... ) that are followed by \b. An atomic group never goes back to
# try another alternative once one has matched, so a short alternative listed
# before a longer one that starts with it hides the longer one completely
# (the short one matches, \b fails, and the match is abandoned). Keep the
# longer alternative first: imap before im, ldaps? instead of ldap ... ldaps,
# CAT before CA, gopher before go, telnet before tel.
#
# Flags: i = case-insensitive, x = free-spacing layout, u = UTF-8.
# Character classes are kept on a single line without spaces on purpose.
AUTHORITATIVE_URI_RE = /(
  (?<scheme>
    \b
    (?>
        https? | mailto | [st]?ftp | aaas? | about | a?cap | cid | crid | data | dav
      | dict | dns | fax | file | geo | gopher | go | h323 | iax | icap | imap | im
      | info | ipp | iris | ldaps? | msrps? | news | nfs | nntp | pop | rsync | rtsp
      | sips? | sms | snmp | tag | telnet | tel | tip | tv | urn | uuid | view\-source
      | wss? | xmpp | aim | apt | afp | bitcoin | bolo | callto | chrome | content
      | cvs | doi | facetime | feed | finger | fish | git | gg | gizmoproject | gtalk
      | irc[s6]? | itms | jar | javascript | lastfm | magnet | maps | market | message
      | mms | msnim | mumble | mvn | notes | palm | paparazzi | platform | proxy
      | psyc | query | rmi | rtmp | secondlife | sgn | skype | spotify | ssh | smb
      | soldat | steam | svn | teamspeak | things | udp | unreal | ventrilo | webcal
      | wtai | wyciwyg | xfire | xri | ymsgr
    )
    \b
    \:
  ){0}
  (?<scheme_separator>
    \/{0,3}
  ){0}
  (?<scheme_prefix>
    \g<scheme>
    \g<scheme_separator>
  ){0}
  (?<tld>
    \b
    (?>
        COM | ORG | EDU | GOV | UK | NET | CAT | CA | DE | JP | FR
      | AERO | ARPA | ASIA | A[UCDEFGILMNOQRSTWXZ]
      | US | RU | CH | IT | NL | SE | NO | ES | MIL
      | BIZ | B[ABDEFGHIJMNORSTVWYZ]
      | COOP | C[CDFGIKLMNORUVWXYZ]
      | D[JKMOZ] | E[CEGRTU] | F[IJKMO] | G[ABDEFGHILMNPQRSTUWY] | H[KMNRTU]
      | INFO | INT | I[DELMNOQRS]
      | JOBS | J[EMO]
      | K[EGHIMNPRWYZ] | L[ABCIKRSTUVY]
      | MOBI | MUSEUM | M[ACDEGHKLMNOPQRSTUVWXYZ]
      | NAME | N[ACEFGIPRUZ]
      | OM
      | PRO | P[AEFGHKLMNRSTWY]
      | QA | R[EOSW] | S[ABCDGHIJKLMNORTUVXYZ]
      | TRAVEL | TEL | TLD | T[CDFGHJKLMNOPRTVWZ]
      | U[AGYZ]
      | VET | V[ACEGINU]
      | WIKI | W[FS]
      | XN\-\-
        (?>
            0ZWM56D | 11B5BS3A9AJ6G | 3E0B707E | 45BRJ9C | 80AKHBYKNJ4F | 80AO21A
          | 90A3AC | 9T4B11YI5A | CLCHC0EA0B2G2A9GCD | DEBA0AD | FIQS8S | FIQZ9S
          | FPCRJ9C3D | FZC2C9E2C | G6W251D | GECRJ9C | H2BRJ9C | HGBK6AJ7F53BBA
          | HLCJ6AYA9ESC7A | J6W193G | JXALPDLP | KGBECHTV | KPRW13D | KPRY57D
          | LGBBAT1AD8J | MGBAAM7A8H | MGBAYH7GPA | MGBBH1A71E | MGBC0A9AZCG
          | MGBERP4A5D4AR | O3CW4H | OGBPF8FL | P1AI | PGBS0DH | S9BRJ9C | WGBH1C
          | WGBL6A | XKC2AL3HYE2A | XKC2DL3A5EE0H | YFRO4I67O | YGBI2AMMX | ZCKZAH
        )
      | XXX | Y[ET] | Z[AMW]
    )
    \b
  ){0}
  (?<ccsld>
    \g<tld>(?= [.]\g<tld>)
  ){0}
  (?<tlds>
    (?: [.]\g<ccsld>)? [.]\g<tld>
  ){0}
  (?<allowed_name>
    \b(?<!\-)[a-z0-9\-]{1,40}(?!\-)\b
  ){0}
  (?<subdomain_with_implicit_scheme>
    (?> w{2,3}\d{0,3} | mail | proxy | s[fm]tp | pop | ftp | irc | images | news | video )
    (?! \g<tlds> )
  ){0}
  (?<subdomain>
    (?! \g<subdomains_with_implicit_scheme> )
    \g<allowed_name>
    (?! \g<tlds> )
  ){0}
  (?<subdomains_with_implicit_scheme>
    \g<subdomain_with_implicit_scheme>(?: [.]\g<subdomain>){0,3}[.]
  ){0}
  (?<subdomains>
    \g<subdomain>(?: [.]\g<subdomain>){0,3}[.]
  ){0}
  (?<domain>
    \g<allowed_name>
    (?= \g<tlds> )
  ){0}
  (?<port>
    \d{1,5}
  ){0}
  (?<hostname>
    \g<subdomains>? \g<domain> \g<tlds>
  ){0}
  (?<hostname_with_implicit_scheme>
    \g<subdomains_with_implicit_scheme> \g<domain> \g<tlds>
  ){0}
  (?<host>
    \g<hostname>
    (?: \:\g<port>)?
  ){0}
  (?<host_with_implicit_scheme>
    \g<hostname_with_implicit_scheme>
    (?: \:\g<port>)?
  ){0}
  (?<username>
    [\.\-\w]{2,40}
  ){0}
  (?<password>
    [a-z0-9\,\.\<\>\/\;\:\'\"\\\[\]\{\}\|\`\~\!\?\@\#\$\%\^\&\*\(\)\-\=\_\+]{1,50}
  ){0}
  (?<userinfo>
    (?> \g<username>(?: \: \g<password> )? \@ )
  ){0}
  (?<authority>
    \g<userinfo>? \g<host>
  ){0}
  (?<authority_with_implicit_scheme>
    \g<userinfo>? \g<host_with_implicit_scheme>
  ){0}
  (?<hex>
    [0-9a-f]
  ){0}
  (?<disallowed_encoded>
    \%[01][0-9A-F]
  ){0}
  (?<hex_encoded>
    (?! \g<disallowed_encoded> )
    \%\g<hex>{2}
  ){0}
  (?<html_entity>
    \& (?> \#[0-9]{1,4} | \#x\g<hex>{1,4} | [a-z]{2,8} )\;
  ){0}
  (?<path_segment_char>
    (?> [a-z0-9\-\_\$\.\+\*\'\(\)\,\=\:\;\~\@] | \#(?=\w) | \g<hex_encoded> | \g<html_entity> )
  ){0}
  (?<path_segment>
    \g<path_segment_char>{1,200}
  ){0}
  (?<path>
    (?: \/ \g<path_segment>? ){1,10} \#?
  ){0}
  (?<fragment>
    \# \g<path_segment>
  ){0}
  (?<querystring_name>
    \g<path_segment> (?: \[\g<path_segment>\] )?
  ){0}
  (?<querystring_value>
    (?> \g<URI> | \g<path_segment> )
  ){0}
  (?<name_value_pair>
    \g<querystring_name>=\g<querystring_value>
  ){0}
  (?<name_value_pairs>
    \g<name_value_pair> (?: \& \g<name_value_pair>){0,20}
  ){0}
  (?<query>
    [\?\!\&] \g<name_value_pairs>
  ){0}
  (?<locator>
    \g<path>? \g<query>? \g<fragment>?
  ){0}
  (?<URI>
    (?>
      \g<scheme_prefix> \g<authority>
      |
      \g<scheme_prefix>? \g<authority_with_implicit_scheme>
    )
    \g<locator>
  ){0}
  \g<URI>
)/uix
# Patch AUTHORITATIVE_URI_RE in as the matcher stored in
# ActionView::Helpers::TextHelper::AUTO_LINK_RE. Any existing constant is
# removed first so the replacement does not emit Ruby's
# "already initialized constant" warning; if the constant is not defined
# directly on TextHelper it is simply created.
ActionView::Helpers::TextHelper.module_eval do
  remove_const(:AUTO_LINK_RE) if const_defined?(:AUTO_LINK_RE, false)
  const_set(:AUTO_LINK_RE, AUTHORITATIVE_URI_RE)
end
# I did not include the test suite here because our tests for auto_link are rather extensive, but I assure you this passes all of them.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment