Skip to content

Instantly share code, notes, and snippets.

@sayrer
Created July 16, 2011 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sayrer/1086653 to your computer and use it in GitHub Desktop.
Save sayrer/1086653 to your computer and use it in GitHub Desktop.
don't create regexes during pageload
diff --git a/twitter-text.js b/twitter-text.js
index 0741aa2..adc2388 100644
--- a/twitter-text.js
+++ b/twitter-text.js
@@ -5,6 +5,7 @@ if (!window.twttr) {
(function() {
twttr.txt = {};
twttr.txt.regexen = {};
+ twttr.txt.initialized = false;
var HTML_ENTITIES = {
'&': '&',
@@ -65,201 +66,205 @@ if (!window.twttr) {
// to access both the list of characters and a pattern suitible for use with String#split
// Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
var fromCode = String.fromCharCode;
- var UNICODE_SPACES = [
- fromCode(0x0020), // White_Space # Zs SPACE
- fromCode(0x0085), // White_Space # Cc <control-0085>
- fromCode(0x00A0), // White_Space # Zs NO-BREAK SPACE
- fromCode(0x1680), // White_Space # Zs OGHAM SPACE MARK
- fromCode(0x180E), // White_Space # Zs MONGOLIAN VOWEL SEPARATOR
- fromCode(0x2028), // White_Space # Zl LINE SEPARATOR
- fromCode(0x2029), // White_Space # Zp PARAGRAPH SEPARATOR
- fromCode(0x202F), // White_Space # Zs NARROW NO-BREAK SPACE
- fromCode(0x205F), // White_Space # Zs MEDIUM MATHEMATICAL SPACE
- fromCode(0x3000) // White_Space # Zs IDEOGRAPHIC SPACE
- ];
- addCharsToCharClass(UNICODE_SPACES, 0x009, 0x00D); // White_Space # Cc [5] <control-0009>..<control-000D>
- addCharsToCharClass(UNICODE_SPACES, 0x2000, 0x200A); // White_Space # Zs [11] EN QUAD..HAIR SPACE
-
- twttr.txt.regexen.spaces_group = regexSupplant(UNICODE_SPACES.join(""));
- twttr.txt.regexen.spaces = regexSupplant("[" + UNICODE_SPACES.join("") + "]");
- twttr.txt.regexen.punct = /\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~/;
- twttr.txt.regexen.atSigns = /[@@]/;
- twttr.txt.regexen.extractMentions = regexSupplant(/(^|[^a-zA-Z0-9_])(#{atSigns})([a-zA-Z0-9_]{1,20})(?=(.|$))/g);
- twttr.txt.regexen.extractReply = regexSupplant(/^(?:#{spaces})*#{atSigns}([a-zA-Z0-9_]{1,20})/);
- twttr.txt.regexen.listName = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/;
-
- var nonLatinHashtagChars = [];
- // Cyrillic
- addCharsToCharClass(nonLatinHashtagChars, 0x0400, 0x04ff); // Cyrillic
- addCharsToCharClass(nonLatinHashtagChars, 0x0500, 0x0527); // Cyrillic Supplement
- // Hangul (Korean)
- addCharsToCharClass(nonLatinHashtagChars, 0x1100, 0x11ff); // Hangul Jamo
- addCharsToCharClass(nonLatinHashtagChars, 0x3130, 0x3185); // Hangul Compatibility Jamo
- addCharsToCharClass(nonLatinHashtagChars, 0xA960, 0xA97F); // Hangul Jamo Extended-A
- addCharsToCharClass(nonLatinHashtagChars, 0xAC00, 0xD7AF); // Hangul Syllables
- addCharsToCharClass(nonLatinHashtagChars, 0xD7B0, 0xD7FF); // Hangul Jamo Extended-B
- // Japanese and Chinese
- addCharsToCharClass(nonLatinHashtagChars, 0x30A1, 0x30FA); // Katakana (full-width)
- addCharsToCharClass(nonLatinHashtagChars, 0x30FC, 0x30FC); // Katakana Chouon (full-width)
- addCharsToCharClass(nonLatinHashtagChars, 0xFF66, 0xFF9F); // Katakana (half-width)
- addCharsToCharClass(nonLatinHashtagChars, 0xFF70, 0xFF70); // Katakana Chouon (half-width)
- addCharsToCharClass(nonLatinHashtagChars, 0xFF10, 0xFF19); // \
- addCharsToCharClass(nonLatinHashtagChars, 0xFF21, 0xFF3A); // - Latin (full-width)
- addCharsToCharClass(nonLatinHashtagChars, 0xFF41, 0xFF5A); // /
- addCharsToCharClass(nonLatinHashtagChars, 0x3041, 0x3096); // Hiragana
- addCharsToCharClass(nonLatinHashtagChars, 0x3400, 0x4DBF); // Kanji (CJK Extension A)
- addCharsToCharClass(nonLatinHashtagChars, 0x4E00, 0x9FFF); // Kanji (Unified)
- // -- Disabled as it breaks the Regex.
- //addCharsToCharClass(nonLatinHashtagChars, 0x20000, 0x2A6DF); // Kanji (CJK Extension B)
- addCharsToCharClass(nonLatinHashtagChars, 0x2A700, 0x2B73F); // Kanji (CJK Extension C)
- addCharsToCharClass(nonLatinHashtagChars, 0x2B740, 0x2B81F); // Kanji (CJK Extension D)
- addCharsToCharClass(nonLatinHashtagChars, 0x2F800, 0x2FA1F); // Kanji (CJK supplement)
- addCharsToCharClass(nonLatinHashtagChars, 0x3005, 0x3005); // Kanji (CJK iteration mark)
-
- twttr.txt.regexen.nonLatinHashtagChars = regexSupplant(nonLatinHashtagChars.join(""));
- // Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
- twttr.txt.regexen.latinAccentChars = regexSupplant("ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ\\303\\277");
- twttr.txt.regexen.latenAccents = regexSupplant(/[#{latinAccentChars}]+/);
-
- twttr.txt.regexen.endScreenNameMatch = regexSupplant(/^(?:#{atSigns}|[#{latinAccentChars}]|:\/\/)/);
-
- // A hashtag must contain characters, numbers and underscores, but not all numbers.
- twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|#{spaces}|「|」|。|、|\.|!|!|\?|?|,)/);
- twttr.txt.regexen.hashtagAlpha = regexSupplant(/[a-z_#{latinAccentChars}#{nonLatinHashtagChars}]/i);
- twttr.txt.regexen.hashtagAlphaNumeric = regexSupplant(/[a-z0-9_#{latinAccentChars}#{nonLatinHashtagChars}]/i);
- twttr.txt.regexen.autoLinkHashtags = regexSupplant(/(#{hashtagBoundary})(#|#)(#{hashtagAlphaNumeric}*#{hashtagAlpha}#{hashtagAlphaNumeric}*)/gi);
- twttr.txt.regexen.autoLinkUsernamesOrLists = /(^|[^a-zA-Z0-9_]|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/g;
- twttr.txt.regexen.autoLinkEmoticon = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/g;
-
- // URL related hash regex collection
- twttr.txt.regexen.invalidDomainChars = stringSupplant("\u00A0#{punct}#{spaces_group}", twttr.txt.regexen);
- twttr.txt.regexen.validPrecedingChars = regexSupplant(/(?:[^-\/"':!=A-Za-z0-9_@@]|^|\:)/);
-
- twttr.txt.regexen.validSubdomain = regexSupplant(/(?:[^#{invalidDomainChars}](?:[_-]|[^#{invalidDomainChars}])*)?[^#{invalidDomainChars}]\./);
- twttr.txt.regexen.validDomainName = regexSupplant(/(?:[^#{invalidDomainChars}](?:[-]|[^#{invalidDomainChars}])*)?[^#{invalidDomainChars}]/);
- twttr.txt.regexen.validDomain = regexSupplant(/(#{validSubdomain})*#{validDomainName}\.(?:xn--[a-z0-9]{2,}|[a-z]{2,})(?::[0-9]+)?/i);
-
- twttr.txt.regexen.validGeneralUrlPathChars = /[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~|\.]/i;
- // Allow URL paths to contain balanced parens
- // 1. Used in Wikipedia URLs like /Primer_(film)
- // 2. Used in IIS sessions like /S(dfd346)/
- twttr.txt.regexen.wikipediaDisambiguation = regexSupplant(/(?:\(#{validGeneralUrlPathChars}+\))/i);
- // Allow @ in a url, but only in the middle. Catch things like http://example.com/@user
- twttr.txt.regexen.validUrlPathChars = regexSupplant(/(?:#{wikipediaDisambiguation}|@#{validGeneralUrlPathChars}+\/|[\.,]?#{validGeneralUrlPathChars})/i);
-
- // Valid end-of-path chracters (so /foo. does not gobble the period).
- // 1. Allow =&# for empty URL parameters and other URL-join artifacts
- twttr.txt.regexen.validUrlPathEndingChars = regexSupplant(/(?:[\+\-a-z0-9=_#\/]|#{wikipediaDisambiguation})/i);
- twttr.txt.regexen.validUrlQueryChars = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i;
- twttr.txt.regexen.validUrlQueryEndingChars = /[a-z0-9_&=#\/]/i;
- twttr.txt.regexen.extractUrl = regexSupplant(
- '(' + // $1 total match
- '(#{validPrecedingChars})' + // $2 Preceeding chracter
- '(' + // $3 URL
- '(https?:\\/\\/)' + // $4 Protocol
- '(#{validDomain})' + // $5 Domain(s) and optional post number
- '(\\/' + // $6 URL Path
- '(?:' +
- '#{validUrlPathChars}+#{validUrlPathEndingChars}|' +
- '#{validUrlPathChars}+#{validUrlPathEndingChars}?|' +
- '#{validUrlPathEndingChars}' +
- ')?' +
- ')?' +
- '(\\?#{validUrlQueryChars}*#{validUrlQueryEndingChars})?' + // $7 Query String
- ')' +
- ')'
- , "gi");
-
-
- // These URL validation pattern strings are based on the ABNF from RFC 3986
- twttr.txt.regexen.validateUrlUnreserved = /[a-z0-9\-._~]/i;
- twttr.txt.regexen.validateUrlPctEncoded = /(?:%[0-9a-f]{2})/i;
- twttr.txt.regexen.validateUrlSubDelims = /[!$&'()*+,;=]/i;
- twttr.txt.regexen.validateUrlPchar = regexSupplant('(?:' +
- '#{validateUrlUnreserved}|' +
- '#{validateUrlPctEncoded}|' +
- '#{validateUrlSubDelims}|' +
- ':|@' +
- ')', 'i');
-
- twttr.txt.regexen.validateUrlScheme = /(?:[a-z][a-z0-9+\-.]*)/i;
- twttr.txt.regexen.validateUrlUserinfo = regexSupplant('(?:' +
- '#{validateUrlUnreserved}|' +
- '#{validateUrlPctEncoded}|' +
- '#{validateUrlSubDelims}|' +
- ':' +
- ')*', 'i');
-
- twttr.txt.regexen.validateUrlDecOctet = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i;
- twttr.txt.regexen.validateUrlIpv4 = regexSupplant(/(?:#{validateUrlDecOctet}(?:\.#{validateUrlDecOctet}){3})/i);
-
- // Punting on real IPv6 validation for now
- twttr.txt.regexen.validateUrlIpv6 = /(?:\[[a-f0-9:\.]+\])/i;
-
- // Also punting on IPvFuture for now
- twttr.txt.regexen.validateUrlIp = regexSupplant('(?:' +
- '#{validateUrlIpv4}|' +
- '#{validateUrlIpv6}' +
- ')', 'i');
-
- // This is more strict than the rfc specifies
- twttr.txt.regexen.validateUrlSubDomainSegment = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i;
- twttr.txt.regexen.validateUrlDomainSegment = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i;
- twttr.txt.regexen.validateUrlDomainTld = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i;
- twttr.txt.regexen.validateUrlDomain = regexSupplant(/(?:(?:#{validateUrlSubDomainSegment]}\.)*(?:#{validateUrlDomainSegment]}\.)#{validateUrlDomainTld})/i);
-
- twttr.txt.regexen.validateUrlHost = regexSupplant('(?:' +
- '#{validateUrlIp}|' +
- '#{validateUrlDomain}' +
- ')', 'i');
-
- // Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
- twttr.txt.regexen.validateUrlUnicodeSubDomainSegment = /(?:(?:[a-z0-9]|[^\u0000-\u007f])(?:(?:[a-z0-9_\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i;
- twttr.txt.regexen.validateUrlUnicodeDomainSegment = /(?:(?:[a-z0-9]|[^\u0000-\u007f])(?:(?:[a-z0-9\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i;
- twttr.txt.regexen.validateUrlUnicodeDomainTld = /(?:(?:[a-z]|[^\u0000-\u007f])(?:(?:[a-z0-9\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i;
- twttr.txt.regexen.validateUrlUnicodeDomain = regexSupplant(/(?:(?:#{validateUrlUnicodeSubDomainSegment}\.)*(?:#{validateUrlUnicodeDomainSegment}\.)#{validateUrlUnicodeDomainTld})/i);
-
- twttr.txt.regexen.validateUrlUnicodeHost = regexSupplant('(?:' +
- '#{validateUrlIp}|' +
- '#{validateUrlUnicodeDomain}' +
- ')', 'i');
-
- twttr.txt.regexen.validateUrlPort = /[0-9]{1,5}/;
-
- twttr.txt.regexen.validateUrlUnicodeAuthority = regexSupplant(
- '(?:(#{validateUrlUserinfo})@)?' + // $1 userinfo
- '(#{validateUrlUnicodeHost})' + // $2 host
- '(?::(#{validateUrlPort}))?' //$3 port
- , "i");
-
- twttr.txt.regexen.validateUrlAuthority = regexSupplant(
- '(?:(#{validateUrlUserinfo})@)?' + // $1 userinfo
- '(#{validateUrlHost})' + // $2 host
- '(?::(#{validateUrlPort}))?' // $3 port
- , "i");
-
- twttr.txt.regexen.validateUrlPath = regexSupplant(/(\/#{validateUrlPchar}*)*/i);
- twttr.txt.regexen.validateUrlQuery = regexSupplant(/(#{validateUrlPchar}|\/|\?)*/i);
- twttr.txt.regexen.validateUrlFragment = regexSupplant(/(#{validateUrlPchar}|\/|\?)*/i);
-
- // Modified version of RFC 3986 Appendix B
- twttr.txt.regexen.validateUrlUnencoded = regexSupplant(
- '^' + // Full URL
- '(?:' +
- '([^:/?#]+):' + // $1 Scheme
- ')' +
- '(?://' +
- '([^/?#]*)' + // $2 Authority
- ')' +
- '([^?#]*)' + // $3 Path
- '(?:' +
- '\\?([^#]*)' + // $4 Query
- ')?' +
- '(?:' +
- '#(.*)' + // $5 Fragment
- ')?$'
- , "i");
-
+
+ function init() {
+ var UNICODE_SPACES = [
+ fromCode(0x0020), // White_Space # Zs SPACE
+ fromCode(0x0085), // White_Space # Cc <control-0085>
+ fromCode(0x00A0), // White_Space # Zs NO-BREAK SPACE
+ fromCode(0x1680), // White_Space # Zs OGHAM SPACE MARK
+ fromCode(0x180E), // White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+ fromCode(0x2028), // White_Space # Zl LINE SEPARATOR
+ fromCode(0x2029), // White_Space # Zp PARAGRAPH SEPARATOR
+ fromCode(0x202F), // White_Space # Zs NARROW NO-BREAK SPACE
+ fromCode(0x205F), // White_Space # Zs MEDIUM MATHEMATICAL SPACE
+ fromCode(0x3000) // White_Space # Zs IDEOGRAPHIC SPACE
+ ];
+ addCharsToCharClass(UNICODE_SPACES, 0x009, 0x00D); // White_Space # Cc [5] <control-0009>..<control-000D>
+ addCharsToCharClass(UNICODE_SPACES, 0x2000, 0x200A); // White_Space # Zs [11] EN QUAD..HAIR SPACE
+
+ twttr.txt.regexen.spaces_group = regexSupplant(UNICODE_SPACES.join(""));
+ twttr.txt.regexen.spaces = regexSupplant("[" + UNICODE_SPACES.join("") + "]");
+ twttr.txt.regexen.punct = /\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~/;
+ twttr.txt.regexen.atSigns = /[@@]/;
+ twttr.txt.regexen.extractMentions = regexSupplant(/(^|[^a-zA-Z0-9_])(#{atSigns})([a-zA-Z0-9_]{1,20})(?=(.|$))/g);
+ twttr.txt.regexen.extractReply = regexSupplant(/^(?:#{spaces})*#{atSigns}([a-zA-Z0-9_]{1,20})/);
+ twttr.txt.regexen.listName = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/;
+
+ var nonLatinHashtagChars = [];
+ // Cyrillic
+ addCharsToCharClass(nonLatinHashtagChars, 0x0400, 0x04ff); // Cyrillic
+ addCharsToCharClass(nonLatinHashtagChars, 0x0500, 0x0527); // Cyrillic Supplement
+ // Hangul (Korean)
+ addCharsToCharClass(nonLatinHashtagChars, 0x1100, 0x11ff); // Hangul Jamo
+ addCharsToCharClass(nonLatinHashtagChars, 0x3130, 0x3185); // Hangul Compatibility Jamo
+ addCharsToCharClass(nonLatinHashtagChars, 0xA960, 0xA97F); // Hangul Jamo Extended-A
+ addCharsToCharClass(nonLatinHashtagChars, 0xAC00, 0xD7AF); // Hangul Syllables
+ addCharsToCharClass(nonLatinHashtagChars, 0xD7B0, 0xD7FF); // Hangul Jamo Extended-B
+ // Japanese and Chinese
+ addCharsToCharClass(nonLatinHashtagChars, 0x30A1, 0x30FA); // Katakana (full-width)
+ addCharsToCharClass(nonLatinHashtagChars, 0x30FC, 0x30FC); // Katakana Chouon (full-width)
+ addCharsToCharClass(nonLatinHashtagChars, 0xFF66, 0xFF9F); // Katakana (half-width)
+ addCharsToCharClass(nonLatinHashtagChars, 0xFF70, 0xFF70); // Katakana Chouon (half-width)
+ addCharsToCharClass(nonLatinHashtagChars, 0xFF10, 0xFF19); // \
+ addCharsToCharClass(nonLatinHashtagChars, 0xFF21, 0xFF3A); // - Latin (full-width)
+ addCharsToCharClass(nonLatinHashtagChars, 0xFF41, 0xFF5A); // /
+ addCharsToCharClass(nonLatinHashtagChars, 0x3041, 0x3096); // Hiragana
+ addCharsToCharClass(nonLatinHashtagChars, 0x3400, 0x4DBF); // Kanji (CJK Extension A)
+ addCharsToCharClass(nonLatinHashtagChars, 0x4E00, 0x9FFF); // Kanji (Unified)
+ // -- Disabled as it breaks the Regex.
+ //addCharsToCharClass(nonLatinHashtagChars, 0x20000, 0x2A6DF); // Kanji (CJK Extension B)
+ addCharsToCharClass(nonLatinHashtagChars, 0x2A700, 0x2B73F); // Kanji (CJK Extension C)
+ addCharsToCharClass(nonLatinHashtagChars, 0x2B740, 0x2B81F); // Kanji (CJK Extension D)
+ addCharsToCharClass(nonLatinHashtagChars, 0x2F800, 0x2FA1F); // Kanji (CJK supplement)
+ addCharsToCharClass(nonLatinHashtagChars, 0x3005, 0x3005); // Kanji (CJK iteration mark)
+
+ twttr.txt.regexen.nonLatinHashtagChars = regexSupplant(nonLatinHashtagChars.join(""));
+ // Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
+ twttr.txt.regexen.latinAccentChars = regexSupplant("ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ\\303\\277");
+ twttr.txt.regexen.latenAccents = regexSupplant(/[#{latinAccentChars}]+/);
+
+ twttr.txt.regexen.endScreenNameMatch = regexSupplant(/^(?:#{atSigns}|[#{latinAccentChars}]|:\/\/)/);
+
+ // A hashtag must contain characters, numbers and underscores, but not all numbers.
+ twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|#{spaces}|「|」|。|、|\.|!|!|\?|?|,)/);
+ twttr.txt.regexen.hashtagAlpha = regexSupplant(/[a-z_#{latinAccentChars}#{nonLatinHashtagChars}]/i);
+ twttr.txt.regexen.hashtagAlphaNumeric = regexSupplant(/[a-z0-9_#{latinAccentChars}#{nonLatinHashtagChars}]/i);
+ twttr.txt.regexen.autoLinkHashtags = regexSupplant(/(#{hashtagBoundary})(#|#)(#{hashtagAlphaNumeric}*#{hashtagAlpha}#{hashtagAlphaNumeric}*)/gi);
+ twttr.txt.regexen.autoLinkUsernamesOrLists = /(^|[^a-zA-Z0-9_]|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/g;
+ twttr.txt.regexen.autoLinkEmoticon = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/g;
+
+ // URL related hash regex collection
+ twttr.txt.regexen.invalidDomainChars = stringSupplant("\u00A0#{punct}#{spaces_group}", twttr.txt.regexen);
+ twttr.txt.regexen.validPrecedingChars = regexSupplant(/(?:[^-\/"':!=A-Za-z0-9_@@]|^|\:)/);
+
+ twttr.txt.regexen.validSubdomain = regexSupplant(/(?:[^#{invalidDomainChars}](?:[_-]|[^#{invalidDomainChars}])*)?[^#{invalidDomainChars}]\./);
+ twttr.txt.regexen.validDomainName = regexSupplant(/(?:[^#{invalidDomainChars}](?:[-]|[^#{invalidDomainChars}])*)?[^#{invalidDomainChars}]/);
+ twttr.txt.regexen.validDomain = regexSupplant(/(#{validSubdomain})*#{validDomainName}\.(?:xn--[a-z0-9]{2,}|[a-z]{2,})(?::[0-9]+)?/i);
+
+ twttr.txt.regexen.validGeneralUrlPathChars = /[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~|\.]/i;
+ // Allow URL paths to contain balanced parens
+ // 1. Used in Wikipedia URLs like /Primer_(film)
+ // 2. Used in IIS sessions like /S(dfd346)/
+ twttr.txt.regexen.wikipediaDisambiguation = regexSupplant(/(?:\(#{validGeneralUrlPathChars}+\))/i);
+ // Allow @ in a url, but only in the middle. Catch things like http://example.com/@user
+ twttr.txt.regexen.validUrlPathChars = regexSupplant(/(?:#{wikipediaDisambiguation}|@#{validGeneralUrlPathChars}+\/|[\.,]?#{validGeneralUrlPathChars})/i);
+
+ // Valid end-of-path chracters (so /foo. does not gobble the period).
+ // 1. Allow =&# for empty URL parameters and other URL-join artifacts
+ twttr.txt.regexen.validUrlPathEndingChars = regexSupplant(/(?:[\+\-a-z0-9=_#\/]|#{wikipediaDisambiguation})/i);
+ twttr.txt.regexen.validUrlQueryChars = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i;
+ twttr.txt.regexen.validUrlQueryEndingChars = /[a-z0-9_&=#\/]/i;
+ twttr.txt.regexen.extractUrl = regexSupplant(
+ '(' + // $1 total match
+ '(#{validPrecedingChars})' + // $2 Preceeding chracter
+ '(' + // $3 URL
+ '(https?:\\/\\/)' + // $4 Protocol
+ '(#{validDomain})' + // $5 Domain(s) and optional post number
+ '(\\/' + // $6 URL Path
+ '(?:' +
+ '#{validUrlPathChars}+#{validUrlPathEndingChars}|' +
+ '#{validUrlPathChars}+#{validUrlPathEndingChars}?|' +
+ '#{validUrlPathEndingChars}' +
+ ')?' +
+ ')?' +
+ '(\\?#{validUrlQueryChars}*#{validUrlQueryEndingChars})?' + // $7 Query String
+ ')' +
+ ')'
+ , "gi");
+
+
+ // These URL validation pattern strings are based on the ABNF from RFC 3986
+ twttr.txt.regexen.validateUrlUnreserved = /[a-z0-9\-._~]/i;
+ twttr.txt.regexen.validateUrlPctEncoded = /(?:%[0-9a-f]{2})/i;
+ twttr.txt.regexen.validateUrlSubDelims = /[!$&'()*+,;=]/i;
+ twttr.txt.regexen.validateUrlPchar = regexSupplant('(?:' +
+ '#{validateUrlUnreserved}|' +
+ '#{validateUrlPctEncoded}|' +
+ '#{validateUrlSubDelims}|' +
+ ':|@' +
+ ')', 'i');
+
+ twttr.txt.regexen.validateUrlScheme = /(?:[a-z][a-z0-9+\-.]*)/i;
+ twttr.txt.regexen.validateUrlUserinfo = regexSupplant('(?:' +
+ '#{validateUrlUnreserved}|' +
+ '#{validateUrlPctEncoded}|' +
+ '#{validateUrlSubDelims}|' +
+ ':' +
+ ')*', 'i');
+
+ twttr.txt.regexen.validateUrlDecOctet = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i;
+ twttr.txt.regexen.validateUrlIpv4 = regexSupplant(/(?:#{validateUrlDecOctet}(?:\.#{validateUrlDecOctet}){3})/i);
+
+ // Punting on real IPv6 validation for now
+ twttr.txt.regexen.validateUrlIpv6 = /(?:\[[a-f0-9:\.]+\])/i;
+
+ // Also punting on IPvFuture for now
+ twttr.txt.regexen.validateUrlIp = regexSupplant('(?:' +
+ '#{validateUrlIpv4}|' +
+ '#{validateUrlIpv6}' +
+ ')', 'i');
+
+ // This is more strict than the rfc specifies
+ twttr.txt.regexen.validateUrlSubDomainSegment = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i;
+ twttr.txt.regexen.validateUrlDomainSegment = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i;
+ twttr.txt.regexen.validateUrlDomainTld = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i;
+ twttr.txt.regexen.validateUrlDomain = regexSupplant(/(?:(?:#{validateUrlSubDomainSegment]}\.)*(?:#{validateUrlDomainSegment]}\.)#{validateUrlDomainTld})/i);
+
+ twttr.txt.regexen.validateUrlHost = regexSupplant('(?:' +
+ '#{validateUrlIp}|' +
+ '#{validateUrlDomain}' +
+ ')', 'i');
+
+ // Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+ twttr.txt.regexen.validateUrlUnicodeSubDomainSegment = /(?:(?:[a-z0-9]|[^\u0000-\u007f])(?:(?:[a-z0-9_\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i;
+ twttr.txt.regexen.validateUrlUnicodeDomainSegment = /(?:(?:[a-z0-9]|[^\u0000-\u007f])(?:(?:[a-z0-9\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i;
+ twttr.txt.regexen.validateUrlUnicodeDomainTld = /(?:(?:[a-z]|[^\u0000-\u007f])(?:(?:[a-z0-9\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i;
+ twttr.txt.regexen.validateUrlUnicodeDomain = regexSupplant(/(?:(?:#{validateUrlUnicodeSubDomainSegment}\.)*(?:#{validateUrlUnicodeDomainSegment}\.)#{validateUrlUnicodeDomainTld})/i);
+
+ twttr.txt.regexen.validateUrlUnicodeHost = regexSupplant('(?:' +
+ '#{validateUrlIp}|' +
+ '#{validateUrlUnicodeDomain}' +
+ ')', 'i');
+
+ twttr.txt.regexen.validateUrlPort = /[0-9]{1,5}/;
+
+ twttr.txt.regexen.validateUrlUnicodeAuthority = regexSupplant(
+ '(?:(#{validateUrlUserinfo})@)?' + // $1 userinfo
+ '(#{validateUrlUnicodeHost})' + // $2 host
+ '(?::(#{validateUrlPort}))?' //$3 port
+ , "i");
+
+ twttr.txt.regexen.validateUrlAuthority = regexSupplant(
+ '(?:(#{validateUrlUserinfo})@)?' + // $1 userinfo
+ '(#{validateUrlHost})' + // $2 host
+ '(?::(#{validateUrlPort}))?' // $3 port
+ , "i");
+
+ twttr.txt.regexen.validateUrlPath = regexSupplant(/(\/#{validateUrlPchar}*)*/i);
+ twttr.txt.regexen.validateUrlQuery = regexSupplant(/(#{validateUrlPchar}|\/|\?)*/i);
+ twttr.txt.regexen.validateUrlFragment = regexSupplant(/(#{validateUrlPchar}|\/|\?)*/i);
+
+ // Modified version of RFC 3986 Appendix B
+ twttr.txt.regexen.validateUrlUnencoded = regexSupplant(
+ '^' + // Full URL
+ '(?:' +
+ '([^:/?#]+):' + // $1 Scheme
+ ')' +
+ '(?://' +
+ '([^/?#]*)' + // $2 Authority
+ ')' +
+ '([^?#]*)' + // $3 Path
+ '(?:' +
+ '\\?([^#]*)' + // $4 Query
+ ')?' +
+ '(?:' +
+ '#(.*)' + // $5 Fragment
+ ')?$'
+ , "i");
+
+ twttr.txt.initialized = true;
+ };
// Default CSS class for auto-linked URLs
var DEFAULT_URL_CLASS = "tweet-url";
@@ -285,6 +290,9 @@ if (!window.twttr) {
}
twttr.txt.autoLink = function(text, options) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
options = clone(options || {});
return twttr.txt.autoLinkUsernamesOrLists(
twttr.txt.autoLinkUrlsCustom(
@@ -295,6 +303,9 @@ if (!window.twttr) {
twttr.txt.autoLinkUsernamesOrLists = function(text, options) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
options = clone(options || {});
options.urlClass = options.urlClass || DEFAULT_URL_CLASS;
@@ -362,6 +373,9 @@ if (!window.twttr) {
};
twttr.txt.autoLinkHashtags = function(text, options) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
options = clone(options || {});
options.urlClass = options.urlClass || DEFAULT_URL_CLASS;
options.hashtagClass = options.hashtagClass || DEFAULT_HASHTAG_CLASS;
@@ -392,6 +406,9 @@ if (!window.twttr) {
twttr.txt.autoLinkUrlsCustom = function(text, options) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
options = clone(options || {});
if (!options.suppressNoFollow) {
options.rel = "nofollow";
@@ -427,6 +444,9 @@ if (!window.twttr) {
};
twttr.txt.extractMentions = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
var screenNamesOnly = [],
screenNamesWithIndices = twttr.txt.extractMentionsWithIndices(text);
@@ -439,6 +459,9 @@ if (!window.twttr) {
};
twttr.txt.extractMentionsWithIndices = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!text) {
return [];
}
@@ -461,6 +484,9 @@ if (!window.twttr) {
};
twttr.txt.extractReplies = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!text) {
return null;
}
@@ -474,6 +500,9 @@ if (!window.twttr) {
};
twttr.txt.extractUrls = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
var urlsOnly = [],
urlsWithIndices = twttr.txt.extractUrlsWithIndices(text);
@@ -485,6 +514,9 @@ if (!window.twttr) {
};
twttr.txt.extractUrlsWithIndices = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!text) {
return [];
}
@@ -510,6 +542,9 @@ if (!window.twttr) {
};
twttr.txt.extractHashtags = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
var hashtagsOnly = [],
hashtagsWithIndices = twttr.txt.extractHashtagsWithIndices(text);
@@ -521,6 +556,9 @@ if (!window.twttr) {
};
twttr.txt.extractHashtagsWithIndices = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!text) {
return [];
}
@@ -545,6 +583,9 @@ if (!window.twttr) {
// so "<>".split(/<|>/) => [] in IE, but is ["", "", ""] in all others
// but "<<".split("<") => ["", "", ""]
twttr.txt.splitTags = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
var firstSplits = text.split("<"),
secondSplits,
allSplits = [],
@@ -566,6 +607,9 @@ if (!window.twttr) {
};
twttr.txt.hitHighlight = function(text, hits, options) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
var defaultHighlightTag = "em";
hits = hits || [];
@@ -681,6 +725,9 @@ if (!window.twttr) {
// "empty": if the text is nil or empty
// "invalid_characters": if the text contains non-Unicode or any of the disallowed Unicode characters
twttr.txt.isInvalidTweet = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!text) {
return "empty";
}
@@ -699,10 +746,16 @@ if (!window.twttr) {
};
twttr.txt.isValidTweetText = function(text) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
return !twttr.txt.isInvalidTweet(text);
};
twttr.txt.isValidUsername = function(username) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!username) {
return false;
}
@@ -716,6 +769,9 @@ if (!window.twttr) {
var VALID_LIST_RE = regexSupplant(/^#{autoLinkUsernamesOrLists}$/);
twttr.txt.isValidList = function(usernameList) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
var match = usernameList.match(VALID_LIST_RE);
// Must have matched and had nothing before or after
@@ -723,6 +779,9 @@ if (!window.twttr) {
};
twttr.txt.isValidHashtag = function(hashtag) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!hashtag) {
return false;
}
@@ -734,6 +793,9 @@ if (!window.twttr) {
};
twttr.txt.isValidUrl = function(url, unicodeDomains) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (unicodeDomains == null) {
unicodeDomains = true;
}
@@ -768,6 +830,9 @@ if (!window.twttr) {
};
function isValidMatch(string, regex, optional) {
+ if (!twttr.txt.initialized) {
+ init();
+ }
if (!optional) {
// RegExp["$&"] is the text of the last match
// blank strings are ok, but are falsy, so we check stringiness instead of truthiness
@mzsanford
Copy link

Version 1.4.5 was just pushed to twitter/twitter-text-js and changed from large character classes to character ranges. Unlike Ruby 1.8.7, all of the tested JavaScript engines support Unicode ranges, so this should be much faster. I'll talk to @bcherry about the lazy init thing and see if we can integrate that as well. If you have a chance, please test the latest version and see if it helps.

@sayrer
Copy link
Author

sayrer commented Jul 18, 2011

Yeah, I noticed your patch right after I posted this. The new code definitely fixes the problem. It might not even be worth doing the lazy init stuff now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment