Skip to content

Instantly share code, notes, and snippets.

@episod
Created August 31, 2011 22:28
Show Gist options
  • Save episod/1184914 to your computer and use it in GitHub Desktop.
Save episod/1184914 to your computer and use it in GitHub Desktop.
help/configuration with expressions
{
"twitter_text": {
"version": "1.0-blah-blah",
"regular_expressions": {
"at_signs": '[@@]',
"url_chars_before": '(?:[^-\\/"\':!=a-z0-9_@@]|^|\\:)',
"url_domain": '(?:[^\\p{P}\\p{Lo}\\s][\\.-](?=[^\\p{P}\\p{Lo}\\s])|[^\\p{P}\\p{Lo}\\s])+\\.[a-z]{2,}(?::[0-9]+)?',
"probable_tld": '/\\.(?:com|net|org|gov|edu)$/iu',
"url_chars_path": '(?:(?:\\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\))|@[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\/|[\\.\\,]?(?:[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_~]|,(?!\s)))',
"urls_chars_path_end": '[a-z0-9=#\\/]',
"url_chars_query": '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]',
"url_chars_query_end": '[a-z0-9_&=#\\/]',
...
},
},
"characters_reserved_per_media": 20,
"max_media_per_upload": 1,
"non_username_paths": [
"about",
"account",
"accounts",
"activity",
"all",
"announcements",
"anywhere",
"api_rules",
"api_terms",
"apirules",
"apps",
"auth",
"badges",
"blog",
"business",
"buttons",
"contacts",
"devices",
"direct_messages",
"download",
"downloads",
"edit_announcements",
"faq",
"favorites",
"find_sources",
"find_users",
"followers",
"following",
"friend_request",
"friendrequest",
"friends",
"goodies",
"help",
"home",
"im_account",
"inbox",
"invitations",
"invite",
"jobs",
"list",
"login",
"logout",
"me",
"mentions",
"messages",
"newtwitter",
"notifications",
"nudge",
"oauth",
"phoenix_search",
"positions",
"privacy",
"public_timeline",
"related_tweets",
"replies",
"retweeted_of_mine",
"retweets",
"retweets_by_others",
"rules",
"saved_searches",
"search",
"sent",
"settings",
"share",
"signup",
"signin",
"similar_to",
"statistics",
"terms",
"tos",
"translate",
"trends",
"tweetbutton",
"twttr",
"update_discoverability",
"users",
"welcome",
"who_to_follow",
"widgets",
"zendesk_auth",
"media_signup",
"phoenix_qunit_tests"
],
"photo_size_limit": 3145728,
"photo_sizes": {
"large": {
"w": 1024,
"resize": "fit",
"h": 2048
},
"medium": {
"w": 600,
"resize": "fit",
"h": 1200
},
"small": {
"w": 340,
"resize": "fit",
"h": 480
},
"thumb": {
"w": 150,
"resize": "crop",
"h": 150
}
},
"short_url_length_https": 20,
"short_url_length": 19
}
abstract class Twitter_Regex {

    /**
     * Expression to match at sign characters.
     *
     * NOTE(review): the class lists "@" twice, which is redundant as written.
     * Presumably the second entry was meant to be another at-sign variant
     * (e.g. the fullwidth form U+FF20) -- confirm against the upstream
     * pattern before changing it. The same duplication appears in the other
     * patterns of this class that contain "[@@".
     *
     * @var string
     */
    const REGEX_AT_SIGNS = '[@@]';

    /**
     * Expression to match characters that may come before a URL.
     *
     * This is a pattern fragment (no delimiters); it is embedded into the
     * valid-URL pattern built by the constructor.
     *
     * @var string
     */
    const REGEX_URL_CHARS_BEFORE = '(?:[^-\\/"\':!=a-z0-9_@@]|^|\\:)';

    /**
     * Expression to match the domain portion of a URL, including an optional
     * ":port" suffix. Pattern fragment (no delimiters).
     *
     * @var string
     */
    const REGEX_URL_DOMAIN = '(?:[^\\p{P}\\p{Lo}\\s][\\.-](?=[^\\p{P}\\p{Lo}\\s])|[^\\p{P}\\p{Lo}\\s])+\\.[a-z]{2,}(?::[0-9]+)?';

    /**
     * Expression to match handful of probable TLDs for protocol-less URLS.
     *
     * Unlike the fragments above, this is a complete pattern with delimiters
     * and modifiers.
     *
     * @var string
     */
    const REGEX_PROBABLE_TLD = '/\\.(?:com|net|org|gov|edu)$/iu';

    /**
     * Expression to match characters that may come in the URL path.
     * Pattern fragment (no delimiters).
     *
     * @var string
     */
    const REGEX_URL_CHARS_PATH = '(?:(?:\\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\))|@[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\/|[\\.\\,]?(?:[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_~]|,(?!\s)))';

    /**
     * Expression to match characters that may come at the end of the URL path.
     * Pattern fragment (no delimiters).
     *
     * @var string
     */
    const REGEX_URL_CHARS_PATH_END = '[a-z0-9=#\\/]';

    /**
     * Expression to match characters that may come in the URL query string.
     * Pattern fragment (no delimiters).
     *
     * @var string
     */
    const REGEX_URL_CHARS_QUERY = '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]';

    /**
     * Expression to match characters that may come at the end of the URL query
     * string. Pattern fragment (no delimiters).
     *
     * @var string
     */
    const REGEX_URL_CHARS_QUERY_END = '[a-z0-9_&=#\\/]';

    /**
     * Expression to match a username followed by a list.
     *
     * Complete pattern with delimiters and modifiers.
     *
     * @var string
     */
    const REGEX_USERNAME_LIST = '/([^a-z0-9_\/]|^|RT:?)([@@]+)([a-z0-9_]{1,20})(\/[a-z][-_a-z0-9\x80-\xFF]{0,24})?([@@\xC0-\xD6\xD8-\xF6\xF8-\xFF]?)/iu';

    /**
     * Expression to match a username mentioned anywhere in a tweet.
     *
     * Complete pattern with delimiters and modifiers.
     *
     * @var string
     */
    const REGEX_USERNAME_MENTION = '/(^|[^a-z0-9_])[@@]([a-z0-9_]{1,20})([@@\xC0-\xD6\xD8-\xF6\xF8-\xFF]?)/iu';

    /**
     * Expression to match a hashtag.
     *
     * Complete pattern with delimiters and modifiers.
     *
     * NOTE(review): the class "[##]" lists "#" twice; presumably the second
     * entry was meant to be another hash-sign variant -- confirm against the
     * upstream pattern.
     *
     * @var string
     */
    const REGEX_HASHTAG = '/(^|[^0-9A-Z&\/\?]+)([##]+)([0-9A-Z_]*[A-Z_]+[a-z0-9_üÀ-ÖØ-öø-ÿ]*)/iu';

    /**
     * Expression to match whitespace.
     *
     * This fragment is written as raw byte sequences, so it must be used in a
     * pattern WITHOUT the "u" modifier (as the reply-username pattern built
     * by the constructor does). The escapes are inside a single-quoted PHP
     * string, so they are interpreted by PCRE rather than by PHP.
     *
     * Code point       UTF-8 bytes        Name
     *   0x0009-0x000D  09..0D             <control-0009>..<control-000D>
     *   0x0020         20                 SPACE
     *   0x0085         C2 85              <control-0085>
     *   0x00A0         C2 A0              NO-BREAK SPACE
     *   0x1680         E1 9A 80           OGHAM SPACE MARK
     *   0x180E         E1 A0 8E           MONGOLIAN VOWEL SEPARATOR
     *   0x2000-0x200A  E2 80 80..8A       EN QUAD..HAIR SPACE
     *   0x2028         E2 80 A8           LINE SEPARATOR
     *   0x2029         E2 80 A9           PARAGRAPH SEPARATOR
     *   0x202F         E2 80 AF           NARROW NO-BREAK SPACE
     *   0x205F         E2 81 9F           MEDIUM MATHEMATICAL SPACE
     *   0x3000         E3 80 80           IDEOGRAPHIC SPACE
     *
     * The bare bytes 85 and A0 in the first alternative are kept for
     * backward compatibility with single-byte (Latin-1 style) input. Be aware
     * that in an unanchored byte-mode search they can also match the trailing
     * byte of an unrelated UTF-8 character (e.g. C3 A0).
     *
     * @var string
     */
    const REGEX_WHITESPACE = '[\x09-\x0D\x20\x85\xA0]|\xc2[\x85\xa0]|\xe1\x9a\x80|\xe1\xa0\x8e|\xe2\x80[\x80-\x8a\xa8\xa9\xaf]|\xe2\x81\x9f|\xe3\x80\x80';

    /**
     * Contains the complete valid URL pattern string.
     *
     * This should be generated the first time the constructor is called.
     *
     * @var string The regex pattern for a valid URL.
     */
    protected static $REGEX_VALID_URL = null;

    /**
     * Contains the reply username pattern string.
     *
     * This should be generated the first time the constructor is called.
     *
     * @var string The regex pattern for a reply username.
     */
    protected static $REGEX_REPLY_USERNAME = null;

    /**
     * The tweet to be used in parsing. This should be populated by the
     * constructor of all subclasses.
     *
     * @var string
     */
    protected $tweet = '';

    /**
     * Stores the tweet and lazily builds the shared static patterns.
     *
     * The two static patterns are assembled only while they are still null,
     * so they are built once and then reused by every later instance.
     *
     * @param string $tweet The tweet to parse.
     */
    protected function __construct($tweet) {
        if (self::$REGEX_VALID_URL === null) {
            // Capture group numbering as seen by preg_match(): index 0 is the
            // whole match (preceding character included); the outer (?: ... )
            // wrapper is non-capturing, so the groups below start at 1.
            //
            // The "x" modifier is in effect. The "#" characters inside the
            // embedded fragments all sit inside character classes, where they
            // stay literal instead of starting a pattern comment.
            self::$REGEX_VALID_URL = '/(?:'
                . '('.self::REGEX_URL_CHARS_BEFORE.')'       // 1: Preceding character
                . '('                                        // 2: Complete URL
                . '((?:https?:\\/\\/|www\\.)?)'              // 3: Protocol (or www)
                . '('.self::REGEX_URL_DOMAIN.')'             // 4: Domain(s) (and port)
                . '(\\/'.self::REGEX_URL_CHARS_PATH.'*'      // 5: URL path
                . self::REGEX_URL_CHARS_PATH_END.'?)?'
                . '(\\?'.self::REGEX_URL_CHARS_QUERY.'*'     // 6: Query string
                . self::REGEX_URL_CHARS_QUERY_END.')?'
                . ')'
                . ')/iux';
        }

        if (self::$REGEX_REPLY_USERNAME === null) {
            // Deliberately no "u" modifier: REGEX_WHITESPACE is expressed as
            // raw byte sequences. Group 1 is the last leading whitespace
            // sequence (if any), group 2 is the username.
            self::$REGEX_REPLY_USERNAME = '/^('.self::REGEX_WHITESPACE.')*[@@]([a-zA-Z0-9_]{1,20})/';
        }

        $this->tweet = $tweet;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment