episod/configuration.json

## configuration.json
{
  "twitter_text": {
    "version": "1.0-blah-blah",
    "regular_expressions": {
      "at_signs": '[@＠]',
      "url_chars_before": '(?:[^-\\/"\':!=a-z0-9_@＠]|^|\\:)',
      "url_domain": '(?:[^\\p{P}\\p{Lo}\\s][\\.-](?=[^\\p{P}\\p{Lo}\\s])|[^\\p{P}\\p{Lo}\\s])+\\.[a-z]{2,}(?::[0-9]+)?',
      "probable_tld": '/\\.(?:com|net|org|gov|edu)$/iu',
      "url_chars_path": '(?:(?:\\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\))|@[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\/|[\\.\\,]?(?:[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_~]|,(?!\s)))',
      "url_chars_path_end": '[a-z0-9=#\\/]',
      "url_chars_query": '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]',
      "url_chars_query_end": '[a-z0-9_&=#\\/]',
      ...
    },
  },
  "characters_reserved_per_media": 20,
  "max_media_per_upload": 1,
  "non_username_paths": [
    "about",
    "account",
    "accounts",
    "activity",
    "all",
    "announcements",
    "anywhere",
    "api_rules",
    "api_terms",
    "apirules",
    "apps",
    "auth",
    "badges",
    "blog",
    "business",
    "buttons",
    "contacts",
    "devices",
    "direct_messages",
    "download",
    "downloads",
    "edit_announcements",
    "faq",
    "favorites",
    "find_sources",
    "find_users",
    "followers",
    "following",
    "friend_request",
    "friendrequest",
    "friends",
    "goodies",
    "help",
    "home",
    "im_account",
    "inbox",
    "invitations",
    "invite",
    "jobs",
    "list",
    "login",
    "logout",
    "me",
    "mentions",
    "messages",
    "newtwitter",
    "notifications",
    "nudge",
    "oauth",
    "phoenix_search",
    "positions",
    "privacy",
    "public_timeline",
    "related_tweets",
    "replies",
    "retweeted_of_mine",
    "retweets",
    "retweets_by_others",
    "rules",
    "saved_searches",
    "search",
    "sent",
    "settings",
    "share",
    "signup",
    "signin",
    "similar_to",
    "statistics",
    "terms",
    "tos",
    "translate",
    "trends",
    "tweetbutton",
    "twttr",
    "update_discoverability",
    "users",
    "welcome",
    "who_to_follow",
    "widgets",
    "zendesk_auth",
    "media_signup",
    "phoenix_qunit_tests"
  ],
  "photo_size_limit": 3145728,
  "photo_sizes": {
    "large": {
      "w": 1024,
      "resize": "fit",
      "h": 2048
    },
    "medium": {
      "w": 600,
      "resize": "fit",
      "h": 1200
    },
    "small": {
      "w": 340,
      "resize": "fit",
      "h": 480
    },
    "thumb": {
      "w": 150,
      "resize": "crop",
      "h": 150
    }
  },
  "short_url_length_https": 20,
  "short_url_length": 19
}

abstract class Twitter_Regex {

  /**
   * Expression to match at sign characters (ASCII "@" and full-width "＠").
   *
   * The class contains a multi-byte character, so it only behaves as
   * intended inside patterns compiled with the "u" modifier.
   *
   * @var  string
   */
  const REGEX_AT_SIGNS = '[@＠]';

  /**
   * Expression to match characters that may come before a URL.
   *
   * @var  string
   */
  const REGEX_URL_CHARS_BEFORE = '(?:[^-\\/"\':!=a-z0-9_@＠]|^|\\:)';

  /**
   * Expression to match the domain portion of a URL (with optional port).
   *
   * @var  string
   */
  const REGEX_URL_DOMAIN = '(?:[^\\p{P}\\p{Lo}\\s][\\.-](?=[^\\p{P}\\p{Lo}\\s])|[^\\p{P}\\p{Lo}\\s])+\\.[a-z]{2,}(?::[0-9]+)?';

  /**
   * Expression to match handful of probable TLDs for protocol-less URLS.
   *
   * Unlike most constants here this is a complete, delimited pattern.
   *
   * @var  string
   */
  const REGEX_PROBABLE_TLD = '/\\.(?:com|net|org|gov|edu)$/iu';

  /**
   * Expression to match characters that may come in the URL path.
   *
   * @var  string
   */
  const REGEX_URL_CHARS_PATH = '(?:(?:\\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\))|@[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\/|[\\.\\,]?(?:[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_~]|,(?!\s)))';

  /**
   * Expression to match characters that may come at the end of the URL path.
   *
   * @var  string
   */
  const REGEX_URL_CHARS_PATH_END = '[a-z0-9=#\\/]';

  /**
   * Expression to match characters that may come in the URL query string.
   *
   * @var  string
   */
  const REGEX_URL_CHARS_QUERY = '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]';

  /**
   * Expression to match characters that may come at the end of the URL query
   * string.
   *
   * @var  string
   */
  const REGEX_URL_CHARS_QUERY_END = '[a-z0-9_&=#\\/]';

  /**
   * Expression to match a username followed by a list.
   *
   * @var  string
   */
  const REGEX_USERNAME_LIST = '/([^a-z0-9_\/]|^|RT:?)([@＠]+)([a-z0-9_]{1,20})(\/[a-z][-_a-z0-9\x80-\xFF]{0,24})?([@＠\xC0-\xD6\xD8-\xF6\xF8-\xFF]?)/iu';

  /**
   * Expression to match a username mentioned anywhere in a tweet.
   *
   * @var  string
   */
  const REGEX_USERNAME_MENTION = '/(^|[^a-z0-9_])[@＠]([a-z0-9_]{1,20})([@＠\xC0-\xD6\xD8-\xF6\xF8-\xFF]?)/iu';

  /**
   * Expression to match a hashtag.
   *
   * @var  string
   */
  const REGEX_HASHTAG = '/(^|[^0-9A-Z&\/\?]+)([#＃]+)([0-9A-Z_]*[A-Z_]+[a-z0-9_üÀ-ÖØ-öø-ÿ]*)/iu';

  /**
   * Expression to match whitespace.
   *
   * This is a byte-wise expression: it spells out the UTF-8 byte sequences
   * of each character and must be used WITHOUT the "u" modifier.
   *
   * Single byte whitespace characters
   *   0x0009-0x000D White_Space # Cc # <control-0009>..<control-000D>
   *   0x0020        White_Space # Zs # SPACE
   *   0x0085        White_Space # Cc # <control-0085>   (bare byte, legacy)
   *   0x00A0        White_Space # Zs # NO-BREAK SPACE   (bare byte, legacy)
   * Multi byte whitespace characters (UTF-8)
   *   0x0085        White_Space # Cc # <control-0085>             C2 85
   *   0x00A0        White_Space # Zs # NO-BREAK SPACE             C2 A0
   *   0x1680        White_Space # Zs # OGHAM SPACE MARK           E1 9A 80
   *   0x180E        White_Space # Zs # MONGOLIAN VOWEL SEPARATOR  E1 A0 8E
   *   0x2000-0x200A White_Space # Zs # EN QUAD..HAIR SPACE        E2 80 80-8A
   *   0x2028        White_Space # Zl # LINE SEPARATOR             E2 80 A8
   *   0x2029        White_Space # Zp # PARAGRAPH SEPARATOR        E2 80 A9
   *   0x202F        White_Space # Zs # NARROW NO-BREAK SPACE      E2 80 AF
   *   0x205F        White_Space # Zs # MEDIUM MATHEMATICAL SPACE  E2 81 9F
   *   0x3000        White_Space # Zs # IDEOGRAPHIC SPACE          E3 80 80
   *
   * Inside a character class a comma is a literal, so the byte class must
   * not use commas as separators. U+205F lives under the E2 81 lead bytes,
   * so it needs its own alternative rather than a slot in the E2 80 class.
   *
   * @var  string
   */
  const REGEX_WHITESPACE = '[\x09-\x0D\x20\x85\xA0]|\xc2\x85|\xc2\xa0|\xe1\x9a\x80|\xe1\xa0\x8e|\xe2\x80[\x80-\x8a\xa8\xa9\xaf]|\xe2\x81\x9f|\xe3\x80\x80';

  /**
   * Contains the complete valid URL pattern string.
   *
   * This is generated the first time the constructor is called.
   *
   * @var  string  The regex pattern for a valid URL.
   */
  protected static $REGEX_VALID_URL = null;

  /**
   * Contains the reply username pattern string.
   *
   * This is generated the first time the constructor is called.
   *
   * @var  string  The regex pattern for a reply username.
   */
  protected static $REGEX_REPLY_USERNAME = null;

  /**
   * The tweet to be used in parsing.  This should be populated by the
   * constructor of all subclasses.
   *
   * @var  string
   */
  protected $tweet = '';

  /**
   * Stores the tweet and lazily assembles the composite patterns.
   *
   * Both static patterns are built only while they are still null, so the
   * string concatenation happens once and is then reused by every instance.
   *
   * @param  string  $tweet  The tweet to parse.
   */
  protected function __construct($tweet) {
    if (self::$REGEX_VALID_URL === null) {
      # The outermost group is non-capturing, so numbering starts at the
      # preceding character; $0 is the complete match.
      self::$REGEX_VALID_URL = '/(?:'
        . '('.self::REGEX_URL_CHARS_BEFORE.')'    # $1 Preceding character
        . '('                                     # $2 Complete URL
        . '((?:https?:\\/\\/|www\\.)?)'           # $3 Protocol (or www)
        . '('.self::REGEX_URL_DOMAIN.')'          # $4 Domain(s) (and port)
        . '(\\/'.self::REGEX_URL_CHARS_PATH.'*'   # $5 URL Path
        . self::REGEX_URL_CHARS_PATH_END.'?)?'
        . '(\\?'.self::REGEX_URL_CHARS_QUERY.'*'  # $6 Query String
        . self::REGEX_URL_CHARS_QUERY_END.')?'
        . ')'
        . ')/iux';
    }
    if (self::$REGEX_REPLY_USERNAME === null) {
      # This pattern is byte-wise (no "u" modifier) because REGEX_WHITESPACE
      # is written as raw UTF-8 bytes.  A character class would treat the
      # full-width at sign as three independent bytes, so the two at signs
      # are spelled as an alternation instead.  $1 is the last whitespace
      # unit consumed, $2 is the username.
      self::$REGEX_REPLY_USERNAME = '/^('.self::REGEX_WHITESPACE.')*(?:@|＠)([a-zA-Z0-9_]{1,20})/';
    }
    $this->tweet = $tweet;
  }

}
	{
	"twitter_text": {
	"version": "1.0-blah-blah",
	"regular_expressions": {
	"at_signs": '[@＠]',
	"url_chars_before": '(?:[^-\\/"\':!=a-z0-9_@＠]|^|\\:)',
	"url_domain": '(?:[^\\p{P}\\p{Lo}\\s][\\.-](?=[^\\p{P}\\p{Lo}\\s])|[^\\p{P}\\p{Lo}\\s])+\\.[a-z]{2,}(?::[0-9]+)?',
	"probable_tld": '/\\.(?:com|net|org|gov|edu)$/iu',
	"url_chars_path": '(?:(?:\\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\))|@[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\/|[\\.\\,]?(?:[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_~]|,(?!\s)))',
	"url_chars_path_end": '[a-z0-9=#\\/]',
	"url_chars_query": '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]',
	"url_chars_query_end": '[a-z0-9_&=#\\/]',
	...
	},
	},
	"characters_reserved_per_media": 20,
	"max_media_per_upload": 1,
	"non_username_paths": [
	"about",
	"account",
	"accounts",
	"activity",
	"all",
	"announcements",
	"anywhere",
	"api_rules",
	"api_terms",
	"apirules",
	"apps",
	"auth",
	"badges",
	"blog",
	"business",
	"buttons",
	"contacts",
	"devices",
	"direct_messages",
	"download",
	"downloads",
	"edit_announcements",
	"faq",
	"favorites",
	"find_sources",
	"find_users",
	"followers",
	"following",
	"friend_request",
	"friendrequest",
	"friends",
	"goodies",
	"help",
	"home",
	"im_account",
	"inbox",
	"invitations",
	"invite",
	"jobs",
	"list",
	"login",
	"logout",
	"me",
	"mentions",
	"messages",
	"newtwitter",
	"notifications",
	"nudge",
	"oauth",
	"phoenix_search",
	"positions",
	"privacy",
	"public_timeline",
	"related_tweets",
	"replies",
	"retweeted_of_mine",
	"retweets",
	"retweets_by_others",
	"rules",
	"saved_searches",
	"search",
	"sent",
	"settings",
	"share",
	"signup",
	"signin",
	"similar_to",
	"statistics",
	"terms",
	"tos",
	"translate",
	"trends",
	"tweetbutton",
	"twttr",
	"update_discoverability",
	"users",
	"welcome",
	"who_to_follow",
	"widgets",
	"zendesk_auth",
	"media_signup",
	"phoenix_qunit_tests"
	],
	"photo_size_limit": 3145728,
	"photo_sizes": {
	"large": {
	"w": 1024,
	"resize": "fit",
	"h": 2048
	},
	"medium": {
	"w": 600,
	"resize": "fit",
	"h": 1200
	},
	"small": {
	"w": 340,
	"resize": "fit",
	"h": 480
	},
	"thumb": {
	"w": 150,
	"resize": "crop",
	"h": 150
	}
	},
	"short_url_length_https": 20,
	"short_url_length": 19
	}

	abstract class Twitter_Regex {

	  /**
	   * Expression to match at sign characters (ASCII "@" and full-width "＠").
	   *
	   * The class contains a multi-byte character, so it only behaves as
	   * intended inside patterns compiled with the "u" modifier.
	   *
	   * @var  string
	   */
	  const REGEX_AT_SIGNS = '[@＠]';

	  /**
	   * Expression to match characters that may come before a URL.
	   *
	   * @var  string
	   */
	  const REGEX_URL_CHARS_BEFORE = '(?:[^-\\/"\':!=a-z0-9_@＠]|^|\\:)';

	  /**
	   * Expression to match the domain portion of a URL (with optional port).
	   *
	   * @var  string
	   */
	  const REGEX_URL_DOMAIN = '(?:[^\\p{P}\\p{Lo}\\s][\\.-](?=[^\\p{P}\\p{Lo}\\s])|[^\\p{P}\\p{Lo}\\s])+\\.[a-z]{2,}(?::[0-9]+)?';

	  /**
	   * Expression to match handful of probable TLDs for protocol-less URLS.
	   *
	   * Unlike most constants here this is a complete, delimited pattern.
	   *
	   * @var  string
	   */
	  const REGEX_PROBABLE_TLD = '/\\.(?:com|net|org|gov|edu)$/iu';

	  /**
	   * Expression to match characters that may come in the URL path.
	   *
	   * @var  string
	   */
	  const REGEX_URL_CHARS_PATH = '(?:(?:\\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\))|@[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\\/|[\\.\\,]?(?:[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_~]|,(?!\s)))';

	  /**
	   * Expression to match characters that may come at the end of the URL path.
	   *
	   * @var  string
	   */
	  const REGEX_URL_CHARS_PATH_END = '[a-z0-9=#\\/]';

	  /**
	   * Expression to match characters that may come in the URL query string.
	   *
	   * @var  string
	   */
	  const REGEX_URL_CHARS_QUERY = '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]';

	  /**
	   * Expression to match characters that may come at the end of the URL query
	   * string.
	   *
	   * @var  string
	   */
	  const REGEX_URL_CHARS_QUERY_END = '[a-z0-9_&=#\\/]';

	  /**
	   * Expression to match a username followed by a list.
	   *
	   * @var  string
	   */
	  const REGEX_USERNAME_LIST = '/([^a-z0-9_\/]|^|RT:?)([@＠]+)([a-z0-9_]{1,20})(\/[a-z][-_a-z0-9\x80-\xFF]{0,24})?([@＠\xC0-\xD6\xD8-\xF6\xF8-\xFF]?)/iu';

	  /**
	   * Expression to match a username mentioned anywhere in a tweet.
	   *
	   * @var  string
	   */
	  const REGEX_USERNAME_MENTION = '/(^|[^a-z0-9_])[@＠]([a-z0-9_]{1,20})([@＠\xC0-\xD6\xD8-\xF6\xF8-\xFF]?)/iu';

	  /**
	   * Expression to match a hashtag.
	   *
	   * @var  string
	   */
	  const REGEX_HASHTAG = '/(^|[^0-9A-Z&\/\?]+)([#＃]+)([0-9A-Z_]*[A-Z_]+[a-z0-9_üÀ-ÖØ-öø-ÿ]*)/iu';

	  /**
	   * Expression to match whitespace.
	   *
	   * This is a byte-wise expression: it spells out the UTF-8 byte sequences
	   * of each character and must be used WITHOUT the "u" modifier.
	   *
	   * Single byte whitespace characters
	   *   0x0009-0x000D White_Space # Cc # <control-0009>..<control-000D>
	   *   0x0020        White_Space # Zs # SPACE
	   *   0x0085        White_Space # Cc # <control-0085>   (bare byte, legacy)
	   *   0x00A0        White_Space # Zs # NO-BREAK SPACE   (bare byte, legacy)
	   * Multi byte whitespace characters (UTF-8)
	   *   0x0085        White_Space # Cc # <control-0085>             C2 85
	   *   0x00A0        White_Space # Zs # NO-BREAK SPACE             C2 A0
	   *   0x1680        White_Space # Zs # OGHAM SPACE MARK           E1 9A 80
	   *   0x180E        White_Space # Zs # MONGOLIAN VOWEL SEPARATOR  E1 A0 8E
	   *   0x2000-0x200A White_Space # Zs # EN QUAD..HAIR SPACE        E2 80 80-8A
	   *   0x2028        White_Space # Zl # LINE SEPARATOR             E2 80 A8
	   *   0x2029        White_Space # Zp # PARAGRAPH SEPARATOR        E2 80 A9
	   *   0x202F        White_Space # Zs # NARROW NO-BREAK SPACE      E2 80 AF
	   *   0x205F        White_Space # Zs # MEDIUM MATHEMATICAL SPACE  E2 81 9F
	   *   0x3000        White_Space # Zs # IDEOGRAPHIC SPACE          E3 80 80
	   *
	   * Inside a character class a comma is a literal, so the byte class must
	   * not use commas as separators. U+205F lives under the E2 81 lead bytes,
	   * so it needs its own alternative rather than a slot in the E2 80 class.
	   *
	   * @var  string
	   */
	  const REGEX_WHITESPACE = '[\x09-\x0D\x20\x85\xA0]|\xc2\x85|\xc2\xa0|\xe1\x9a\x80|\xe1\xa0\x8e|\xe2\x80[\x80-\x8a\xa8\xa9\xaf]|\xe2\x81\x9f|\xe3\x80\x80';

	  /**
	   * Contains the complete valid URL pattern string.
	   *
	   * This is generated the first time the constructor is called.
	   *
	   * @var  string  The regex pattern for a valid URL.
	   */
	  protected static $REGEX_VALID_URL = null;

	  /**
	   * Contains the reply username pattern string.
	   *
	   * This is generated the first time the constructor is called.
	   *
	   * @var  string  The regex pattern for a reply username.
	   */
	  protected static $REGEX_REPLY_USERNAME = null;

	  /**
	   * The tweet to be used in parsing.  This should be populated by the
	   * constructor of all subclasses.
	   *
	   * @var  string
	   */
	  protected $tweet = '';

	  /**
	   * Stores the tweet and lazily assembles the composite patterns.
	   *
	   * Both static patterns are built only while they are still null, so the
	   * string concatenation happens once and is then reused by every instance.
	   *
	   * @param  string  $tweet  The tweet to parse.
	   */
	  protected function __construct($tweet) {
	    if (self::$REGEX_VALID_URL === null) {
	      # The outermost group is non-capturing, so numbering starts at the
	      # preceding character; $0 is the complete match.
	      self::$REGEX_VALID_URL = '/(?:'
	        . '('.self::REGEX_URL_CHARS_BEFORE.')'    # $1 Preceding character
	        . '('                                     # $2 Complete URL
	        . '((?:https?:\\/\\/|www\\.)?)'           # $3 Protocol (or www)
	        . '('.self::REGEX_URL_DOMAIN.')'          # $4 Domain(s) (and port)
	        . '(\\/'.self::REGEX_URL_CHARS_PATH.'*'   # $5 URL Path
	        . self::REGEX_URL_CHARS_PATH_END.'?)?'
	        . '(\\?'.self::REGEX_URL_CHARS_QUERY.'*'  # $6 Query String
	        . self::REGEX_URL_CHARS_QUERY_END.')?'
	        . ')'
	        . ')/iux';
	    }
	    if (self::$REGEX_REPLY_USERNAME === null) {
	      # This pattern is byte-wise (no "u" modifier) because REGEX_WHITESPACE
	      # is written as raw UTF-8 bytes.  A character class would treat the
	      # full-width at sign as three independent bytes, so the two at signs
	      # are spelled as an alternation instead.  $1 is the last whitespace
	      # unit consumed, $2 is the username.
	      self::$REGEX_REPLY_USERNAME = '/^('.self::REGEX_WHITESPACE.')*(?:@|＠)([a-zA-Z0-9_]{1,20})/';
	    }
	    $this->tweet = $tweet;
	  }

	}