Skip to content

Instantly share code, notes, and snippets.

@requinix
Last active November 21, 2016 15:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save requinix/68f2810b8a9824239c23 to your computer and use it in GitHub Desktop.
Save requinix/68f2810b8a9824239c23 to your computer and use it in GitHub Desktop.
<?php
// This code/file/output has no license, though attribution would be appreciated.
// Output at http://pastebin.com/2ZLehM5N
// Self-documenting mode: when the page is requested as "?source", print this file's
// syntax-highlighted source instead of the generated regex, then stop.
// QUERY_STRING is absent when the script is not run through a web request (e.g. the CLI),
// so test for it first to avoid an undefined-index notice leaking into the output.
// The strict comparison makes sure only the exact string "source" triggers this mode.
if (isset($_SERVER["QUERY_STRING"]) && $_SERVER["QUERY_STRING"] === "source") {
    highlight_file(__FILE__);
    return;
}
// ABNF core rules (cf. RFC 5234, Appendix B.1) that the grammar below is built from,
// each expressed as a PCRE fragment. The strings are single-quoted so that every
// backslash is handed to PCRE untouched rather than being interpreted by PHP.
// Character classes
$ALPHA  = '[A-Za-z]';
$DIGIT  = '[0-9]';
$VCHAR  = '[\x21-\x7E]'; // visible (printing) characters
$WSP    = '[\x20\t]';    // space or horizontal tab
// Single characters
$DQUOTE = '\x22';        // double quote
$CR     = '\r';
$LF     = '\n';
// Line break: carriage return immediately followed by line feed
$CRLF   = $CR . $LF;
/*
* RFC 5322: Internet Message Format <http://tools.ietf.org/html/rfc5322>
*
* $ADDR_SPEC is the form of an email address
*
* There is special PCRE syntax used to deal with the $COMMENT/$CCONTENT recursion, namely
* using (?P<foo>...) to name the first instance of the subpattern and (?P>foo) to recurse.
* Since there can't be multiple subpatterns with the same name, later instances of the subpattern
* use (?P>foo) to reference it.
*
* Note: this will validate an email address, but the SMTP standards have their own additional rules,
* such as fully-qualified domain names and maximum lengths. Just because this considers an email
* to be valid, SMTP may consider it invalid. Additionally, SMTP agents may take shortcuts and
* falsely accept or reject addresses when they should not.
*/
// The rules below transcribe the RFC 5322 ABNF into PCRE fragments, one variable per rule;
// the section numbers refer to RFC 5322. Each rule is built by interpolating earlier rules.
// Spaces inside the strings only separate the interpolated pieces for readability: all
// whitespace is stripped when the final regex is assembled from $ADDR_SPEC.
// In the PHP source "\\" yields a single backslash in the resulting pattern text.
//
// 4.1 Miscellaneous Obsolete Tokens
// obs-NO-WS-CTL: control characters \x01-\x1F except tab (\x09), LF (\x0A) and CR (\x0D), plus DEL (\x7F)
$OBS_NO_WS_CTL = "[\\x01-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]";
// obs-ctext = obs-NO-WS-CTL
$OBS_CTEXT = "$OBS_NO_WS_CTL";
// obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
// The leading "\\\\" becomes \\ in the pattern, i.e. one literal backslash; \0 matches NUL.
$OBS_QP = "\\\\(\\0| $OBS_NO_WS_CTL | $LF | $CR )";
// obs-qtext = obs-NO-WS-CTL
$OBS_QTEXT = "$OBS_NO_WS_CTL";
// 3.2.1 Quoted characters
// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
$QUOTED_PAIR = "(\\\\( $VCHAR | $WSP )| $OBS_QP )";
// 4.2 Obsolete Folding White Space
// obs-FWS = 1*WSP *(CRLF 1*WSP)
$OBS_FWS = "( $WSP )+( $CRLF ( $WSP )+)*";
// 3.2.2 Folding White Space and Comments
// FWS = ([*WSP CRLF] 1*WSP) / obs-FWS
$FWS = "((( $WSP )* $CRLF )?( $WSP )+| $OBS_FWS )";
// ctext: printable ASCII except "(" (\x28), ")" (\x29) and "\" (\x5C), or obs-ctext
$CTEXT = "([\\x21-\\x27\\x2A-\\x5B\\x5D-\\x7E]| $OBS_CTEXT )";
// ccontent = ctext / quoted-pair / comment
// The nested comment is written as a recursive call to the named group COMMENT.
$CCONTENT = "( $CTEXT | $QUOTED_PAIR | (?P>COMMENT) )";
// comment = "(" *([FWS] ccontent) [FWS] ")"
// $_COMMENT is the one and only definition of the named group COMMENT.
$_COMMENT = "(?P<COMMENT>\\((( $FWS )? $CCONTENT )*( $FWS )?\\))";
// Placeholder: single-quoted, so this is the literal text "$COMMENT" (no interpolation).
// Rules that contain a comment embed this marker; it is swapped for the real pattern
// (or a recursive call to it) after $ADDR_SPEC has been built.
$COMMENT = '$COMMENT';
// CFWS = (1*([FWS] comment) [FWS]) / FWS
$CFWS = "((( $FWS )? $COMMENT )+( $FWS )? | $FWS )";
// 3.2.3 Atom
// atext = ALPHA / DIGIT / one of the listed printable specials
// "\$" stops PHP from treating "$%" as a variable; "\\-" puts an escaped, literal hyphen in the class.
$ATEXT = "( $ALPHA | $DIGIT |[!#\$%&'*+\\-/=?^_`{|}~])";
// atom = [CFWS] 1*atext [CFWS]
$ATOM = "( $CFWS )?( $ATEXT )+( $CFWS )?";
// dot-atom-text = 1*atext *("." 1*atext)
$DOT_ATOM_TEXT = "( $ATEXT )+(\\.( $ATEXT )+)*";
// dot-atom = [CFWS] dot-atom-text [CFWS]
$DOT_ATOM = "( $CFWS )? $DOT_ATOM_TEXT ( $CFWS )?";
// 3.2.4 Quoted Strings
// qtext: printable ASCII except the double quote (\x22) and "\" (\x5C), or obs-qtext
$QTEXT = "([\\x21\\x23-\\x5B\\x5D-\\x7E] | $OBS_QTEXT )";
// qcontent = qtext / quoted-pair
$QCONTENT = "( $QTEXT | $QUOTED_PAIR )";
// quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
$QUOTED_STRING = "( $CFWS )? $DQUOTE (( $FWS )? $QCONTENT )*( $FWS )? $DQUOTE ( $CFWS )?";
// 3.2.5 Miscellaneous Tokens
// word = atom / quoted-string
$WORD = "( $ATOM | $QUOTED_STRING )";
// 4.4 Obsolete Addressing
// obs-local-part = word *("." word)
$OBS_LOCAL_PART = "$WORD (\\. $WORD )*";
// obs-domain = atom *("." atom)
$OBS_DOMAIN = "$ATOM (\\. $ATOM )*";
// obs-dtext = obs-NO-WS-CTL / quoted-pair
$OBS_DTEXT = "( $OBS_NO_WS_CTL | $QUOTED_PAIR )";
// 3.4.1 Addr-Spec Specification
// local-part = dot-atom / quoted-string / obs-local-part
$LOCAL_PART = "( $DOT_ATOM | $QUOTED_STRING | $OBS_LOCAL_PART )";
// dtext: printable ASCII except "[" (\x5B), "\" (\x5C) and "]" (\x5D), or obs-dtext
$DTEXT = "([\\x21-\\x5A\\x5E-\\x7E]| $OBS_DTEXT )";
// domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
$DOMAIN_LITERAL = "( $CFWS )? \\[(( $FWS )? $DTEXT )*( $FWS )?\\]( $CFWS )?";
// domain = dot-atom / domain-literal / obs-domain
$DOMAIN = "( $DOT_ATOM | $DOMAIN_LITERAL | $OBS_DOMAIN )";
// addr-spec = local-part "@" domain
$ADDR_SPEC = "$LOCAL_PART @ $DOMAIN";
///
// Resolve the $COMMENT/$CCONTENT recursion.
// Every comment in the grammar was emitted as the literal placeholder text held in $COMMENT.
// A named group may only be defined once, so the first placeholder is replaced with the
// group definition ($_COMMENT) and every remaining one with a recursive call to that group.
$first = strpos($ADDR_SPEC, $COMMENT);
if ($first !== false) {
    // Splice the definition over the first placeholder only...
    $ADDR_SPEC = substr_replace($ADDR_SPEC, $_COMMENT, $first, strlen($COMMENT));
    // ...then turn all later placeholders into subroutine calls.
    $ADDR_SPEC = str_replace($COMMENT, "(?P>COMMENT)", $ADDR_SPEC);
}
// Remove the readability whitespace used in the rule strings, escape "/" so it cannot end
// the /.../ delimiters, and anchor the pattern at both ends with ^ and $.
// The D modifier (PCRE_DOLLAR_ENDONLY) is required: without it "$" also matches just before
// a trailing newline, so a subject such as "user@example.com\n" would be accepted as valid.
$regex = '/^' . str_replace("/", "\\/", preg_replace('/\s+/', "", $ADDR_SPEC)) . '$/D';
?>
<?php /* Print the generated pattern, HTML-escaped, in a block that wraps long lines. */ ?>
<pre style="word-wrap:break-word;">
<?php echo htmlentities($regex); ?>
</pre>
<?php /* Link to this same script in "?source" mode, which shows the rules it was built from. */ ?>
<a href="<?php echo basename(__FILE__); ?>?source">Source rules</a>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment