Instantly share code, notes, and snippets.
Created
July 14, 2017 23:17
-
Save russianryebread/07db6aa50a0c3866e2dbba144921b258 to your computer and use it in GitHub Desktop.
PHP Whitespace Detector (phptags) [Requires PHP 5]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/php -qCdshort_open_tag=1 | |
<?php | |
/** | |
* api: cli | |
* type: application | |
* title: PHP tag tidier | |
* description: Rewrites PHP scripts' short/long open tags, close tags, apply whitespace fixes | |
* version: 1.1 | |
* license: Public Domain | |
* author: mario <mario#include-once:org> | |
* category: utilities | |
* config: <file type="array" value="$HOME/.config/php/phptags.php" title="configuration defaults file" description="an ordinary return(array(...)); script to set interna options like regex=>1 or verbose=>1" /> | |
* url: http://freshcode.club/projects/phptags | |
* | |
* Simple command-line tool to rewrite PHP <?php open tags into | |
* long and short forms, adding or removing the closing ?> token, | |
* probing or removal of trailing whitespace and hidden markers | |
* (e.g. UTF-8 BOM, or NUL bytes). | |
* | |
* A simple invocation that rewrites all *.php scripts in a given | |
* directory (recursively) is: | |
* | |
* phptags --whitespace --close ./forum/ | |
* | |
* Also works on a list of files: | |
* | |
* phptags --warn -v *.php | |
* | |
* | |
* There are two distinct matching and rewriting modes, the default | |
* regex matching and traversing scripts per PHP tokenizer. Both | |
* work for all tasks theoretically. | |
* | |
* Whitespace detection and removal is always done using regular | |
* expressions. For --close and --unclose tasks it is likewise | |
* reliable. | |
* | |
* Rewriting short and long tags should preferrably done with the | |
* --tokenizer mode, because the regex rewrite is not context-aware | |
* (can affect open/close tags in strings or comments). Albeit that | |
* might be desireable in edge cases, and regex usage also retains | |
* spacing after short tags more prudently. | |
* | |
* | |
* Lastly this tool is Public Domain, compatible to all open | |
* source and Free software licenses. Thus redistributable with | |
* applications, scripts and libraries. BUT COMES WITH NO WARRANTY. | |
* | |
* | |
* @package phptags | |
* @license http://creativecommons.org/licenses/publicdomain/ | |
*/ | |
/** | |
* Fetch commandline options for later use. | |
* Also merges in the configuration file overrides, if present | |
* | |
*/ | |
$action = new ArrayObject( | |
(array)@(include(current(array_filter(array("$_SERVER[PHPTAGS_CONFIG]","$_SERVER[XDG_CONFIG_HOME]/php/phptags.php","$_SERVER[HOME]/.config/php/phptags.php","$_SERVER[APPDATA]/php/phptags.php"),"file_exists")))) | |
+ | |
array( | |
"help" => argv("-h", "-help", "--help", "/?", "-?"), | |
"syntax" => count($_SERVER["argv"]) < 2, | |
"long" => argv("-l", "-long", "--long", "--long-open"), | |
"php54" => argv("-54", "-php54", "--php54"), # don't rewrite <?= for PHP 5.4 | |
"php7" => argv("-7", "-php7", "--php7"), # test for <% ASP %> and <script language=PHP> tags | |
"short" => argv("-s", "-short", "--short", "--short-open"), | |
"shortall" => argv("-a", "-all", "--all", "-shortall", "--shortall", "--short-all", "-sa"), | |
"unclosed" => argv("-u", "-unclosed", "--unclosed", "--remove-closing", "--unclose", "--open", "--opened"), | |
"close" => argv("-c", "-close", "--close", "-closed", "--closed", "--add-closing"), | |
"white" => argv("-w", "-white", "--white", "--whitespace", "-ws", "--ws", "-space", "--space", "--fix-whitespace", "--bom"), | |
"warn" => argv("-W", "-warn", "--warn", "--warning"), | |
//"html" => argv("-H", "-html", "--html", "-text", "--text", "-content", "--content"), # warn also about plain HTML outside of PHP code | |
"recursive" => true + argv("-r"), # doing that anyway | |
"regex" => argv("--regex", "--rx", "-rx"), # use regex | |
"token" => argv("--tokenizer", "--token", "-t"),# use tokenizer | |
"color" => argv("-c", "--color", "--ansi") or isset($_SERVER["TERM"]), # colorize some output | |
"quiet" => argv("--quiet", "-quiet", "-q"), | |
"verbose" => argv("--verbose", "-verbose", "-v"), | |
"debug" => argv("--debug", "-D"), | |
"dry" => argv("--dry", "-d"), # dry run, don't save files | |
"new" => argv("--new", "--suffix"), # save files under file.php.new | |
"backup" => argv("-b", "--backup"), # renames to file.php~ before overwriting | |
"version" => argv("--version", "-V"), | |
"PCRE_VERSION" => PCRE_VERSION, | |
"files" => argv_files(), | |
), 2 | |
); | |
// show options | |
($action->debug) and print_r($action); | |
// print version | |
($action->version) and print(join(preg_grep('/^\s*\*\s*version+:/', file($_SERVER["argv"][0])))); | |
// print help | |
if ($action->help) { | |
print <<<HELP | |
syntax: phptags [options] [path/files] | |
Traverses a given directory or files to rewrite PHP script tags/tokens. | |
-l --long Convert every short <? into long <?php open tags. | |
-s --short Shorten small <?php sections into <?= or <? short tags. | |
-a --all Shorten \x1b[4mall\x1b[0m long tags into short tags. | |
-u --unclosed Strip closing ?> php token. (Albeit --whitespace fixing should be preferred.) | |
-c --close Add trailing ?> close token. | |
-w --white Strip whitespaces after ?> close tags, or UTF-8 BOM before opening <?php tag. | |
-W --warn Just warn about whitespace issues. | |
-t --token Use tokenizer for --short and --long conversion. (More diligent than --regex mode, but doesn't preserve indentation as well.) | |
-h --help This very helpful help text. | |
-v --verbose Extra output, use -h -v for all options.\n\n | |
HELP; | |
$action->verbose and print <<<VERBOSE | |
More options: | |
-D --debug Development notices. | |
-d --dry Dry run, don't update files. | |
-n --new Create file.php.new for updated files. | |
-b --backup Create file.php~ backup files on edits. | |
-54 --php54 Keep <?= always for --long conversion. | |
-7 --php7 Probe for ASP style <% and super long PHP <script> tags. Use in --warn mode, no rewriting is attempted (too rare). | |
-r --recursive Traverse subdirectories. | |
-c --color Colorize messages. | |
-q --quiet Supress notices. | |
-V --version Print phptags version.\n\n | |
VERBOSE; | |
} | |
// no command-line options were given, so print a short help | |
elseif ($action->syntax) { | |
print "\nexample: phptags --short --close --whitespace ./dir/ or *.php\nSee also --help\n\n"; | |
} | |
// were any files specified? | |
elseif (!$action->files) { | |
print "No files or directories specified.\n"; | |
} | |
// do something, do something! | |
elseif ($action->long || $action->short || $action->shortall || $action->close || $action->unclosed || $action->white || $action->warn || $action->dry) { | |
foreach (files($action->files, $action->recursive) as $I=>$fn) { | |
// read in file | |
$src = file_get_contents($fn); | |
$chksum = md5($src); | |
$action->debug and print("$fn: reading [$chksum]\n"); | |
// whitespace warnings | |
if ($action->warn) { | |
preg_test("/\?\>([\s\pZ\\0]+)\z/", $src, "TRAILING whitespace"); | |
# /\\xEF\\xBB\\xBF/ == /\x{FEFF}/u - But the dependency on a valid UTF-8 encoding can damage binaryish files | |
preg_test("/^\\xEF\\xBB\\xBF[\s\pZ]*<\?(php|\W)/i", $src, "UTF-8 BOM before <?php") | |
or preg_test("/^\\xEF\\xBB\\xBF/", $src, "UTF-8 BOM alone"); | |
preg_test("/^[\s\\0]+<\?(php|\W)/i", $src, "Whitespace BEFORE <?php") | |
or preg_test("/^[\pZ\s\\0]+<\?(php|\W)/i", $src, "Unicode whitespace BEFORE <?php"); | |
// Consecutive PHP tags which *currently* do not output whitespace | |
if ($action->verbose and preg_match("/^\<\?/", $src)) | |
preg_test("/\?\>(?!\\n<)\s+<\?/", $src, "Consecutive open+close tags with spacing (template?)") | |
or preg_test("/\?\>\\r?\\n\<\?/", $src, "Consecutive open+close tags with harmless linebreak"); | |
// Warn about ambigious PHP tags like `<?print(123)` | |
preg_test("/[\s\pZ]+\<\?((?!php|=|xml[-\s]|\w+:\w+)\w++)/i", $src, "Ambiguous PHP tag or unknown XML PI"); | |
} | |
// remove whitespace | |
if ($action->white) { | |
// spaces after ? > | |
preg_modify("/(\?\>)[\s\pZ\\0]+(\z)/", "$1$2", $src, "Removed trailing whitespace"); | |
// UTF-8 BOM before < ? - but retains spaces until next rule | |
preg_modify("/^\\xEF\\xBB\\xBF([\s\pZ\\0]*<\?)(php|\W)/i", "$1$2", $src, "Removed leading UTF-8 BOM"); | |
// spaces before < ? | |
preg_modify("/^[\s\pZ\\0]+(<\?)(php|\W)/i", "$1$2", $src, "Removed leading whitespace"); | |
} | |
// check if we have at least one opening tag | |
if (preg_match("/\<\?/", $src)) { | |
// (the regex approach is reliable enough for close tag removal/adding | |
// add missing close tags | |
if ($action->close) { | |
preg_modify("/ | |
\<\? # any opening tag | |
(?!.*\?\>).+ # filler, assert no '?>' close tags in between | |
[\s\pZ]* \z # whitespace, and end of file | |
/sx", | |
"$0\?\>", $src, "Added ?> close tag" | |
); | |
} | |
// remove any close tags | |
if ($action->unclosed) { | |
preg_modify("/ | |
\?\> # close tag | |
[\s\pZ]* \z # whitespace before end of file (gets removed too, but not if NUL bytes) | |
/sx", | |
"", $src, "Removed ?> close tag" | |
); | |
} | |
} | |
// probe for fringe <% ASP-style and super long <script> PHP tags, as removed with PHP7 | |
if ($action->php7) { | |
// <% and %> | |
preg_test("/ \<\% | \%\> /x", $src, "ASP-style tags detected"); | |
// just probe for opening PHP <script> tag | |
preg_test("/\<(?:script|server) \s+ language \s*=\s* ([\"\']?) PHP (\\1) \s*\>/ix", $src, "Long <script language=PHP> tags detected"); | |
} | |
// Tokenize source for easier processing. | |
if ($action->token && !$action->regex) { | |
// only usable (for this purpose) if php-cli.ini hasn't short open tags disabled | |
ini_get("short_open_tag") or exit("TokenizerCannotBeUsedAsShortTagsAreStillDisabled");; | |
defined("T_OPEN_TAG_WITH_ECHO") or exit("OhGodNoTokenizerIsNotAvailable");; | |
/* | |
The long T_OPEN_TAG always includes some space "<?php\s", | |
but consecutive/others are split into a separate token. | |
While the short versions are followed by a distinct | |
T_WHITESPACE or maybe completely absent. | |
Therefore multiple alternatives are required for each test. | |
*/ | |
$token = new token_list($src); | |
// add long tags | |
if ($action->long) { | |
$token->modify( | |
array(T_OPEN_TAG, "<?", T_WHITESPACE), | |
array("<?php"), | |
"Convert open tag into long tag, preserve space" | |
); | |
$token->modify( | |
array(T_OPEN_TAG, "<?"), | |
array("<?php "), | |
"Convert open tag into long tag, add space" | |
); | |
if (!$action->php54) { | |
$token->modify( | |
array(T_OPEN_TAG_WITH_ECHO, "<?=", T_WHITESPACE), | |
array("<?php echo"), | |
"Convert short echo (+space) into long tag" | |
); | |
$token->modify( | |
array(T_OPEN_TAG_WITH_ECHO), | |
array("<?php echo "), | |
"Convert short <?= into long <?phpâ£echo tag" | |
); | |
} | |
} | |
// convert echo tags into <?= | |
if ($action->short or $action->shortall) { //@todo we should actually have a set of T_PRINTs here too, equivality w/& regex mode | |
$token->modify( | |
array(T_OPEN_TAG, array("/\<\?php\s/i"), T_ECHO), | |
array("<?=", ""), // Does not preserve whitespace separator | |
"Convert long <?phpâ£echo tag into short <?= tag" | |
); | |
$token->modify( | |
array(T_OPEN_TAG, array("/\<\?php\s/i"), T_WHITESPACE, T_ECHO), | |
array("<?=", "", ""), // Does not preserve extra whitespace | |
"Convert long long <?phpâ£echo tag into short <?= tag" | |
); | |
} | |
// rewrite all remaining long tags | |
if ($action->shortall) { | |
$token->modify( // replaces <?php_ with short tag, regex retains any first not-space whitespace character (the lookbehind asserts the space before, but doesn't capture it for $retainspaces= replacement) | |
array(T_OPEN_TAG, array("/^\<\?php ?((?<=\\x20)|\s|\R)$/i")), | |
array("<?"), | |
"Convert long tag into short tag", | |
$retainspace=0 | |
); | |
} | |
// assemble back | |
$src = $token->merge(); | |
} | |
// Nah, let's just use regex for this. | |
// This could trip over "<?" and "\?\>" occurences within PHP comments or in string context etc. | |
else { | |
// convert short into long tags | |
if ($action->long && !$action->php54) { | |
preg_modify( | |
"/\<\?= ?(\R|\s)?/", | |
"\<\?php echo $1", | |
$src, | |
"Convert <?=⣠into long <?phpâ£echo tag" | |
); | |
} | |
if ($action->long) { | |
preg_modify( | |
array("/\<\?((?!php)\S)/i", "/\<\?(\s)/"), | |
array("\<\?php $1", "\<\?php$1"), | |
$src, | |
"Convert <?⣠into long <?php⣠tag" | |
); | |
} | |
// convert long into short tags | |
if ($action->short || $action->shortall) { | |
preg_modify( //retain leading newline // use trailing linebreak // no linebreak, no output spacing | |
array("/\<\?php(\R\s*)(echo|print)(\s)/", "/\<\?php(\s+)(echo|print)(\R\s*)/", "/\<\?php(\s+)(echo|print)(\s)/"), | |
array("\<\?=$1", "\<\?=$3", "\<\?="), | |
$src, | |
"Long <?phpâ£echo to short <?=" | |
); | |
preg_modify( // look for single-line <?php ... ? > occurences, those should always be shortened | |
"/ (?<!^) # not at the file begin | |
\<\?php # opening \<\?php | |
(\s+.+\?\>) # space, filler, closing \?\> | |
/x", | |
"\<\?$1", | |
$src, | |
"Single line <?php...?> into short tag" | |
); | |
} | |
// even the initial open tag and longer sections | |
if ($action->shortall) { | |
preg_modify( // look for mixed syntax '<? echo', strip any initial spacing type | |
"/\<\?(\s*)(echo|print)\b/", "\<\?=", | |
$src, | |
"Mixed <?â£echo to short <?=" | |
); | |
preg_modify( | |
// keep space character after <?php, do not allow any non-space separate alternative expressions (e.g. <?php( or <?php/ would actually be short tag plus php() function call or an expression const/division). | |
"/\<\?php(\R|\s)/", "\<\?$1", | |
$src, | |
"Any <?php⣠into short tag" | |
); | |
} | |
} | |
// write back if file was changed | |
if ($chksum !== md5($src) && strlen($src)) { | |
if ($action->backup) { | |
rename($fn, "$fn~"); | |
} | |
elseif ($action->new) { | |
$fn .= ".new"; | |
} | |
if ($action->dry) { | |
print("$fn: Unsaved changes (--dry)\n"); | |
} | |
else { | |
file_put_contents($fn, $src) and print("$fn: Changed and saved\n"); | |
} | |
} | |
} | |
} | |
// no recognized option | |
else { | |
echo "No action flag (-w or -c, -l etc) specified. (See --help.)\n"; | |
} | |
#-- utility code -- | |
/** | |
* Checks for option presence in ARGV. | |
* | |
* @param+ string | |
*/ | |
function argv() { | |
$args = func_get_args(); | |
return count(array_intersect($_SERVER["argv"], $args)); | |
} | |
/** | |
* Return all ARGs without leading - hyphen. | |
* | |
*/ | |
function argv_files() { | |
return preg_grep('/^[^-]/', array_slice($_SERVER["argv"], 1)); | |
} | |
/** | |
* Convert list of filespecs (dirs or file.* names) into iterator list. | |
* | |
* @return iterator | |
*/ | |
function files($list, $recursive=1) { | |
foreach ($list as &$fn) { | |
$fn = is_dir($fn) && $recursive /*should actually mask the recdiriterator below*/ | |
? new RegexIterator(new RecursiveIteratorIterator(new RecursiveDirectoryIterator($fn)), '/\.(php[345]?|phtml)$/') | |
: new ArrayIterator(file_exists($fn) ? array($fn) : glob($fn)); | |
} | |
//@bug: https://bugs.php.net/bug.php?id=49104 | |
$l = new AppendIterator(); | |
$l->append($workaround = new ArrayIterator(array(1))); foreach ($list as $i) { if ($i) { $l->append($i); } } unset($workaround[0]); | |
return($l); | |
} | |
#-- regex -- | |
/** | |
* Test with regex, return result and print message if matched. | |
* | |
*/ | |
function preg_test($regex, $src, $message) { | |
return preg_match($regex, $src, $m) and message($message, '"'.nonprint_visualize($m[0]).'"', 1); | |
} | |
/** | |
* Modify source string with regex, print message if anything changed. | |
* | |
*/ | |
function preg_modify($regex, $replace, &$src, $message) { | |
$replace = is_array($replace) ? array_map("stripslashes", $replace): stripslashes($replace); // for parity with the regex, the replacement string contains extraneous backslashes | |
$tmp = preg_replace($regex, $replace, $src, -1, $changed); | |
// assert that the regex succeeded, and didn't return a failure status instead | |
if ($tmp === NULL) { | |
trigger_error("preg_modify regex failure: '" . strtr($regex, "\n", " ") . "'", E_USER_ERROR); | |
} | |
// if succesful | |
elseif (is_string($tmp) && ($src != $tmp)) { | |
$src = $tmp; | |
message($message, $changed); | |
} | |
} | |
/** | |
* Print out current activity. If --verbose enabled. | |
* | |
*/ | |
function message($message, $count, $nq=FALSE) { | |
global $fn, $action; /* oh no, how evil. that's the tipping point that will make this completely unmaintainable.. */ | |
if ($count and ($action->verbose or $nq and !$action->quiet)) { | |
print "$fn: $message ($count)\n"; | |
} | |
return 1; | |
} | |
/** | |
* Colorize control + non-printable characters and/or replace with C-string escapes. | |
* | |
*/ | |
function nonprint_visualize($str) { | |
global $action; | |
$map = $action->color ? array( | |
"\r" => "\x1b[32m\\r\x1b[39m", | |
"\n" => "\x1b[31m\\n\x1b[39m", | |
"\t" => "\x1b[36m\\t\x1b[39m", | |
" " => "\x1b[34m\\x20\x1b[39m", | |
"\\0" => "\x1b[41;1;33m\\0\x1b[0;39m", | |
"\\" => "\x1b[35m\\\\\x1b[39m", | |
"\xEF\xBB\xBF" => "\x1b[4;1;36m\\xEF\\xBB\\xBF\x1b[0;39m", | |
) : array(); | |
return preg_replace("~\\xEF\\xBB\\xBF|[^\w-.,;:#*+´`\'\"!§$%&/()={}?<>|]~e", | |
"isset(\$map['$0']) ? \$map['$0'] : '\\x'.strtoupper(current(unpack('H*','$0')))", $str); | |
} | |
#-- tokenizer -- | |
/** | |
* Simple token list traversal. | |
* | |
*/ | |
class token_list extends ArrayObject { | |
/** | |
* Call tokenizer for initialization. Simplify token stream. | |
*/ | |
function __construct($src) { | |
parent::__construct( token_get_all($src) ); | |
// chop off the line numbers, turn raw string entries into arrays | |
foreach ($this as $i=>$t) { | |
$this[$i] = is_array($t) ? array($t[0], $t[1]) : array(T_CHARACTER, $t); | |
} | |
} | |
/** | |
* Just merge all token parts / raw strings. | |
*/ | |
function merge($src = "") { | |
return join(array_map("end", $this->getArrayCopy() )); | |
} | |
/** | |
* Traverse token list and replace found things. | |
* | |
* @param array Token types or strings to search for (where each entry: int=T_TOKEN, str="literal", array=regex; the list must always alternate with an integer token type to advance the search pointer) | |
* @param array Replacement token string(s) | |
* @param string Activity/success message to print out | |
*/ | |
function modify($from, $to, $message, $retainspace=NULL, $count=0) { | |
// loop over token list ($this==array) | |
foreach ($this as $i=>$t) { | |
if (($t[0] == $from[0]) // compare main token type | |
and $this->compare_list($i, $from) ) // additional tokens/strings | |
{ | |
// add replacement strings | |
$this->overwrite($i, (array)$to, $retainspace); | |
$count++; | |
} | |
} | |
message($message, $count); | |
} | |
/** | |
* Compares a series of T_TYPES, strings, or against a regex (=if param is wrapped as array). | |
*/ | |
function compare_list($i, $tokens) { | |
$this->captured = NULL; | |
foreach ($tokens as $x=>$find) { | |
// get value to compare against according to type (string / T_TOKEN int) | |
$cmp = $this[$i][is_int($find) ? 0 : 1]; | |
// exact match | |
if (is_scalar($find) and ($cmp == $find)) { | |
/* keep comparing */ | |
} | |
elseif (is_array($find) and preg_match(current($find), $cmp, $this->captured)) { | |
/* keep comparing */ | |
} | |
else { | |
return false; // | |
} | |
// move in token stream if next search entry is a T_TYPE | |
$i += isset($tokens[$x+1]) && is_int($tokens[$x+1]); | |
} | |
return true; | |
} | |
/** | |
* Overwrite entries in token stream with list of strings. (Or NULL where to skip.) | |
*/ | |
function overwrite($i, $with, $patch) { | |
// piggyback the last regex-captured space characters back into one of the replacement strings; usually it gets appended to the first $to[0] overwrite string | |
if (is_int($patch) && $this->captured[1]) { | |
$with[$patch] = preg_replace("/ ?$/", $this->captured[1], $with[$patch]); | |
} | |
// overwrite string entries | |
foreach ($with as $new_code) if (is_string($new_code)) { | |
$this[$i++] = array(T_INLINE_HTML, $new_code); | |
} else { $i++; } | |
} | |
} | |
// OH LOOK! A close tag! | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment