Created
January 11, 2016 10:44
-
-
Save ngugijames/9e9a5842f2d57ec944b1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Special thanks to http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php | |
*/ | |
<?php | |
/**
 * Tell whether the data currently being read is on-screen text.
 *
 * $s holds the control words recorded for the current group. When any of the
 * marker words below is set to a non-empty value, the data belongs to
 * something other than plain text (for example a font table or a color
 * palette) and the function answers false.
 *
 * @param mixed $s Control words of the current group, keyed by control word.
 * @return bool True when none of the markers is set, false otherwise.
 */
function rtf_isPlainText($s) {
    foreach (array("*", "fonttbl", "colortbl", "datastore", "themedata") as $marker) {
        // empty() also covers a missing key, so unset markers count as "not set".
        if (!empty($s[$marker])) {
            return false;
        }
    }
    return true;
}
/**
 * Extract the readable text from an RTF file.
 *
 * The file is scanned byte by byte. A stack keeps one array of control words
 * per open group ("{ ... }"); every new group starts as a copy of its parent.
 * Text is appended to the result only while rtf_isPlainText() accepts the
 * state of the innermost group, so font tables, color tables and similar
 * destinations are left out.
 *
 * @param string $filename Path of the RTF file to read.
 * @return string The extracted text, or "" when the file is empty or could not be read.
 */
function rtf2text($filename) {
    // Read the data from the input file.
    $text = file_get_contents($filename);
    if ($text === false || $text === "")
        return "";
    $len = strlen($text);

    $document = "";
    // $stack[$j] is the control-word state of the innermost open group.
    // Slot 0 is a base state, so $stack[$j] always exists, even before the
    // first "{" or after an unbalanced "}".
    $stack = array(array());
    $j = 0;

    // Read the data character by character.
    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];
        // Depending on the current character select the further actions.
        switch ($c) {
            // The most important key character: the backslash.
            case "\\":
                // A backslash at the very end of the data has nothing to introduce.
                if ($i + 1 >= $len)
                    break;
                $nc = $text[$i + 1];
                // Index of the last character consumed by this escape; the loop
                // increment then moves on to the character after it.
                $last = $i + 1;

                if ($nc === '\\' || $nc === '{' || $nc === '}') {
                    // Escaped backslash or brace: a literal character.
                    if (rtf_isPlainText($stack[$j])) $document .= $nc;
                } elseif ($nc === '~') {
                    // Nonbreaking space.
                    if (rtf_isPlainText($stack[$j])) $document .= ' ';
                } elseif ($nc === '_') {
                    // Nonbreaking hyphen.
                    if (rtf_isPlainText($stack[$j])) $document .= '-';
                } elseif ($nc === '*') {
                    // Asterisk mark: remember it in the state of the current group.
                    $stack[$j]["*"] = true;
                } elseif ($nc === "'") {
                    // \'hh: the next two characters are the hexadecimal notation
                    // of a character to add to the output stream.
                    $hex = substr($text, $i + 2, 2);
                    // A truncated escape at the end of the data is dropped.
                    if (strlen($hex) == 2 && rtf_isPlainText($stack[$j]))
                        $document .= html_entity_decode("&#".hexdec($hex).";");
                    $last = $i + 3;
                } elseif (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                    // A letter starts a control word, possibly followed by a
                    // numeric parameter.
                    $word = "";
                    $param = null;
                    for ($k = $i + 1; $k < $len; $k++) {
                        $ch = $text[$k];
                        if (($ch >= 'a' && $ch <= 'z') || ($ch >= 'A' && $ch <= 'Z')) {
                            // Letters belong to the word only until the parameter
                            // starts. $param is compared with null (not empty())
                            // so that a parameter of "0" also ends the word.
                            if ($param !== null)
                                break;
                            $word .= $ch;
                        } elseif ($ch >= '0' && $ch <= '9') {
                            $param .= $ch;
                        } elseif ($ch === '-' && $param === null
                                && $k + 1 < $len && $text[$k + 1] >= '0' && $text[$k + 1] <= '9') {
                            // A minus sign is part of the parameter only when it
                            // directly precedes the first digit.
                            $param = $ch;
                        } else
                            break;
                    }
                    // $k now points at the first character after the control word.
                    // A single space there only delimits the word and is not text.
                    if ($k < $len && $text[$k] === ' ')
                        $k++;

                    // Start analyzing what we have read.
                    $toText = "";
                    switch (strtolower($word)) {
                        // \uN: N is the decimal notation of a Unicode character.
                        case "u":
                            $code = (int) $param;
                            // Values above 32767 are written as negative 16-bit numbers.
                            if ($code < 0)
                                $code += 65536;
                            $toText .= html_entity_decode("&#x".dechex($code).";");
                            // \ucN of the current group tells how many fallback
                            // characters follow \uN and have to be skipped.
                            $ucDelta = isset($stack[$j]["uc"]) ? (int) $stack[$j]["uc"] : 0;
                            for ($skip = $ucDelta; $skip > 0 && $k < $len; $skip--) {
                                // Never skip across a group boundary.
                                if ($text[$k] === '{' || $text[$k] === '}')
                                    break;
                                // A \'hh escape counts as one fallback character.
                                if ($text[$k] === '\\' && $k + 1 < $len && $text[$k + 1] === "'")
                                    $k += 4;
                                else
                                    $k++;
                            }
                            break;
                        // Line feeds, spaces and tabs.
                        case "par": case "page": case "column": case "line": case "lbr":
                            $toText .= "\n";
                            break;
                        case "emspace": case "enspace": case "qmspace":
                            $toText .= " ";
                            break;
                        case "tab": $toText .= "\t"; break;
                        // Add current date and time instead of corresponding labels.
                        case "chdate": $toText .= date("m.d.Y"); break;
                        case "chdpl": $toText .= date("l, j F Y"); break;
                        case "chdpa": $toText .= date("D, j M Y"); break;
                        case "chtime": $toText .= date("H:i:s"); break;
                        // Typographic characters.
                        case "emdash": $toText .= html_entity_decode("—"); break;
                        case "endash": $toText .= html_entity_decode("–"); break;
                        case "bullet": $toText .= html_entity_decode("•"); break;
                        case "lquote": $toText .= html_entity_decode("‘"); break;
                        case "rquote": $toText .= html_entity_decode("’"); break;
                        case "ldblquote": $toText .= html_entity_decode("«"); break;
                        case "rdblquote": $toText .= html_entity_decode("»"); break;
                        // Record every other control word in the state of the
                        // current group; a word without a parameter is stored as true.
                        default:
                            $stack[$j][strtolower($word)] = ($param === null) ? true : $param;
                            break;
                    }
                    // Add data to the output stream if required.
                    if (rtf_isPlainText($stack[$j]))
                        $document .= $toText;
                    $last = $k - 1;
                }
                // Any other character after the backslash is skipped together with it.
                $i = $last;
                break;
            // An opening brace starts a subgroup that inherits the state of
            // the enclosing group.
            case "{":
                $stack[] = $stack[$j];
                $j++;
                break;
            // A closing brace ends the subgroup and drops its state. The base
            // state in slot 0 is never removed.
            case "}":
                if ($j > 0) {
                    array_pop($stack);
                    $j--;
                }
                break;
            // Skip "trash". The labels must be double-quoted: in single quotes
            // PHP keeps the backslash, and they would never match one character.
            case "\0": case "\r": case "\f": case "\n": break;
            // Add other data to the output stream if required.
            default:
                if (rtf_isPlainText($stack[$j]))
                    $document .= $c;
                break;
        }
    }
    // Return result.
    return $document;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment