Skip to content

Instantly share code, notes, and snippets.

@circleb
Forked from jackreichert/document.xml
Last active August 29, 2015 14:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save circleb/fda10464675ebaa15c4f to your computer and use it in GitHub Desktop.
Save circleb/fda10464675ebaa15c4f to your computer and use it in GitHub Desktop.
// set location of docx text content file
$xmlFile = $targetDir."/word/document.xml";
$reader = new XMLReader;
$reader->open($xmlFile);
// set up variables for formatting
$text = ''; $formatting['bold'] = 'closed'; $formatting['italic'] = 'closed'; $formatting['underline'] = 'closed'; $formatting['header'] = 0;
// loop through docx xml dom
while ($reader->read()){
// look for new paragraphs
if ($reader->nodeType == XMLREADER::ELEMENT && $reader->name === 'w:p'){
// set up new instance of XMLReader for parsing paragraph independantly
$paragraph = new XMLReader;
$p = $reader->readOuterXML();
$paragraph->xml($p);
// search for heading
preg_match('/<w:pStyle w:val="(Heading.*?[1-6])"/',$p,$matches);
switch($matches[1]){
case 'Heading1': $formatting['header'] = 1; break;
case 'Heading2': $formatting['header'] = 2; break;
case 'Heading3': $formatting['header'] = 3; break;
case 'Heading4': $formatting['header'] = 4; break;
case 'Heading5': $formatting['header'] = 5; break;
case 'Heading6': $formatting['header'] = 6; break;
default: $formatting['header'] = 0; break;
}
// open h-tag or paragraph
$text .= ($formatting['header'] > 0) ? '<h'.$formatting['header'].'>' : '<p>';
// loop through paragraph dom
while ($paragraph->read()){
// look for deletions
if ($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:del'){
$text .= '<del>'.
htmlentities(iconv('UTF-8', 'ASCII//TRANSLIT',$paragraph->expand()->textContent)).
'</del>';
// skip so that it doesn't get added to $text twice
$paragraph->next();
}
// look for insertions
if ($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:ins'){
$text .= '<ins>'.
htmlentities(iconv('UTF-8', 'ASCII//TRANSLIT',$paragraph->expand()->textContent)).
'</ins>';
// skip so that it doesn't get added to $text twice
$paragraph->next();
}
// look for elements
if ($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:r'){
$node = trim($paragraph->readInnerXML());
// add <br> tags
if (strstr($node,'<w:br ')) $text .= '<br>';
// look for formatting tags
$formatting['bold'] = (strstr($node,'<w:b/>')) ? (($formatting['bold'] == 'closed') ? 'open' : $formatting['bold']) : (($formatting['bold'] == 'opened') ? 'close' : $formatting['bold']);
$formatting['italic'] = (strstr($node,'<w:i/>')) ? (($formatting['italic'] == 'closed') ? 'open' : $formatting['italic']) : (($formatting['italic'] == 'opened') ? 'close' : $formatting['italic']);
$formatting['underline'] = (strstr($node,'<w:u ')) ? (($formatting['underline'] == 'closed') ? 'open' : $formatting['underline']) : (($formatting['underline'] == 'opened') ? 'close' : $formatting['underline']);
// build text string of doc
$text .= (($formatting['bold'] == 'open') ? '<strong>' : '').
(($formatting['italic'] == 'open') ? '<em>' : '').
(($formatting['underline'] == 'open') ? '<u>' : '').
htmlentities(iconv('UTF-8', 'ASCII//TRANSLIT',$paragraph->expand()->textContent)).
(($formatting['underline'] == 'close') ? '</u>' : '').
(($formatting['italic'] == 'close') ? '</em>' : '').
(($formatting['bold'] == 'close') ? '</strong>' : '');
// reset formatting variables
foreach ($formatting as $key=>$format){
if ($format == 'open') $formatting[$key] = 'opened';
if ($format == 'close') $formatting[$key] = 'closed';
}
}
}
$text .= ($formatting['header'] > 0) ? '</h'.$formatting['header'].'>' : '</p>';
}
}
$reader->close();
// suppress warnings. loadHTML does not require valid HTML but still warns against it...
// fix invalid html
$doc = new DOMDocument();
$doc->encoding = 'UTF-8';
@$doc->loadHTML($text);
$goodHTML = simplexml_import_dom($doc)->asXML();
@circleb
Copy link
Author

circleb commented May 13, 2015

Highlight changes that have been tracked in Word...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment