Skip to content

Instantly share code, notes, and snippets.

@jackreichert
Last active May 25, 2022 15:18
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save jackreichert/4052029 to your computer and use it in GitHub Desktop.
Save jackreichert/4052029 to your computer and use it in GitHub Desktop.
Convert Docx XML to HTML
<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" mc:Ignorable="w14">
<w:background w:color="FFFFFF"/>
<w:body>
<w:p>
<w:pPr>
<w:pStyle w:val="Body A"/>
</w:pPr>
</w:p>
<w:p>
<w:pPr>
<w:pStyle w:val="Title"/>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="it-IT"/>
</w:rPr>
<w:t>Hello World</w:t>
</w:r>
</w:p>
<w:p>
<w:pPr>
<w:pStyle w:val="Default"/>
<w:spacing w:line="280" w:lineRule="atLeast"/>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
</w:rPr>
</w:pPr>
</w:p>
<w:p>
<w:pPr>
<w:pStyle w:val="Default"/>
<w:spacing w:line="280" w:lineRule="atLeast"/>
</w:pPr>
<w:r>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t xml:space="preserve">This is a </w:t>
</w:r>
<w:r>
<w:rPr>
<w:b w:val="1"/>
<w:bCs w:val="1"/>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t>very short</w:t>
</w:r>
<w:r>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t xml:space="preserve"> paragraph. It only contains </w:t>
</w:r>
<w:r>
<w:rPr>
<w:i w:val="1"/>
<w:iCs w:val="1"/>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t>three</w:t>
</w:r>
<w:r>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t xml:space="preserve"> sentences. This is the </w:t>
</w:r>
<w:r>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:u w:val="single"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t>third sentence</w:t>
</w:r>
<w:r>
<w:rPr>
<w:sz w:val="24"/>
<w:szCs w:val="24"/>
<w:rtl w:val="0"/>
<w:lang w:val="en-US"/>
</w:rPr>
<w:t>.</w:t>
</w:r>
</w:p>
<w:sectPr>
<w:headerReference w:type="default" r:id="rId4"/>
<w:footerReference w:type="default" r:id="rId5"/>
<w:pgSz w:w="12240" w:h="15840" w:orient="portrait"/>
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="864"/>
<w:bidi w:val="0"/>
</w:sectPr>
</w:body>
</w:document>
<?php
// Converts the main body of an unpacked .docx (word/document.xml) into HTML.
//   reads:  $targetDir - expected to be defined before this script runs; the
//           script only appends "/word/document.xml" to it
//   writes: $text      - HTML built paragraph by paragraph (<p>/<h1>-<h6>, with
//                        <strong>/<em>/<u>/<br> inside)
//           $goodHTML  - $text after a round-trip through DOMDocument, or ''
//                        when the document produced no markup

// set location of docx text content file
$xmlFile = $targetDir."/word/document.xml";
$reader = new XMLReader;
if (!$reader->open($xmlFile)) {
    // Fail with a clear message instead of continuing with an unopened reader.
    throw new RuntimeException('Unable to open docx XML file: '.$xmlFile);
}

// HTML tag emitted for each inline format, listed in opening order
// (they are closed in the reverse order so the markup stays well nested).
$inlineTags = array('bold' => 'strong', 'italic' => 'em', 'underline' => 'u');
// WordprocessingML run property that switches each inline format on.
$runProperties = array('bold' => 'b', 'italic' => 'i', 'underline' => 'u');

// set up variables for formatting: each inline flag is true while its tag is open in $text
$text = '';
$formatting = array('bold' => false, 'italic' => false, 'underline' => false, 'header' => 0);

// Tells whether the run XML switches the property <w:$property> on.
// Accepts the bare form (<w:b/>) as well as the explicit one (<w:b w:val="1"/>);
// an explicit "0", "false", "off" or "none" value means the property is off.
// The pattern requires whitespace or "/>" right after the name, so <w:bCs>,
// <w:br> and similar longer names are not mistaken for <w:b>.
$propertyIsOn = function ($runXml, $property) {
    if (!preg_match('/<w:'.$property.'(\s[^>]*)?\/>/', $runXml, $element)) {
        return false;
    }
    if (isset($element[1]) && preg_match('/\bw:val="([^"]*)"/', $element[1], $value)) {
        return !in_array($value[1], array('0', 'false', 'off', 'none'), true);
    }
    return true;
};

// Escapes text for HTML and turns every non-ASCII character into a numeric
// entity, so $text stays pure ASCII (loadHTML() below is not told a charset)
// without losing characters the way an ASCII transliteration would.
$escapeText = function ($raw) {
    $escaped = htmlspecialchars($raw, ENT_QUOTES, 'UTF-8');
    return preg_replace_callback('/[^\x00-\x7F]/u', function ($char) {
        $codePoint = unpack('N', iconv('UTF-8', 'UCS-4BE', $char[0]));
        return '&#'.$codePoint[1].';';
    }, $escaped);
};

// loop through docx xml dom
while ($reader->read()) {
    // look for new paragraphs
    if ($reader->nodeType == XMLReader::ELEMENT && $reader->name === 'w:p') {
        // set up new instance of XMLReader for parsing paragraph independently
        $paragraph = new XMLReader;
        $p = $reader->readOuterXML();
        $paragraph->xml($p);

        // search for heading: style ids "Heading1".."Heading6" (a space before
        // the digit is tolerated); anything else is a plain paragraph
        $formatting['header'] = preg_match('/<w:pStyle w:val="Heading\s*([1-6])"/', $p, $matches)
            ? (int) $matches[1]
            : 0;

        // open h-tag or paragraph
        $text .= ($formatting['header'] > 0) ? '<h'.$formatting['header'].'>' : '<p>';

        // loop through paragraph dom
        while ($paragraph->read()) {
            // look for runs
            if ($paragraph->nodeType == XMLReader::ELEMENT && $paragraph->name === 'w:r') {
                $node = $paragraph->readInnerXML();

                // work out which inline formats this run asks for
                $wanted = array();
                $changed = false;
                foreach ($runProperties as $key => $property) {
                    $wanted[$key] = $propertyIsOn($node, $property);
                    if ($wanted[$key] !== $formatting[$key]) {
                        $changed = true;
                    }
                }

                // When the formatting differs from the previous run, close
                // everything that is open (innermost first) and reopen what this
                // run needs. Adjacent runs with identical formatting share tags.
                if ($changed) {
                    foreach (array_reverse($inlineTags, true) as $key => $tag) {
                        if ($formatting[$key]) {
                            $text .= '</'.$tag.'>';
                        }
                    }
                    foreach ($inlineTags as $key => $tag) {
                        if ($wanted[$key]) {
                            $text .= '<'.$tag.'>';
                        }
                        $formatting[$key] = $wanted[$key];
                    }
                }

                // add <br> tags (the name must end right after "br")
                if (preg_match('/<w:br[\s\/>]/', $node)) {
                    $text .= '<br>';
                }

                // build text string of doc
                $run = $paragraph->expand();
                if ($run !== false) {
                    $text .= $escapeText($run->textContent);
                }
            }
        }
        $paragraph->close();

        // close any inline tags still open so they never leak past the paragraph
        foreach (array_reverse($inlineTags, true) as $key => $tag) {
            if ($formatting[$key]) {
                $text .= '</'.$tag.'>';
                $formatting[$key] = false;
            }
        }
        $text .= ($formatting['header'] > 0) ? '</h'.$formatting['header'].'>' : '</p>';
    }
}
$reader->close();

// fix invalid html
$doc = new DOMDocument();
// NOTE(review): loadHTML() parses into the document afresh, so this assignment
// probably has no effect on the serialized output - confirm before relying on it.
$doc->encoding = 'UTF-8';
$goodHTML = '';
if ($text !== '') {
    // loadHTML does not require valid HTML but still warns about it; collect
    // those warnings inside libxml instead of silencing everything with "@".
    $previousErrorMode = libxml_use_internal_errors(true);
    $doc->loadHTML($text);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $imported = simplexml_import_dom($doc);
    if ($imported instanceof SimpleXMLElement) {
        $goodHTML = $imported->asXML();
    }
}
@godfreymakori
Copy link

Hey, Jackreichert

How can I make the code consider smallCaps that have line breaks?

Would like them converted to html

@Leniyou
Copy link

Leniyou commented May 29, 2019

Hi Jackreichert,

This is a nice piece of work.

How can I run this on my local server? I am getting this error: "Warning: XMLReader::open(): Unable to open source data".

Thanks,

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment