jackreichert/document.xml

## document.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Sample WordprocessingML main document part: four paragraphs followed by
     the section properties. docx2html.php reads a file of this kind from
     word/document.xml inside the unpacked archive. -->
<w:document xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
            xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
            xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
            xmlns:w10="urn:schemas-microsoft-com:office:word"
            xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"
            xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
            xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"
            xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
            xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office"
            xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" mc:Ignorable="w14">
    <w:background w:color="FFFFFF"/>
    <w:body>
        <!-- Paragraph 1: no runs, paragraph style "Body A" -->
        <w:p>
            <w:pPr>
                <w:pStyle w:val="Body A"/>
            </w:pPr>
        </w:p>
        <!-- Paragraph 2: paragraph style "Title", a single run with the text "Hello World" (w:lang it-IT) -->
        <w:p>
            <w:pPr>
                <w:pStyle w:val="Title"/>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                </w:rPr>
            </w:pPr>
            <w:r>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="it-IT"/>
                </w:rPr>
                <w:t>Hello World</w:t>
            </w:r>
        </w:p>
        <!-- Paragraph 3: no runs, paragraph style "Default" -->
        <w:p>
            <w:pPr>
                <w:pStyle w:val="Default"/>
                <w:spacing w:line="280" w:lineRule="atLeast"/>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                </w:rPr>
            </w:pPr>
        </w:p>
        <!-- Paragraph 4: paragraph style "Default", seven runs; bold, italic and
             underline are each applied to exactly one run -->
        <w:p>
            <w:pPr>
                <w:pStyle w:val="Default"/>
                <w:spacing w:line="280" w:lineRule="atLeast"/>
            </w:pPr>
            <w:r>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t xml:space="preserve">This is a </w:t>
            </w:r>
            <!-- Bold run: w:b and w:bCs carry an explicit w:val="1" attribute
                 rather than being written as bare empty elements -->
            <w:r>
                <w:rPr>
                    <w:b w:val="1"/>
                    <w:bCs w:val="1"/>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t>very short</w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t xml:space="preserve"> paragraph. It only contains </w:t>
            </w:r>
            <!-- Italic run: w:i and w:iCs with an explicit w:val="1" attribute -->
            <w:r>
                <w:rPr>
                    <w:i w:val="1"/>
                    <w:iCs w:val="1"/>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t>three</w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t xml:space="preserve"> sentences. This is the </w:t>
            </w:r>
            <!-- Underlined run: w:u with w:val="single" -->
            <w:r>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:u w:val="single"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t>third sentence</w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:sz w:val="24"/>
                    <w:szCs w:val="24"/>
                    <w:rtl w:val="0"/>
                    <w:lang w:val="en-US"/>
                </w:rPr>
                <w:t>.</w:t>
            </w:r>
        </w:p>
        <!-- Section properties: header/footer references (relationship ids rId4
             and rId5), page size, page margins and text direction -->
        <w:sectPr>
            <w:headerReference w:type="default" r:id="rId4"/>
            <w:footerReference w:type="default" r:id="rId5"/>
            <w:pgSz w:w="12240" w:h="15840" w:orient="portrait"/>
            <w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="864"/>
            <w:bidi w:val="0"/>
        </w:sectPr>
    </w:body>
</w:document>

## docx2html.php
<?php
    /*
     * docx2html: convert word/document.xml of an unpacked .docx into simple HTML.
     *
     * Input : $targetDir  directory the .docx archive was extracted into; it is
     *                     not assigned here and must be set by the including code.
     * Output: $text       HTML fragment with one <p> or <h1>..<h6> per w:p
     *                     paragraph, <strong>/<em>/<u> around formatted runs and
     *                     a <br> for every run that contains a w:br.
     *         $goodHTML   $text after a round trip through DOMDocument, or ''
     *                     when nothing could be converted.
     */

    // set location of docx text content file
    $xmlFile = $targetDir."/word/document.xml";
    $reader = new XMLReader;

    // set up variables for formatting
    $text = '';
    $formatting['bold'] = 'closed';
    $formatting['italic'] = 'closed';
    $formatting['underline'] = 'closed';
    $formatting['header'] = 0;

    // formatting key => HTML tag, in the order in which tags are opened
    $htmlTags = array('bold' => 'strong', 'italic' => 'em', 'underline' => 'u');
    // formatting key => local name of the matching WordprocessingML run property
    $wordTags = array('bold' => 'b', 'italic' => 'i', 'underline' => 'u');

    // open() reports a failure with its own warning; reading an unopened reader
    // would only add a second, less helpful error, so skip the loop in that case
    if ($reader->open($xmlFile)){
        // loop through docx xml dom
        while ($reader->read()){
            // look for new paragraphs
            if ($reader->nodeType == XMLReader::ELEMENT && $reader->name === 'w:p'){
                // set up new instance of XMLReader for parsing paragraph independently
                $paragraph = new XMLReader;
                $p = $reader->readOuterXML();
                $paragraph->xml($p);

                // search for heading: accepts both "Heading1" and "Heading 1";
                // any other paragraph style is rendered as a plain <p>
                $formatting['header'] = preg_match('/<w:pStyle\b[^>]*\bw:val="Heading\s*([1-6])"/i', $p, $matches)
                    ? (int) $matches[1]
                    : 0;

                // open h-tag or paragraph
                $blockTag = ($formatting['header'] > 0) ? 'h'.$formatting['header'] : 'p';
                $text .= '<'.$blockTag.'>';

                // loop through paragraph dom
                while ($paragraph->read()){
                    // look for runs
                    if ($paragraph->nodeType == XMLReader::ELEMENT && $paragraph->name === 'w:r'){
                        $node = trim($paragraph->readInnerXML());

                        // add <br> tags (matches <w:br/> as well as <w:br .../>)
                        if (preg_match('/<w:br(?=[\s\/>])/', $node)) $text .= '<br>';

                        // work out which formats this run asks for. A property counts as
                        // switched on when its element is present, unless it carries an
                        // explicit "off" value (w:val="0", "false", "off" or "none").
                        // The lookahead keeps w:b from matching w:bCs or w:br.
                        $wanted = array();
                        $changed = false;
                        foreach ($wordTags as $key => $wordTag){
                            $wanted[$key] = false;
                            if (preg_match('/<w:'.$wordTag.'(?=[\s\/>])([^>]*)>/', $node, $element)){
                                $wanted[$key] = !preg_match('/\bw:val="(?:0|false|off|none)"/i', $element[1]);
                            }
                            if ($wanted[$key] !== ($formatting[$key] === 'opened')) $changed = true;
                        }

                        // when the formatting differs from the previous run, close every open
                        // tag (innermost first) and reopen what this run needs. Tags are
                        // therefore always properly nested, the closing tag lands before the
                        // text of the unformatted run, and adjacent runs with identical
                        // formatting still share one tag.
                        if ($changed){
                            foreach (array_reverse($htmlTags, true) as $key => $htmlTag){
                                if ($formatting[$key] === 'opened'){
                                    $text .= '</'.$htmlTag.'>';
                                    $formatting[$key] = 'closed';
                                }
                            }
                            foreach ($htmlTags as $key => $htmlTag){
                                if ($wanted[$key]){
                                    $text .= '<'.$htmlTag.'>';
                                    $formatting[$key] = 'opened';
                                }
                            }
                        }

                        // build text string of doc
                        // NOTE(review): the text is transliterated to ASCII before escaping, so
                        // non-ASCII characters may be approximated or dropped - confirm this is wanted
                        $runNode = $paragraph->expand();
                        $runText = $runNode ? $runNode->textContent : '';
                        $text .= htmlentities(iconv('UTF-8', 'ASCII//TRANSLIT', $runText));
                    }
                }
                $paragraph->close();

                // formatting never spans paragraphs: close whatever the last run left open
                foreach (array_reverse($htmlTags, true) as $key => $htmlTag){
                    if ($formatting[$key] === 'opened'){
                        $text .= '</'.$htmlTag.'>';
                        $formatting[$key] = 'closed';
                    }
                }

                // close h-tag or paragraph
                $text .= '</'.$blockTag.'>';
            }

        }
        $reader->close();
    }

    // fix invalid html
    // loadHTML does not require valid HTML but still warns against it, so parser
    // errors are collected internally instead of being silenced with @
    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8';
    $goodHTML = '';
    // an empty fragment cannot be loaded (loadHTML rejects an empty string)
    if ($text !== ''){
        $previousErrorMode = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($text);
        // only discard buffered errors when buffering was switched on here
        if (!$previousErrorMode) libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        if ($loaded){
            $html = simplexml_import_dom($doc);
            if ($html instanceof SimpleXMLElement) $goodHTML = $html->asXML();
        }
    }
	<?xml version="1.0" encoding="UTF-8"?>
	<!-- Sample WordprocessingML main document part: four paragraphs followed by
	     the section properties. -->
	<w:document xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
	xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
	xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
	xmlns:w10="urn:schemas-microsoft-com:office:word"
	xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"
	xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
	xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"
	xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
	xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office"
	xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" mc:Ignorable="w14">
	<w:background w:color="FFFFFF"/>
	<w:body>
	<!-- Paragraph 1: no runs, paragraph style "Body A" -->
	<w:p>
	<w:pPr>
	<w:pStyle w:val="Body A"/>
	</w:pPr>
	</w:p>
	<!-- Paragraph 2: paragraph style "Title", a single run with the text "Hello World" (w:lang it-IT) -->
	<w:p>
	<w:pPr>
	<w:pStyle w:val="Title"/>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	</w:rPr>
	</w:pPr>
	<w:r>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="it-IT"/>
	</w:rPr>
	<w:t>Hello World</w:t>
	</w:r>
	</w:p>
	<!-- Paragraph 3: no runs, paragraph style "Default" -->
	<w:p>
	<w:pPr>
	<w:pStyle w:val="Default"/>
	<w:spacing w:line="280" w:lineRule="atLeast"/>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	</w:rPr>
	</w:pPr>
	</w:p>
	<!-- Paragraph 4: paragraph style "Default", seven runs; bold, italic and
	     underline are each applied to exactly one run -->
	<w:p>
	<w:pPr>
	<w:pStyle w:val="Default"/>
	<w:spacing w:line="280" w:lineRule="atLeast"/>
	</w:pPr>
	<w:r>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t xml:space="preserve">This is a </w:t>
	</w:r>
	<!-- Bold run: w:b and w:bCs carry an explicit w:val="1" attribute
	     rather than being written as bare empty elements -->
	<w:r>
	<w:rPr>
	<w:b w:val="1"/>
	<w:bCs w:val="1"/>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t>very short</w:t>
	</w:r>
	<w:r>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t xml:space="preserve"> paragraph. It only contains </w:t>
	</w:r>
	<!-- Italic run: w:i and w:iCs with an explicit w:val="1" attribute -->
	<w:r>
	<w:rPr>
	<w:i w:val="1"/>
	<w:iCs w:val="1"/>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t>three</w:t>
	</w:r>
	<w:r>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t xml:space="preserve"> sentences. This is the </w:t>
	</w:r>
	<!-- Underlined run: w:u with w:val="single" -->
	<w:r>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:u w:val="single"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t>third sentence</w:t>
	</w:r>
	<w:r>
	<w:rPr>
	<w:sz w:val="24"/>
	<w:szCs w:val="24"/>
	<w:rtl w:val="0"/>
	<w:lang w:val="en-US"/>
	</w:rPr>
	<w:t>.</w:t>
	</w:r>
	</w:p>
	<!-- Section properties: header/footer references (relationship ids rId4
	     and rId5), page size, page margins and text direction -->
	<w:sectPr>
	<w:headerReference w:type="default" r:id="rId4"/>
	<w:footerReference w:type="default" r:id="rId5"/>
	<w:pgSz w:w="12240" w:h="15840" w:orient="portrait"/>
	<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="864"/>
	<w:bidi w:val="0"/>
	</w:sectPr>
	</w:body>
	</w:document>
	<?php
	/*
	 * docx2html: convert word/document.xml of an unpacked .docx into simple HTML.
	 *
	 * Input : $targetDir  directory the .docx archive was extracted into; it is
	 *                     not assigned here and must be set by the including code.
	 * Output: $text       HTML fragment with one <p> or <h1>..<h6> per w:p
	 *                     paragraph, <strong>/<em>/<u> around formatted runs and
	 *                     a <br> for every run that contains a w:br.
	 *         $goodHTML   $text after a round trip through DOMDocument, or ''
	 *                     when nothing could be converted.
	 */

	// set location of docx text content file
	$xmlFile = $targetDir."/word/document.xml";
	$reader = new XMLReader;

	// set up variables for formatting
	$text = '';
	$formatting['bold'] = 'closed';
	$formatting['italic'] = 'closed';
	$formatting['underline'] = 'closed';
	$formatting['header'] = 0;

	// formatting key => HTML tag, in the order in which tags are opened
	$htmlTags = array('bold' => 'strong', 'italic' => 'em', 'underline' => 'u');
	// formatting key => local name of the matching WordprocessingML run property
	$wordTags = array('bold' => 'b', 'italic' => 'i', 'underline' => 'u');

	// open() reports a failure with its own warning; reading an unopened reader
	// would only add a second, less helpful error, so skip the loop in that case
	if ($reader->open($xmlFile)){
	    // loop through docx xml dom
	    while ($reader->read()){
	        // look for new paragraphs
	        if ($reader->nodeType == XMLReader::ELEMENT && $reader->name === 'w:p'){
	            // set up new instance of XMLReader for parsing paragraph independently
	            $paragraph = new XMLReader;
	            $p = $reader->readOuterXML();
	            $paragraph->xml($p);

	            // search for heading: accepts both "Heading1" and "Heading 1";
	            // any other paragraph style is rendered as a plain <p>
	            $formatting['header'] = preg_match('/<w:pStyle\b[^>]*\bw:val="Heading\s*([1-6])"/i', $p, $matches)
	                ? (int) $matches[1]
	                : 0;

	            // open h-tag or paragraph
	            $blockTag = ($formatting['header'] > 0) ? 'h'.$formatting['header'] : 'p';
	            $text .= '<'.$blockTag.'>';

	            // loop through paragraph dom
	            while ($paragraph->read()){
	                // look for runs
	                if ($paragraph->nodeType == XMLReader::ELEMENT && $paragraph->name === 'w:r'){
	                    $node = trim($paragraph->readInnerXML());

	                    // add <br> tags (matches <w:br/> as well as <w:br .../>)
	                    if (preg_match('/<w:br(?=[\s\/>])/', $node)) $text .= '<br>';

	                    // work out which formats this run asks for. A property counts as
	                    // switched on when its element is present, unless it carries an
	                    // explicit "off" value (w:val="0", "false", "off" or "none").
	                    // The lookahead keeps w:b from matching w:bCs or w:br.
	                    $wanted = array();
	                    $changed = false;
	                    foreach ($wordTags as $key => $wordTag){
	                        $wanted[$key] = false;
	                        if (preg_match('/<w:'.$wordTag.'(?=[\s\/>])([^>]*)>/', $node, $element)){
	                            $wanted[$key] = !preg_match('/\bw:val="(?:0|false|off|none)"/i', $element[1]);
	                        }
	                        if ($wanted[$key] !== ($formatting[$key] === 'opened')) $changed = true;
	                    }

	                    // when the formatting differs from the previous run, close every open
	                    // tag (innermost first) and reopen what this run needs. Tags are
	                    // therefore always properly nested, the closing tag lands before the
	                    // text of the unformatted run, and adjacent runs with identical
	                    // formatting still share one tag.
	                    if ($changed){
	                        foreach (array_reverse($htmlTags, true) as $key => $htmlTag){
	                            if ($formatting[$key] === 'opened'){
	                                $text .= '</'.$htmlTag.'>';
	                                $formatting[$key] = 'closed';
	                            }
	                        }
	                        foreach ($htmlTags as $key => $htmlTag){
	                            if ($wanted[$key]){
	                                $text .= '<'.$htmlTag.'>';
	                                $formatting[$key] = 'opened';
	                            }
	                        }
	                    }

	                    // build text string of doc
	                    // NOTE(review): the text is transliterated to ASCII before escaping, so
	                    // non-ASCII characters may be approximated or dropped - confirm this is wanted
	                    $runNode = $paragraph->expand();
	                    $runText = $runNode ? $runNode->textContent : '';
	                    $text .= htmlentities(iconv('UTF-8', 'ASCII//TRANSLIT', $runText));
	                }
	            }
	            $paragraph->close();

	            // formatting never spans paragraphs: close whatever the last run left open
	            foreach (array_reverse($htmlTags, true) as $key => $htmlTag){
	                if ($formatting[$key] === 'opened'){
	                    $text .= '</'.$htmlTag.'>';
	                    $formatting[$key] = 'closed';
	                }
	            }

	            // close h-tag or paragraph
	            $text .= '</'.$blockTag.'>';
	        }

	    }
	    $reader->close();
	}

	// fix invalid html
	// loadHTML does not require valid HTML but still warns against it, so parser
	// errors are collected internally instead of being silenced with @
	$doc = new DOMDocument();
	$doc->encoding = 'UTF-8';
	$goodHTML = '';
	// an empty fragment cannot be loaded (loadHTML rejects an empty string)
	if ($text !== ''){
	    $previousErrorMode = libxml_use_internal_errors(true);
	    $loaded = $doc->loadHTML($text);
	    // only discard buffered errors when buffering was switched on here
	    if (!$previousErrorMode) libxml_clear_errors();
	    libxml_use_internal_errors($previousErrorMode);

	    if ($loaded){
	        $html = simplexml_import_dom($doc);
	        if ($html instanceof SimpleXMLElement) $goodHTML = $html->asXML();
	    }
	}