Created
July 10, 2011 08:07
-
-
Save thinkhy/1074372 to your computer and use it in GitHub Desktop.
Convert XHTML to OOXML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
################################################################# | |
# filename: xhtml2ooxml.pl | |
# brief: Convert xhtml to ooxml format for Founder WordEditor | |
# creator: thinkhy | |
# date: 11/06/23 | |
# changelog: | |
# | |
################################################################# | |
use Cwd; | |
use Encode; | |
use Win32::Guidgen; | |
use MIME::Base64 qw(encode_base64); | |
use Win32::OLE qw(in with); | |
use strict; | |
use constant MSXMLDOM => "MSXML2.DOMDOCUMENT.4.0"; | |
###########################################################################
#
# Main
#
# Batch driver. For every sub-directory of $rootPath that contains a
# maindoc.xml:
#   1. generate main.xml from maindoc.xml            (GenMainxml)
#   2. convert its XHTML fragment to an OOXML file   (ConvertMainxml)
#   3. store the fragment path in the OOXML node of main.xml
#   4. export word.docx through the ExportToWord COM object
#
###########################################################################
#my $mainFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/pic2/main.xml';
my $mainFile = 'E:/work/浙教题库升级/ImportWork/testdata/library__44/{FF35B8A6-985C-4644-87B1-3FE83D1A50BD}/main.xml';
# $xsltFile and $drawFile are read by ConvertMainxml and ProcessImageNode.
my $xsltFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/xhtml2ooxml.xsl';
my $outputFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/pic2/ooxml/output.xml';
my $drawFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/drawing.xml';
my $intermediaFile = 'main.xml';
my $rootPath = 'E:/formular/backup';
#my $rootPath = 'E:/work/浙教题库升级/ImportWork/testdata/library__44/backup';

# The -d test below uses names relative to the current directory, so the
# chdir has to succeed before the directory is scanned.
chdir $rootPath or die "Cannot chdir to $rootPath: $!";
opendir my $rootDir, $rootPath or die "Cannot open directory $rootPath: $!";
my @pathes = grep { -d and $_ ne "." and $_ ne ".." } readdir $rootDir;
closedir $rootDir;

# Create COM Object, then export word document.
my $exportToWord = Win32::OLE->new("Founder.WordPlugin.ExportToWord.ExportToWord") or die "Couldn't create ExportToWord COM object";

foreach my $dirName (@pathes)
{
    # Just for Debug
    # next if $dirName ne '{0D7EF8D7-E1A0-400F-B1C7-FFD0D40CA302}';
    # next if $dirName ne '100CFXEX35RSA2119460';

    # Enter into work directory; GenMainxml and ConvertMainxml write files
    # relative to the current directory.
    my $workDir = $rootPath."/".$dirName;
    if (!chdir $workDir)
    {
        warn "Cannot chdir to $workDir: $!\n";
        next;
    }

    # Output file with the format of docx.
    my $docxFile = $workDir.'/word.docx';

    # Maindoc file generated by XMLEditor
    my $maindocFile = $workDir.'/maindoc.xml';

    # maindoc.xml must exist.
    next if !-f $maindocFile;

    # Generate main.xml from maindoc.xml
    GenMainxml($maindocFile);

    # main.xml must be generated.
    my $mainFile = $workDir.'/main.xml';
    next if !-f $mainFile;

    # Convert html fragment to ooxml.
    my $ooxmlFile = ConvertMainxml($mainFile);

    # Insert ooxml filepath into /ROOT/Element[@name='题目']/OOXML in main.xml
    my $docMain = LoadDocFromFile($mainFile);
    my $nodeOoxml = $docMain->SelectSingleNode('/ROOT/Element[@name="题目"]/OOXML');
    print "\nmain file: ".$mainFile."\n";
    print "\nXML file: ".$ooxmlFile."\n";
    if ($nodeOoxml)
    {
        $nodeOoxml->{Text} = $ooxmlFile;
    }

    # Serialize by hand instead of $docMain->save so that the XML
    # declaration can be forced to gb2312.
    my $strResult = $docMain->{xml};
    $strResult =~ s/^\s*<\?xml.*?\?>//g;
    $strResult =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;

    # work around the problem for entity: "**N**" placeholders are turned
    # back into numeric character references "&#N;".
    # TODO: Refactor
    $strResult =~ s/\*\*(.*?)\*\*/&#$1;/sg;

    my $out;
    if (!open $out, '>', $mainFile)
    {
        warn "Cannot write $mainFile: $!\n";
        next;
    }
    syswrite($out, $strResult);
    close $out or warn "Cannot close $mainFile: $!\n";

    my $msg = 0;
    my $retVal = $exportToWord->Export($mainFile,
                                       'E:\work\浙教题库升级\Test\docxTemplate\WordTemplate.docx',
                                       $docxFile,
                                       "0",
                                       $msg);
    print "输出文件:$docxFile 返回值: ".$retVal."\n";
}
# ConvertMainxml($mainFile)
#
# Converts the XHTML fragment stored under ROOT/Element[@name="题目"]/XHTML
# of $mainFile into an OOXML fragment:
#   - runs the stylesheet $xsltFile over the fragment, passing elementName,
#     parentID and tqID as XSLT parameters;
#   - replaces the img nodes of the result via ProcessImageNode;
#   - writes the result to ooxml/<GUID>.xml next to $mainFile, after
#     emptying an existing ooxml directory.
#
# Returns the path of the generated file relative to the directory of
# $mainFile. Dies when a required node is missing or a file cannot be
# written. The current directory is restored on normal return.
sub ConvertMainxml
{
    my $mainFile = shift;

    # Load xml document
    my $docMain = LoadDocFromFile($mainFile);

    # Select xhtml node
    my $xhtmlNode = $docMain->SelectSingleNode('ROOT/Element[@name="题目"]/XHTML');
    $xhtmlNode || die 'No xhtml content in ROOT/Element[@name="题目"]/XHTML' . " of $mainFile\n";
    my $xhtmlText = $xhtmlNode->{xml};

    my $elementNode = $docMain->SelectSingleNode('ROOT/Element/@name');
    my $elementName = ($elementNode && $elementNode->{value}) || "";

    my $tqIDNode = $docMain->SelectSingleNode('ROOT/Element/Attributes/Attr[@name="tqID"]/@value');
    my $tqID = $tqIDNode ? $tqIDNode->{value} : "";
    print $tqID;

    # An absent parentID attribute is passed to the stylesheet as "".
    my $parentIDNode = $docMain->SelectSingleNode('ROOT/Element/Attributes/Attr[@name="parentID"]/@value');
    my $parentID = $parentIDNode ? $parentIDNode->{value} : "";

    # Replace redundant character "\r".
    $xhtmlText =~ s/\r(?!\n)//g;

    # Add proper processing instruction
    $xhtmlText =~ s/^\s*<\?xml.*?\?>//g;
    $xhtmlText =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;

    # The fragment goes through a temporary file because TransformWithArgs
    # loads its input from a path.
    my $tmpFile = "__out__.xml";
    open my $tmpFh, '>', $tmpFile or die "Cannot write $tmpFile: $!";
    print {$tmpFh} $xhtmlText;
    close $tmpFh or die "Cannot close $tmpFile: $!";

    # Prepare parameters for XSLT
    my %hash = ("elementName"=>"$elementName",
                "parentID"=>"$parentID",
                "tqID"=>"$tqID",);

    # Transform with parameters.
    print "\n##################################################\n";
    my $result = TransformWithArgs($xsltFile, $tmpFile, \%hash);
    unlink($tmpFile);

    # Replace single '\r' character
    $result =~ s/\r(?!\n)//g;

    # Set encoding to gb2312, otherwise C# program could not read the document.
    $result =~ s/^\s*<\?xml.*?\?>//g;
    $result =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;

    # Convert image nodes. The img nodes are selected before any selection
    # namespace is registered on the document.
    my $docResult = LoadDocFromText($result);
    my $imageNodes = $docResult->selectNodes("//img");
    #print "Len:".$imageNodes->length."\n";

    # Get the node of Customer
    $docResult->setProperty("SelectionNamespaces",
                            "xmlns:xhtml='http://www.w3.org/1999/xhtml' "
                          . "xmlns:w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'");
    my $customNode = $docResult->SelectSingleNode(".//w:customXml");
    $customNode || die "No custom node(.//w:customXml) in the result document";

    # Enter the directory of main.xml: ProcessImageNode tests the img src
    # paths with -e, and the ooxml directory is created relative to it.
    my $oldCwd = getcwd();
    my($dirpath,$filename) = ($mainFile =~ /^((?:.*[:\\\/])?)(.*$)/s);
    if (defined $dirpath && $dirpath ne '')
    {
        chdir($dirpath) or die "Cannot chdir to $dirpath: $!";
    }
    print "\n".$dirpath;

    ProcessImageNode($docResult, $customNode, $imageNodes);

    # mkdir for ooxml fragment
    if (-d 'ooxml')
    {
        # Empty ooxml directory ("." and ".." are not files to delete).
        opendir my $ooxmlDir, 'ooxml' or die "Cannot open ooxml directory: $!";
        my @ooxmlFiles = grep { $_ ne '.' and $_ ne '..' } readdir $ooxmlDir;
        closedir $ooxmlDir;
        foreach my $staleFile (@ooxmlFiles)
        {
            unlink 'ooxml/'.$staleFile;
        }
    }
    else
    {
        mkdir 'ooxml' or die "Failed to create ooxml directory";
    }

    # Generate intermediate file.
    my $guid = Win32::Guidgen::create();
    my $outputFile = 'ooxml/'.$guid.'.xml';

    #$docResult->save($outputFile);
    # work around the problem for entity: serialize by hand, force the
    # gb2312 declaration and turn "**N**" placeholders into "&#N;".
    # TODO: Refactor
    my $strResult = $docResult->{xml};
    $strResult =~ s/^\s*<\?xml.*?\?>//g;
    $strResult =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;
    $strResult =~ s/\*\*(.*?)\*\*/&#$1;/sg;

    open my $out, '>', $outputFile or die "Cannot write $outputFile: $!";
    #syswrite($out, encode("utf8", $strResult));
    #$strResult = encode("utf8",decode("ascii", $strResult));
    syswrite($out, $strResult);
    close $out or die "Cannot close $outputFile: $!";

    chdir $oldCwd;
    return $outputFile;
}
# GenMainxml($maindocFile)
#
# Builds main.xml in the current directory from $maindocFile.
# The generated document has the shape
#
#   ROOT / Element[@name="题目"] / Attributes / Attr (tqID, outputIMG, parentID)
#                               / XHTML  (copy of the subject div)
#                               / OOXML  (empty, filled in by the caller)
#
# tqID is the text between square brackets in the first
# /document/properties/property text node; it is "" when that node is
# missing. Dies when the subject div cannot be found or main.xml cannot be
# rewritten.
sub GenMainxml
{
    my $maindocFile = shift;

    # Load xml document and set namespace.
    my $docMain = LoadDocFromFile($maindocFile);
    $docMain->setProperty("SelectionNamespaces",
                          "xmlns:xhtml='http://www.w3.org/1999/xhtml' "
                        . "xmlns:mml='http://www.w3.org/1998/Math/MathML'");

    # Get question id
    my $nodeTitle = $docMain->SelectSingleNode("/document/properties/property/text()");
    my $questionID = "";
    if ($nodeTitle) {
        $questionID = $nodeTitle->{xml};
        $questionID =~ s/^.*\[(.*)\].*$/$1/ig;
    }

    #print GetQuestionPart($docMain, "题干");
    #print GetQuestionPart($docMain, "答案");
    #print GetQuestionPart($docMain, "解题过程");
    #print GetQuestionPart($docMain, "分析");
    #print GetQuestionPart($docMain, "常见错误");

    # Get question structure part: the div inside the table cell that sits
    # next to the cell labelled 题干.
    #my ($nodeSubject, $subject) = GetQuestionPart($docMain, "题干");
    my $nodeSubject = $docMain->SelectSingleNode("//xhtml:td[following-sibling::xhtml:td='题干' or preceding-sibling::xhtml:td='题干']/xhtml:div");
    $nodeSubject || die "No subject (题干) div found in $maindocFile\n";

    # Create output document
    my $docOutput= Win32::OLE->new(MSXMLDOM) or die "Couldn't create DOM document";
    $docOutput->{async} = "False";
    # $docOutput->{validateOnParse} = "True";
    $docOutput->setProperty("SelectionNamespaces",
                            "xmlns:xhtml='http://www.w3.org/1999/xhtml' "
                          . "xmlns:mml='http://www.w3.org/1998/Math/MathML'");

    # Create PI node, default encoding is utf-8.
    my $pi = $docOutput->createProcessingInstruction("xml", "version='1.0' encoding='utf-8'");
    $docOutput->appendChild($pi);

    # Create root node.
    my $nodeRoot = $docOutput->createElement("ROOT");
    $docOutput->appendChild($nodeRoot);
    $docOutput->{documentElement} = $nodeRoot;

    # Create element node.
    my $nodeElment = $docOutput->createElement("Element");
    $nodeRoot->appendChild($nodeElment);
    $nodeElment->setAttribute("name","题目");

    # Create Attributes node.
    my $nodeAttributes = $docOutput->createElement("Attributes");
    $nodeElment->appendChild($nodeAttributes);

    # Create Attr nodes and set necessary attributes.
    my $nodeAttr = $docOutput->createElement("Attr");
    $nodeAttributes->appendChild($nodeAttr);
    $nodeAttr->setAttribute("name", "tqID");
    $nodeAttr->setAttribute("value",$questionID);

    $nodeAttr = $docOutput->createElement("Attr");
    $nodeAttributes->appendChild($nodeAttr);
    $nodeAttr->setAttribute("name","outputIMG");
    $nodeAttr->setAttribute("value","false");

    $nodeAttr = $docOutput->createElement("Attr");
    $nodeAttributes->appendChild($nodeAttr);
    $nodeAttr->setAttribute("name","parentID");
    $nodeAttr->setAttribute("value","");

    # Create XHTML node.
    my $nodeXhtml = $docOutput->createElement("XHTML");
    $nodeElment->appendChild($nodeXhtml);

    # Append a deep copy of the subject node from maindoc.xml to the XHTML node.
    $nodeSubject = $nodeSubject->cloneNode(1);
    $nodeXhtml->appendChild($nodeSubject);

    # Create OOXML node.
    my $nodeOoxml = $docOutput->createElement("OOXML");
    $nodeElment->appendChild($nodeOoxml);

    # my $childs = $nodeSubject->SelectNodes('//*');;
    # foreach my $node (in $childs) # make sure you include the 'in'
    # {
    #     $node->setAttribute("xmlns", "");
    # }

    # Save xml file.
    $docOutput->save("main.xml");
    print "\n".$maindocFile."\n";

    # Replace text: rewrite the saved file in place.
    open my $fh, '+<', 'main.xml' or die "Cannot open main.xml for update: $!";
    my $content = do { local $/; <$fh> };

    # Namespace xhtml is unnecessary, remove it.
    $content =~ s#\s+xmlns="http://www.w3.org/1999/xhtml"(?=(\s+|>))# #ig;

    seek($fh,0,0);
    truncate($fh, 0);
    print {$fh} $content;
    close $fh or die "Cannot close main.xml: $!";
}
# GetQuestionPart($docMain, $structName)
#
# Looks up the div inside the table cell that sits next to the cell whose
# text equals $structName. The document must already have the "xhtml"
# prefix registered as a selection namespace.
#
# Returns ($node, $xml) where $xml is the node's serialized form followed
# by a newline, or (undef, "") when no such div exists.
#
# NOTE(review): $structName is interpolated into the XPath expression
# inside single quotes, so a name containing a single quote would break
# the query.
sub GetQuestionPart
{
    my ($docMain, $structName) = @_;

    # Select node of question subject
    my $nodeSubject = $docMain->SelectSingleNode("//xhtml:td[following-sibling::xhtml:td='$structName' or preceding-sibling::xhtml:td='$structName']/xhtml:div");
    return (undef, "") if !$nodeSubject;

    my $subject = $nodeSubject->{xml}."\n";
    return ($nodeSubject, $subject);
}
###########################################################################
#
# Process image node in the document generated by XSLT.
#
# ProcessImageNode($doc, $customNode, $imageNodes)
#
#   $doc        - result document of the XSLT transformation
#   $customNode - node that receives one pkg:part child per image
#   $imageNodes - collection of img nodes to replace
#
# Every img node whose src file exists (relative to the current directory)
# is replaced by a w:r run, wrapped in a new w:p unless its parent already
# is a w:p. The run holds a copy of the w:drawing template read from
# $drawFile, with the extent set to the image size, and an xhtml:img node.
# The image bytes are attached base64-encoded as a pkg:part under
# $customNode. img nodes whose src file does not exist are left untouched.
#
###########################################################################
sub ProcessImageNode {
    my($doc, $customNode, $imageNodes) = @_;
    print "ProcessImageNode\n";

    # File extensions whose MIME subtype differs from the extension itself.
    my %subtypeOf = (jpg => 'jpeg', jpe => 'jpeg', tif => 'tiff');

    # Read drawing fragment
    my $docDrawing = LoadDocFromFile($drawFile);
    $docDrawing->setProperty("SelectionNamespaces",
                             "xmlns:w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' xmlns:wp='http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'");
    my $drawingNode = $docDrawing->SelectSingleNode("/root/w:drawing");
    $drawingNode || die "Failed to load drawing node";

    foreach my $imgNode (in $imageNodes) {
        # Get value from img node in XHTML document.
        my $srcText = GetAttrValue($imgNode, "src");
        next if !-e $srcText; # also covers an img node without src

        # Split the file name (without directory) into base name and
        # extension; the extension keeps its leading dot and is "" when
        # the file name has none.
        my ($fileName) = ($srcText =~ /([^:\\\/]*)\z/s);
        my ($basename, $extname) = ($fileName =~ /\A(.*)(\.[^.]*)\z/s)
                                 ? ($1, $2)
                                 : ($fileName, '');

        # Size of the image: CSS style first, then the width and height
        # attributes, 0 when neither gives a number.
        my $styleText = GetAttrValue($imgNode, "style");
        my $width  = _ImageDimension($imgNode, $styleText, "width");
        my $height = _ImageDimension($imgNode, $styleText, "height");

        # Create a new node(w:r); 1 is the node type for an element.
        my $rNode = $doc->createNode(1,
                                     "w:r",
                                     "http://schemas.openxmlformats.org/wordprocessingml/2006/main");

        # Append w:r to w:p
        my $parentNode = $imgNode->parentNode();
        if( $parentNode->{nodeName} eq "w:p" ) # Parent node is w:p
        {
            # Just replace img node with w:r node.
            $parentNode->replaceChild($rNode, $imgNode);
        }
        else
        {
            # Otherwise,create w:p node and replace original image node with the new w:p node.
            my $pNode = $doc->createNode(1,
                                         "w:p",
                                         "http://schemas.openxmlformats.org/wordprocessingml/2006/main");
            $pNode->appendChild($rNode);
            $parentNode->replaceChild($pNode, $imgNode);
        }

        # Each image gets its own deep copy of the drawing template.
        my $NewDrawingNode = $drawingNode->cloneNode(1);

        # Set width(cx) and height(cy) for picture.
        my $extNode = $NewDrawingNode->SelectSingleNode('./wp:inline/wp:extent');
        $extNode || die "Failed to get node wp:inline for picture node.Check drawing.txt.\n";

        # Convert the size to drawing extent units.
        $width = Millimeter2ExtendDist($width);
        $height = Millimeter2ExtendDist($height);
        $extNode->setAttribute("cx", $width);
        $extNode->setAttribute("cy", $height);
        print $extNode->{xml}."\n";

        # This checks (and prints) the cy attribute of the template itself,
        # not of the copy that was just modified.
        my $cyNode = $drawingNode->SelectSingleNode('./wp:inline/wp:extent/@cy');
        $cyNode || die "Failed to get attribute cy for picture node.Check drawing.txt.\n";
        print $cyNode->{xml}."\n";

        # Append drawing node into w:r node
        $rNode->appendChild($NewDrawingNode);

        # Construct xhtml:img node inside w:r
        my $newImgNode = $doc->createNode(1,
                                          "xhtml:img",
                                          "http://www.w3.org/1999/xhtml");
        $rNode->appendChild($newImgNode);
        $newImgNode->setAttribute("xhtml:src", "image\\".$basename.$extname);
        $newImgNode->setAttribute("xhtml:width", $width);
        $newImgNode->setAttribute("xhtml:height",$height);
        $newImgNode->setAttribute("xhtml:srcOOXml", "media/".$basename.$extname);

        # Construct pkg:part node
        my $newPartNode = $doc->createNode(1,
                                           "pkg:part",
                                           "http://schemas.microsoft.com/office/2006/xmlPackage");

        # Append newPartNode to custom node
        $customNode->appendChild($newPartNode);

        # The content type is "image/<subtype>": drop the dot of the
        # extension and map extensions that are not subtype names.
        (my $subtype = lc $extname) =~ s/^\.//;
        $subtype = $subtypeOf{$subtype} if exists $subtypeOf{$subtype};
        $newPartNode->setAttribute("pkg:name", "/word/media/".$basename.$extname);
        $newPartNode->setAttribute("pkg:contentType", "image/".$subtype);
        $newPartNode->setAttribute("pkg:compression", "store");

        # Fill base64 data
        my $newBinData = $doc->createElement("pkg:binaryData");
        $newPartNode->appendChild($newBinData);
        my $imageData = ReadFromFile($srcText);
        my $base64Data = encode_base64($imageData);
        $newBinData->{Text} = $base64Data;
    }
}

# _ImageDimension($imgNode, $styleText, $name)
#
# Returns the leading number of the CSS property $name ("width" or
# "height") in $styleText. When the style does not set the property, the
# leading number of the attribute $name of $imgNode is returned instead,
# and 0 when that is missing as well. Units are not interpreted.
sub _ImageDimension {
    my ($imgNode, $styleText, $name) = @_;
    my $number = qr/(\d+(?:\.\d*)?|\.\d+)/;

    # The lookbehind keeps "max-width" or "line-height" from matching.
    if (defined $styleText && $styleText =~ /(?<![-\w])\Q$name\E\s*:\s*$number/i)
    {
        return $1;
    }

    my $attrText = GetAttrValue($imgNode, $name);
    return $attrText =~ /$number/ ? $1 : 0;
}
# Millimeter2ExtendDist($length)
#
# Converts a length given in millimetres to the integer unit used for the
# cx/cy attributes of wp:extent. Returns the value rounded to the nearest
# integer.
sub Millimeter2ExtendDist
{
    my ($millimeters) = @_;

    # Pixels per millimetre, considering that dpi is 96.
    my $pixelsPerMillimeter = 3.78;

    # Firstly, convert mm to pixel unit.
    my $pixels = $millimeters * $pixelsPerMillimeter;

    # Then, convert pixel to wordprocessDrawing extent unit.
    my $extent = $pixels * 12700 / 1.333;

    # Finally, round the decimal to an integer number.
    return int($extent + 0.5);
}
###########################################################################
#
# Subroutines for MSXML
#
###########################################################################

# LoadDocFromFile($file)
#
# Loads $file synchronously into a new MSXML DOM document, with validation
# on parse switched on, and returns the document object.
# Dies when the COM object cannot be created or the file cannot be loaded;
# the message carries the reason and position reported by the parser.
sub LoadDocFromFile
{
    my ($file) = @_;

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0') or die "Couldn't create DOM document";
    $doc->{async} = "False";
    $doc->{validateOnParse} = "True";

    if (!$doc->Load($file))
    {
        # $! says nothing about an MSXML failure; the parser keeps its own
        # error record.
        my $parseError = $doc->{parseError};
        my $reason = $parseError ? $parseError->{reason} : undef;
        $reason = "unknown error" if !defined $reason || $reason eq "";
        $reason =~ s/\s+\z//;
        my $line = $parseError ? ($parseError->{line} || 0) : 0;
        my $column = $parseError ? ($parseError->{linepos} || 0) : 0;
        die "Failed to read $file: $reason (line $line, column $column)\n";
    }
    return $doc;
}
# LoadDocFromText($text)
#
# Parses the XML string $text into a new MSXML DOM document, with
# validation on parse switched on, and returns the document object.
# Dies when the COM object cannot be created or the text cannot be parsed;
# the message carries the reason and position reported by the parser.
sub LoadDocFromText
{
    my ($text) = @_;

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0') or die "Couldn't create DOM document";
    $doc->{async} = "False";
    $doc->{validateOnParse} = "True";

    if (!$doc->loadXML($text))
    {
        # $! says nothing about an MSXML failure; the parser keeps its own
        # error record.
        my $parseError = $doc->{parseError};
        my $reason = $parseError ? $parseError->{reason} : undef;
        $reason = "unknown error" if !defined $reason || $reason eq "";
        $reason =~ s/\s+\z//;
        my $line = $parseError ? ($parseError->{line} || 0) : 0;
        my $column = $parseError ? ($parseError->{linepos} || 0) : 0;
        die "Failed to load xml from text: $reason (line $line, column $column)\n";
    }
    return $doc;
}
# GetAttrValue($node, $name)
#
# Returns the text of the attribute $name of $node, or "" when the node
# has no attribute collection or no attribute of that name.
sub GetAttrValue {
    my ($node, $name) = @_;

    my $attributes = $node->Attributes;
    return "" unless $attributes;

    my $attrNode = $attributes->getNamedItem($name);
    return $attrNode ? $attrNode->{Text} : "";
}
# ReadFromFile($file)
#
# Returns the complete content of $file as a byte string. Dies when the
# file cannot be opened.
sub ReadFromFile {
    my ($file) = @_;

    # Three-argument open: characters such as ">" or "|" in $file must not
    # be taken as an open mode.
    open my $fh, '<', $file or die "Cannot open $file: $!";

    # Read raw bytes; without binmode the content of an image would be
    # changed by line-ending translation on Windows.
    binmode $fh;

    my $content = do { local $/; <$fh> };
    close $fh;

    # Reading an empty file in slurp mode can yield undef.
    $content = '' if !defined $content;
    return $content;
}
# TransformWithArgs($xsltFile, $xmlFile, $argsHash)
#
# Transforms the document in $xmlFile with the stylesheet in $xsltFile.
# Every key/value pair of the hash reference $argsHash is passed to the
# stylesheet as a parameter.
#
# Returns the output of the transformation. Returns "" after printing a
# warning when the stylesheet cannot be loaded. Dies when a COM object
# cannot be created, the input document cannot be loaded, or the
# transformation fails.
sub TransformWithArgs {
    my ($xsltFile, $xmlFile, $argsHash) = @_;

    my $template= Win32::OLE->new('MSXML2.XSLTemplate.4.0')
        or die "Couldn't create MSXML2.XSLTemplate object.\n";
    my $xsltDoc= Win32::OLE->new('MSXML2.FreeThreadedDOMDocument.4.0')
        or die "Couldn't create MSXML2.FreeThreadedDOMDocument.4.0 object.\n ";
    $xsltDoc->{async} = "false";
    $xsltDoc->{resolveExternals} = "false";

    # The original "load(...) || print ..., return ''" never printed:
    # the return was parsed as an argument of print.
    if (!$xsltDoc->load($xsltFile))
    {
        my $reason = $xsltDoc->{parseError} ? $xsltDoc->{parseError}->{reason} : undef;
        $reason = "unknown error" if !defined $reason || $reason eq "";
        $reason =~ s/\s+\z//;
        warn "Error: failed to load $xsltFile: $reason\n";
        return "";
    }
    print "\n************************************\n";

    $template->{stylesheet} = $xsltDoc;
    my $xsltProc = $template->createProcessor();

    my $xmlDoc = Win32::OLE->new("Msxml2.DOMDocument.4.0")
        or die "Couldn't create Msxml2.DOMDocument.4.0 object.\n";
    # Load synchronously so that the document is complete before it is
    # handed to the processor.
    $xmlDoc->{async} = "false";
    $xmlDoc->load($xmlFile) or die "Failed to load $xmlFile\n";

    # Note: ? The code can not work here: $xsltProc->{input} = $docXhtml;
    $xsltProc->LetProperty('input' , $xmlDoc);

    # Add parameters of XSLT
    foreach my $key (keys %$argsHash) {
        $xsltProc->addParameter($key, $argsHash->{$key});
    }

    $xsltProc->transform() or die "Faied to transform.\n";
    my $result = $xsltProc->{output};
    return $result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment