Skip to content

Instantly share code, notes, and snippets.

@thinkhy
Created July 10, 2011 08:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thinkhy/1074372 to your computer and use it in GitHub Desktop.
Save thinkhy/1074372 to your computer and use it in GitHub Desktop.
Convert XHTML to OOXML
#!/usr/bin/perl -w
#################################################################
# filename: xhtml2ooxml.pl
# brief: Convert xhtml to ooxml format for Founder WordEditor
# creator: thinkhy
# date: 11/06/23
# changelog:
#
#################################################################
use Cwd;
use Encode;
use Win32::Guidgen;
use MIME::Base64 qw(encode_base64);
use Win32::OLE qw(in with);
use strict;
use constant MSXMLDOM => "MSXML2.DOMDOCUMENT.4.0";
###########################################################################
#
# Main
#
###########################################################################
#my $mainFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/pic2/main.xml';
my $mainFile = 'E:/work/浙教题库升级/ImportWork/testdata/library__44/{FF35B8A6-985C-4644-87B1-3FE83D1A50BD}/main.xml';
my $xsltFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/xhtml2ooxml.xsl';
my $outputFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/pic2/ooxml/output.xml';
my $drawFile = 'E:/work/浙教题库升级/ImportWork/XSLT/code/drawing.xml';
my $intermediaFile = 'main.xml';
my $rootPath = 'E:/formular/backup';
#my $rootPath = 'E:/work/浙教题库升级/ImportWork/testdata/library__44/backup';

# Every sub-directory of $rootPath is processed as one work item, so failing
# to enter or list the root directory is fatal.
chdir $rootPath or die "Failed to enter $rootPath: $!";
opendir my $rootDir, $rootPath or die "Failed to open $rootPath: $!";
my @pathes = grep { $_ ne "." and $_ ne ".." and -d "$rootPath/$_" } readdir $rootDir;
closedir $rootDir;

# Create COM Object, then export word document.
my $exportToWord = Win32::OLE->new("Founder.WordPlugin.ExportToWord.ExportToWord") or die "Couldn't create ExportToWord COM object";

foreach my $dir (@pathes)
{
    # Just for Debug
    # next if $dir ne '{0D7EF8D7-E1A0-400F-B1C7-FFD0D40CA302}';
    # next if $dir ne '100CFXEX35RSA2119460';

    # Enter into work directory. GenMainxml writes "main.xml" relative to the
    # current directory, so a directory that cannot be entered is skipped.
    my $workDir = $rootPath."/".$dir;
    if (!chdir $workDir)
    {
        warn "Skipping $workDir: cannot enter directory: $!\n";
        next;
    }
    $mainFile = $workDir.'/main.xml';
    # Output file with the format of docx.
    my $docxFile = $workDir.'/word.docx';
    # Maindoc file generated by XMLEditor
    my $maindocFile = $workDir.'/maindoc.xml';
    # maindoc.xml must exist.
    next if !-f $maindocFile;

    # Generate main.xml from maindoc.xml
    GenMainxml($maindocFile);
    # main.xml must be generated.
    next if !-f $mainFile;

    # Convert html fragment to ooxml.
    my $ooxmlFile = ConvertMainxml($mainFile);

    # Insert ooxml filepath into /ROOT/Element[@name='题目']/OOXML in main.xml
    my $docMain = LoadDocFromFile($mainFile);
    my $nodeOoxml = $docMain->SelectSingleNode('/ROOT/Element[@name="题目"]/OOXML');
    print "\nmain file: ".$mainFile."\n";
    print "\nXML file: ".$ooxmlFile."\n";
    if ($nodeOoxml)
    {
        $nodeOoxml->{Text} = $ooxmlFile;
    }

    # Serialise the document by hand instead of $docMain->save($mainFile):
    # the XML declaration is forced to gb2312 and the "**NNN**" placeholders
    # are turned back into numeric character references.
    # TODO: Refactor (same workaround exists in ConvertMainxml).
    my $strResult = $docMain->{xml};
    $strResult =~ s/^\s*<\?xml.*?\?>//g;
    $strResult =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;
    $strResult =~ s/\*\*(.*?)\*\*/&#$1;/sg;

    my $out;
    if (!open $out, '>', $mainFile)
    {
        warn "Skipping $workDir: cannot write $mainFile: $!\n";
        next;
    }
    defined syswrite($out, $strResult)
        or warn "Failed to write $mainFile: $!\n";
    close $out
        or warn "Failed to close $mainFile: $!\n";

    my $msg = 0;
    my $retVal = $exportToWord->Export($mainFile,
        'E:\work\浙教题库升级\Test\docxTemplate\WordTemplate.docx',
        $docxFile,
        "0",
        $msg);
    print "输出文件:$docxFile 返回值: ".$retVal."\n";
}
# ConvertMainxml($mainFile)
#
# Runs the XHTML fragment stored under ROOT/Element[@name="题目"]/XHTML of
# $mainFile through the XSLT stylesheet ($xsltFile), post-processes the <img>
# nodes of the result and writes the final document to ooxml/<GUID>.xml inside
# the directory that contains $mainFile.
#
# Returns the path of the written file, relative to that directory
# ("ooxml/<GUID>.xml"). Dies when a required node is missing or when a file
# or directory cannot be created. The working directory is restored before
# returning.
sub ConvertMainxml
{
    my $mainFile = shift;

    # Load xml document
    my $docMain = LoadDocFromFile($mainFile);

    # Select xhtml node
    my $xhtmlNode = $docMain->SelectSingleNode('ROOT/Element[@name="题目"]/XHTML');
    $xhtmlNode || die 'No xhtml content in ROOT/Element[@name="题目"]/XHTML of '.$mainFile;
    my $xhtmlText = $xhtmlNode->{xml};

    # Values handed to the stylesheet as parameters; each defaults to "".
    my $elementNode = $docMain->SelectSingleNode('ROOT/Element/@name');
    my $elementName = ($elementNode && $elementNode->{value}) || "";

    my $tqIDNode = $docMain->SelectSingleNode('ROOT/Element/Attributes/Attr[@name="tqID"]/@value');
    my $tqID = $tqIDNode ? $tqIDNode->{value} : "";
    print $tqID;

    # There may be no parentID attribute at all.
    my $parentIDNode = $docMain->SelectSingleNode('ROOT/Element/Attributes/Attr[@name="parentID"]/@value');
    my $parentID = $parentIDNode ? $parentIDNode->{value} : "";

    # Replace redundant character "\r".
    $xhtmlText =~ s/\r(?!\n)//g;
    # Add proper processing instruction
    $xhtmlText =~ s/^\s*<\?xml.*?\?>//g;
    $xhtmlText =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;

    # The transformation reads its input from a file, so the fragment is
    # written to a temporary file in the current directory first.
    my $tmpFile = "__out__.xml";
    open my $tmpOut, '>', $tmpFile or die "Failed to create $tmpFile: $!";
    print {$tmpOut} $xhtmlText;
    close $tmpOut or die "Failed to write $tmpFile: $!";

    # Prepare parameters for XSLT
    my %hash = ("elementName"=>"$elementName",
                "parentID"=>"$parentID",
                "tqID"=>"$tqID",);

    # Transform with parameters.
    print "\n##################################################\n";
    my $result = TransformWithArgs($xsltFile, $tmpFile, \%hash);
    unlink($tmpFile);

    # Replace single '\r' character
    $result =~ s/\r(?!\n)//g;
    # Set encoding to gb2312, otherwise C# program could not read the document.
    $result =~ s/^\s*<\?xml.*?\?>//g;
    $result =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;

    # Convert image nodes
    my $docResult = LoadDocFromText($result);
    my $imageNodes = $docResult->selectNodes("//img");

    # Get the node of Customer
    $docResult->setProperty("SelectionNamespaces",
        "xmlns:xhtml='http://www.w3.org/1999/xhtml'\n"
        ."xmlns:w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'");
    my $customNode = $docResult->SelectSingleNode(".//w:customXml");
    $customNode || die "No custom node(.//w:customXml) in the result document";

    # Enter the directory of main.xml. ProcessImageNode tests the image paths
    # with -e from here, and the ooxml output directory is created here.
    my $oldCwd = getcwd();
    my ($dirpath) = ($mainFile =~ /^((?:.*[:\\\/])?)(.*$)/s);
    if ($dirpath ne "")
    {
        chdir($dirpath) or die "Failed to enter $dirpath: $!";
    }
    print "\n".$dirpath;

    ProcessImageNode($docResult, $customNode, $imageNodes);

    # mkdir for ooxml fragment
    if (-d 'ooxml')
    {
        # Empty ooxml directory (plain files only; "." and ".." are skipped).
        opendir my $ooxmlDir, 'ooxml' or die "Failed to open ooxml directory: $!";
        my @ooxmlFiles = grep { $_ ne "." and $_ ne ".." } readdir $ooxmlDir;
        closedir $ooxmlDir;
        foreach my $staleFile (@ooxmlFiles)
        {
            unlink 'ooxml/'.$staleFile;
        }
    }
    else
    {
        mkdir 'ooxml' or die "Failed to create ooxml directory: $!";
    }

    # Generate intermediate file.
    my $guid = Win32::Guidgen::create();
    my $outputFile = 'ooxml/'.$guid.'.xml';

    # Serialise by hand instead of $docResult->save($outputFile) to work
    # around the entity problem: "**NNN**" placeholders become "&#NNN;".
    # TODO: Refactor
    my $strResult = $docResult->{xml};
    $strResult =~ s/^\s*<\?xml.*?\?>//g;
    $strResult =~ s/^/<\?xml version="1.0" encoding="gb2312"\?>/g;
    $strResult =~ s/\*\*(.*?)\*\*/&#$1;/sg;

    open my $out, '>', $outputFile or die "Failed to create $outputFile: $!";
    defined syswrite($out, $strResult) or die "Failed to write $outputFile: $!";
    close $out or die "Failed to close $outputFile: $!";

    chdir $oldCwd or die "Failed to return to $oldCwd: $!";
    return $outputFile;
}
# GenMainxml($maindocFile)
#
# Builds "main.xml" in the current working directory from $maindocFile.
# The generated document has the shape
#
#   ROOT/Element[@name="题目"]/{Attributes/Attr*, XHTML, OOXML}
#
# where XHTML holds a copy of the subject <div> found in $maindocFile and
# OOXML is left empty. After saving, the default XHTML namespace declaration
# is stripped from the file text.
#
# Dies when $maindocFile cannot be loaded or contains no subject node.
# When the saved main.xml cannot be reopened a warning is emitted and the
# file is left as saved.
sub GenMainxml
{
    my $maindocFile = shift;

    # Load xml document and set namespace.
    my $docMain = LoadDocFromFile($maindocFile);
    $docMain->setProperty("SelectionNamespaces",
        "xmlns:xhtml='http://www.w3.org/1999/xhtml'\n"
        ."xmlns:mml='http://www.w3.org/1998/Math/MathML'");

    # Get question id: the text between square brackets of the first
    # property value.
    my $nodeTitle = $docMain->SelectSingleNode("/document/properties/property/text()");
    my $questionID = "";
    if ($nodeTitle) {
        $questionID = $nodeTitle->{xml};
        $questionID =~ s/^.*\[(.*)\].*$/$1/ig;
    }

    # Get question structure part: the <div> in the table cell that sits next
    # to the cell labelled '题干'.
    my $nodeSubject = $docMain->SelectSingleNode("//xhtml:td[following-sibling::xhtml:td='题干' or preceding-sibling::xhtml:td='题干']/xhtml:div");
    # Without this guard a missing node only fails later, at cloneNode, with
    # an error that does not name the file.
    $nodeSubject || die "No subject node (xhtml:td/xhtml:div) found in $maindocFile";

    # Create output document
    my $docOutput= Win32::OLE->new(MSXMLDOM) or die "Couldn't create DOM document";
    $docOutput->{async} = "False";
    $docOutput->setProperty("SelectionNamespaces",
        "xmlns:xhtml='http://www.w3.org/1999/xhtml'\n"
        ."xmlns:mml='http://www.w3.org/1998/Math/MathML'");

    # Create PI node, default encoding is utf-8.
    my $pi = $docOutput->createProcessingInstruction("xml", "version='1.0' encoding='utf-8'");
    $docOutput->appendChild($pi);

    # Create root node.
    my $nodeRoot = $docOutput->createElement("ROOT");
    $docOutput->appendChild($nodeRoot);
    $docOutput->{documentElement} = $nodeRoot;

    # Create element node.
    my $nodeElment = $docOutput->createElement("Element");
    $nodeRoot->appendChild($nodeElment);
    $nodeElment->setAttribute("name","题目");

    # Create Attributes node.
    my $nodeAttributes = $docOutput->createElement("Attributes");
    $nodeElment->appendChild($nodeAttributes);

    # Create Attr nodes and set necessary attributes.
    my $nodeAttr = $docOutput->createElement("Attr");
    $nodeAttributes->appendChild($nodeAttr);
    $nodeAttr->setAttribute("name", "tqID");
    $nodeAttr->setAttribute("value",$questionID);

    $nodeAttr = $docOutput->createElement("Attr");
    $nodeAttributes->appendChild($nodeAttr);
    $nodeAttr->setAttribute("name","outputIMG");
    $nodeAttr->setAttribute("value","false");

    $nodeAttr = $docOutput->createElement("Attr");
    $nodeAttributes->appendChild($nodeAttr);
    $nodeAttr->setAttribute("name","parentID");
    $nodeAttr->setAttribute("value","");

    # Create XHTML node.
    my $nodeXhtml = $docOutput->createElement("XHTML");
    $nodeElment->appendChild($nodeXhtml);

    # Append a deep copy of the subject node from maindoc.xml to the XHTML node.
    $nodeSubject = $nodeSubject->cloneNode(1);
    $nodeXhtml->appendChild($nodeSubject);

    # Create OOXML node.
    my $nodeOoxml = $docOutput->createElement("OOXML");
    $nodeElment->appendChild($nodeOoxml);

    # Save xml file.
    $docOutput->save("main.xml");
    print "\n".$maindocFile."\n";

    # Rewrite the saved file in place.
    my $fh;
    if (!open $fh, '+<', 'main.xml')
    {
        warn "Failed to reopen main.xml generated from $maindocFile: $!\n";
        return;
    }
    my $content = do { local $/; <$fh> };
    $content = "" if !defined $content;
    # Namespace xhtml is unnecessary, remove it.
    $content =~ s#\s+xmlns="http://www.w3.org/1999/xhtml"(?=(\s+|>))# #ig;
    seek($fh,0,0);
    truncate($fh, 0);
    print {$fh} $content;
    close $fh or warn "Failed to write main.xml: $!\n";
    return;
}
# GetQuestionPart($docMain, $structName)
#
# Looks up the <div> inside the table cell that has a sibling cell whose text
# equals $structName. The xhtml: prefix must already be registered on
# $docMain through the SelectionNamespaces property.
#
# Returns the list (node, serialised-xml . "\n"). When no such node exists
# the list (undef, "") is returned, so callers can test the node for truth.
#
# NOTE(review): $structName is interpolated into the XPath expression
# unescaped; a name containing a single quote would break the query.
sub GetQuestionPart
{
    my ($docMain, $structName) = @_;

    # Select node of question subject
    my $nodeSubject = $docMain->SelectSingleNode("//xhtml:td[following-sibling::xhtml:td='$structName' or preceding-sibling::xhtml:td='$structName']/xhtml:div");

    # Dereferencing an undefined result would autovivify a plain hash and
    # hand it back as if it were a DOM node.
    return (undef, "") if !$nodeSubject;

    my $subject = $nodeSubject->{xml}."\n";
    return ($nodeSubject, $subject);
}
###########################################################################
#
# Process image node in the document generated by XSLT.
#
###########################################################################
# ProcessImageNode($doc, $customNode, $imageNodes)
#
# For every <img> node in $imageNodes whose src file exists (tested with -e
# relative to the current directory):
#   * replaces the <img> with a w:r run (wrapped in a new w:p unless the
#     parent already is a w:p),
#   * puts a copy of the w:drawing template from $drawFile into the run, with
#     the extent set from the image size,
#   * adds an xhtml:img element describing the picture to the run,
#   * appends a pkg:part element with the base64 image data to $customNode.
#
# Images whose src file does not exist are left untouched. Dies when the
# drawing template cannot be loaded or lacks the expected nodes.
sub ProcessImageNode {
    my($doc, $customNode, $imageNodes) = @_;
    print "ProcessImageNode\n";

    my $wordNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    # Read drawing fragment
    my $docDrawing = LoadDocFromFile($drawFile);
    $docDrawing->setProperty("SelectionNamespaces",
        "xmlns:w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' xmlns:wp='http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'");
    my $drawingNode = $docDrawing->SelectSingleNode("/root/w:drawing");
    $drawingNode || die "Failed to load drawing node";

    foreach my $imgNode (in $imageNodes) {
        # Get value from img node in XHTML document.
        my $srcText = GetAttrValue($imgNode, "src");
        next if !-e $srcText; # no src attribute, or the file is missing

        # Split the path into base name and extension (extension keeps its dot).
        my(undef,$basename,$extname) = ($srcText =~ /^((?:.*[:\\\/])?)(.*)(\.[^.]*$)/s);
        if (!defined $basename)
        {
            # No extension at all: keep the whole file name as base name.
            (undef,$basename) = ($srcText =~ /^((?:.*[:\\\/])?)(.*)$/s);
            $extname = "";
        }

        # Get height and width of image. Preference is given to the CSS style;
        # the look-behind keeps "line-height:", "max-width:" and similar
        # properties from being mistaken for the image size.
        my ($width, $height);
        my $styleText = GetAttrValue($imgNode, "style");
        if ($styleText ne "")
        {
            ($width)  = ($styleText =~ /(?<![\w-])width\s*:\s*([\d\.]+)/i);
            ($height) = ($styleText =~ /(?<![\w-])height\s*:\s*([\d\.]+)/i);
        }
        # Otherwise get the value from the attributes width and height.
        $width  = GetAttrValue($imgNode, "width")  if !defined $width;
        $height = GetAttrValue($imgNode, "height") if !defined $height;
        $width  = 0 if !defined $width  || $width  eq "";
        $height = 0 if !defined $height || $height eq "";

        # Create a new node(w:r)
        my $rNode = $doc->createNode(1, "w:r", $wordNamespace);

        # Append w:r to w:p
        my $parentNode = $imgNode->parentNode();
        if( $parentNode->{nodeName} eq "w:p" ) # Parent node is w:p
        {
            # Just replace img node with w:r node.
            $parentNode->replaceChild($rNode, $imgNode);
        }
        else
        {
            # Otherwise, create w:p node and replace original image node with
            # the new w:p node.
            my $pNode = $doc->createNode(1, "w:p", $wordNamespace);
            $pNode->appendChild($rNode);
            $parentNode->replaceChild($pNode, $imgNode);
        }

        # Work on a deep copy so the template stays reusable for the next image.
        my $NewDrawingNode = $drawingNode->cloneNode(1);

        # Set width(cx) and height(cy) for picture.
        my $extNode = $NewDrawingNode->SelectSingleNode('./wp:inline/wp:extent');
        $extNode || die "Failed to get node wp:inline for picture node.Check drawing.txt.\n";

        # Convert the sizes with Millimeter2ExtendDist.
        $width = Millimeter2ExtendDist($width);
        $height = Millimeter2ExtendDist($height);
        $extNode->setAttribute("cx", $width);
        $extNode->setAttribute("cy", $height);
        print $extNode->{xml}."\n";

        # NOTE(review): this reads cy from the template node, not from the
        # copy that was just modified - confirm whether that is intended.
        my $cyNode = $drawingNode->SelectSingleNode('./wp:inline/wp:extent/@cy');
        $cyNode || die "Failed to get attribute cy for picture node.Check drawing.txt.\n";
        print $cyNode->{xml}."\n";

        # Append drawing node into w:r node
        $rNode->appendChild($NewDrawingNode);

        # Construct xhtml:img node inside w:r. Width and height carry the
        # converted values, as in the drawing extent.
        my $newImgNode = $doc->createNode(1,
            "xhtml:img",
            "http://www.w3.org/1999/xhtml");
        $rNode->appendChild($newImgNode);
        $newImgNode->setAttribute("xhtml:src", "image\\".$basename.$extname);
        $newImgNode->setAttribute("xhtml:width", $width);
        $newImgNode->setAttribute("xhtml:height",$height);
        $newImgNode->setAttribute("xhtml:srcOOXml", "media/".$basename.$extname);

        # Construct pkg:part node
        my $newPartNode = $doc->createNode(1,
            "pkg:part",
            "http://schemas.microsoft.com/office/2006/xmlPackage");

        # Append newPartNode to custom node
        $customNode->appendChild($newPartNode);
        $newPartNode->setAttribute("pkg:name", "/word/media/".$basename.$extname);
        # NOTE(review): $extname still contains its leading dot, so this
        # yields e.g. "image/.png" - verify against what the consumer expects.
        $newPartNode->setAttribute("pkg:contentType", "image/".$extname);
        $newPartNode->setAttribute("pkg:compression", "store");

        # Fill base64 data
        my $newBinData = $doc->createElement("pkg:binaryData");
        $newPartNode->appendChild($newBinData);
        my $imageData = ReadFromFile($srcText);
        my $base64Data = encode_base64($imageData);
        $newBinData->{Text} = $base64Data;
    }
}
# Millimeter2ExtendDist($length)
#
# Scales a length given in millimetres to the integer unit written into the
# cx/cy attributes of wp:extent. The value is first multiplied by 3.78
# (pixels per millimetre, assuming 96 dpi), then by 12700 / 1.333, and
# finally rounded by adding 0.5 and truncating.
#
# Returns the rounded integer.
sub Millimeter2ExtendDist
{
    my ($millimeters) = @_;

    # Pixels per millimetre, considering that dpi is 96.
    my $pixelsPerMillimeter = 3.78;

    # Step 1: millimetres to pixels.
    my $pixels = $millimeters * $pixelsPerMillimeter;

    # Step 2: pixels to the wordprocessingDrawing extent unit.
    my $extent = $pixels * 12700 / 1.333;

    # Step 3: round to an integer.
    return int($extent + 0.5);
}
###########################################################################
#
# Subroutines for MSXML
#
###########################################################################
# LoadDocFromFile($file)
#
# Creates an MSXML 4.0 DOM document, switches it to synchronous loading with
# validation on parse, and loads $file into it.
#
# Returns the loaded document object. Dies when the COM object cannot be
# created or when loading fails; the message then carries the reason
# reported by the document's parseError object.
sub LoadDocFromFile
{
    my ($file) = @_;

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0') or die "Couldn't create DOM document";
    $doc->{async} = "False";
    $doc->{validateOnParse} = "True";

    if (!$doc->Load($file))
    {
        # $! says nothing about an MSXML failure; the parser keeps its own
        # description of what went wrong.
        my $parseError = $doc->{parseError};
        my $reason = $parseError ? $parseError->{reason} : undef;
        $reason = "unknown error" if !defined $reason || $reason eq "";
        $reason =~ s/\s+\z//;
        die "Failed to read $file: $reason";
    }
    return $doc;
}
# LoadDocFromText($text)
#
# Creates an MSXML 4.0 DOM document, switches it to synchronous loading with
# validation on parse, and parses the XML string $text into it.
#
# Returns the loaded document object. Dies when the COM object cannot be
# created or when parsing fails; the message then carries the reason
# reported by the document's parseError object.
sub LoadDocFromText
{
    my ($text) = @_;

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0') or die "Couldn't create DOM document";
    $doc->{async} = "False";
    $doc->{validateOnParse} = "True";

    if (!$doc->loadXML($text))
    {
        # $! says nothing about an MSXML failure; the parser keeps its own
        # description of what went wrong.
        my $parseError = $doc->{parseError};
        my $reason = $parseError ? $parseError->{reason} : undef;
        $reason = "unknown error" if !defined $reason || $reason eq "";
        $reason =~ s/\s+\z//;
        die "Failed to load xml from text: $reason\n";
    }
    return $doc;
}
# GetAttrValue($node, $name)
#
# Returns the text of the attribute $name of $node. The empty string is
# returned when the node has no attribute collection or no attribute of
# that name.
sub GetAttrValue {
    my ($node, $name) = @_;

    # Nodes without an attribute collection have nothing to look up.
    my $attributes = $node->Attributes;
    return "" unless $attributes;

    my $attribute = $attributes->getNamedItem($name);
    return $attribute ? $attribute->{Text} : "";
}
# ReadFromFile($file)
#
# Returns the complete content of $file as one string of raw bytes.
# Dies when the file cannot be opened.
#
# The file is read in binary mode: the caller in this script passes image
# data on to encode_base64, and text-mode reading on Windows would translate
# CRLF pairs and stop at a Ctrl-Z byte.
sub ReadFromFile {
    my ($file) = @_;

    # Three-argument open, so characters such as '>' or '|' in the file name
    # are never interpreted as an open mode.
    open my $fh, '<', $file or die "Failed to open $file: $!";
    binmode $fh;

    my $content = do { local $/; <$fh> };
    close $fh;

    $content = '' if !defined $content;
    return $content;
}
# TransformWithArgs($xsltFile, $xmlFile, $argsHash)
#
# Applies the stylesheet $xsltFile to the document $xmlFile with MSXML 4.0.
# Every key/value pair of the hash reference $argsHash is passed to the
# stylesheet as a parameter.
#
# Returns the processor output. When the stylesheet cannot be loaded an
# error is printed and the empty string is returned. Dies when a COM object
# cannot be created, when $xmlFile cannot be loaded, or when the
# transformation fails.
sub TransformWithArgs {
    my ($xsltFile, $xmlFile, $argsHash) = @_;

    my $template = Win32::OLE->new('MSXML2.XSLTemplate.4.0')
        or die "Couldn't create MSXML2.XSLTemplate object.\n";
    my $xsltDoc = Win32::OLE->new('MSXML2.FreeThreadedDOMDocument.4.0')
        or die "Couldn't create MSXML2.FreeThreadedDOMDocument.4.0 object.\n ";
    $xsltDoc->{async} = "false";
    $xsltDoc->{resolveExternals} = "false";

    # Written as a block: in `load(...) || print "...", return ""` the return
    # runs while print's arguments are evaluated, so nothing gets printed.
    if (!$xsltDoc->load($xsltFile))
    {
        my $parseError = $xsltDoc->{parseError};
        my $reason = $parseError ? $parseError->{reason} : undef;
        $reason = "unknown error" if !defined $reason || $reason eq "";
        $reason =~ s/\s+\z//;
        print "Error: failed to load $xsltFile: $reason\n";
        return "";
    }
    print "\n************************************\n";

    $template->{stylesheet} = $xsltDoc;
    my $xsltProc = $template->createProcessor();

    my $xmlDoc = Win32::OLE->new("Msxml2.DOMDocument.4.0")
        or die "Couldn't create Msxml2.DOMDocument.4.0 object.\n";
    # Load synchronously so the document is complete before it is transformed.
    $xmlDoc->{async} = "false";
    $xmlDoc->load($xmlFile) or die "Failed to load $xmlFile\n";

    # Note: plain assignment ($xsltProc->{input} = ...) does not work here,
    # so the property is set through LetProperty.
    $xsltProc->LetProperty('input' , $xmlDoc);

    # Add parameters of XSLT
    foreach my $key (keys %$argsHash) {
        $xsltProc->addParameter($key, $argsHash->{$key});
    }

    $xsltProc->transform() or die "Failed to transform.\n";
    return $xsltProc->{output};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment