Skip to content

Instantly share code, notes, and snippets.

@sixtyfive
Last active April 13, 2022 12:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sixtyfive/35c192ce780ad6375ab24545bd6e06ba to your computer and use it in GitHub Desktop.
Save sixtyfive/35c192ce780ad6375ab24545bd6e06ba to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'docx'
require 'nokogiri'
# Prints one HTML fragment per paragraph of a .docx file to standard output.
#
# Every paragraph node is serialised on its own, parsed back into a small
# stand-alone XML document and run through the docxparas2html.xsl stylesheet.
#
# @param finpath [String] path of the .docx file to convert
# @raise [Errno::ENOENT] if the stylesheet cannot be found
def docxparas2html(finpath)
  doc = Docx::Document.open(finpath)

  # A stylesheet in the working directory still wins (the historical lookup);
  # otherwise fall back to the copy sitting next to this script, so the tool
  # also works when it is started from another directory.
  xsl_name = 'docxparas2html.xsl'
  xsl_path = File.exist?(xsl_name) ? xsl_name : File.join(__dir__, xsl_name)
  xsl = Nokogiri::XSLT(File.read(xsl_path))

  doc.paragraphs.map(&:node).each do |node|
    # The serialised paragraph needs its own declaration of the "w" prefix.
    # The URI must be the same one the stylesheet binds to "w", otherwise
    # none of its w:* selectors match.
    node['xmlns:w'] = 'http://schemas.microsoft.com/office/word/2018/wordml'
    doc_from_node = Nokogiri::XML(node.to_xml)

    # The first line of the serialised result is dropped (presumably the XML
    # declaration emitted by the serialiser); the remaining lines are glued
    # together. `drop(1)` is used instead of `[1..]` because the latter
    # returns nil for an empty array and would crash on `join`.
    html_fragment = xsl.transform(doc_from_node).to_xml.split(/\n/).drop(1).join
    puts html_fragment
  end
end
# Convert every command-line argument whose name ends in ".docx" (any letter
# case); all other arguments are ignored. `\z` anchors at the real end of the
# string, whereas `$` would also match before an embedded newline.
ARGV.each { |arg| docxparas2html(arg) if arg.match?(/\.docx\z/i) }
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:w="http://schemas.microsoft.com/office/word/2018/wordml"
    exclude-result-prefixes="w">
  <!--
    Turns every run (w:r) of the input into its text, wrapped in a
    <span style="..."> when the run properties (w:rPr) carry formatting
    this stylesheet understands, and emitted bare otherwise.

    The "w" prefix is bound to the URI the calling script declares on each
    paragraph before transforming it; both sides have to use the same URI.
    exclude-result-prefixes keeps that declaration off the generated spans.
  -->
  <xsl:template match="/">
    <xsl:for-each select="//w:r">
      <!--
        Toggle properties: the element alone means "on"; an explicit w:val
        switches it on only for true / 1 / on, so w:val="false" is honoured.
      -->
      <xsl:variable name="bold" select="boolean(w:rPr/w:b[not(@w:val) or @w:val = 'true' or @w:val = '1' or @w:val = 'on'])"/>
      <xsl:variable name="italic" select="boolean(w:rPr/w:i[not(@w:val) or @w:val = 'true' or @w:val = '1' or @w:val = 'on'])"/>
      <xsl:variable name="struck" select="boolean(w:rPr/w:strike[not(@w:val) or @w:val = 'true' or @w:val = '1' or @w:val = 'on'])"/>
      <!-- w:u names an underline pattern; everything except "none" underlines. -->
      <xsl:variable name="underlined" select="boolean(w:rPr/w:u[not(@w:val) or @w:val != 'none'])"/>

      <xsl:variable name="styles">
        <xsl:if test="w:rPr/w:rFonts/@w:ascii">
          <xsl:text>font-family:'</xsl:text>
          <xsl:value-of select="w:rPr/w:rFonts/@w:ascii"/>
          <xsl:text>';</xsl:text>
        </xsl:if>
        <!-- w:sz is measured in half-points; skipped when missing or not a number. -->
        <xsl:if test="number(w:rPr/w:sz/@w:val) &gt; 0">
          <xsl:text>font-size:</xsl:text>
          <xsl:value-of select="w:rPr/w:sz/@w:val div 2"/>
          <xsl:text>pt;</xsl:text>
        </xsl:if>
        <xsl:if test="$bold">
          <xsl:text>font-weight:bold;</xsl:text>
        </xsl:if>
        <xsl:if test="$italic">
          <xsl:text>font-style:italic;</xsl:text>
        </xsl:if>
        <!-- One combined declaration: two separate ones would override each other. -->
        <xsl:if test="$underlined or $struck">
          <xsl:text>text-decoration:</xsl:text>
          <xsl:if test="$underlined">
            <xsl:text>underline</xsl:text>
          </xsl:if>
          <xsl:if test="$underlined and $struck">
            <xsl:text> </xsl:text>
          </xsl:if>
          <xsl:if test="$struck">
            <xsl:text>line-through</xsl:text>
          </xsl:if>
          <xsl:text>;</xsl:text>
        </xsl:if>
        <!-- Colours are bare RRGGBB hex in the source, so CSS needs the leading '#'; "auto" is not a colour. -->
        <xsl:if test="w:rPr/w:color/@w:val and w:rPr/w:color/@w:val != 'auto'">
          <xsl:text>color:#</xsl:text>
          <xsl:value-of select="w:rPr/w:color/@w:val"/>
          <xsl:text>;</xsl:text>
        </xsl:if>
        <xsl:if test="w:rPr/w:shd/@w:fill and w:rPr/w:shd/@w:fill != 'auto'">
          <xsl:text>background-color:#</xsl:text>
          <xsl:value-of select="w:rPr/w:shd/@w:fill"/>
          <xsl:text>;</xsl:text>
        </xsl:if>
      </xsl:variable>

      <xsl:choose>
        <!-- No recognised formatting: emit the text without a wrapper. -->
        <xsl:when test="normalize-space($styles) = ''">
          <xsl:apply-templates select="w:t"/>
        </xsl:when>
        <xsl:otherwise>
          <span style="{$styles}"><xsl:apply-templates select="w:t"/></span>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:for-each>
  </xsl:template>
</xsl:stylesheet>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" mc:Ignorable="w14 wp14">
<w:body>
<w:p>
<w:pPr>
<w:pStyle w:val="Normal"/>
<w:bidi w:val="0"/>
<w:jc w:val="left"/>
<w:rPr/>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:ascii="Lucida Handwriting" w:hAnsi="Lucida Handwriting"/>
<w:sz w:val="22"/>
<w:szCs w:val="22"/>
</w:rPr>
<w:t>Test text</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve"> with </w:t>
</w:r>
<w:r>
<w:rPr>
<w:i/>
<w:iCs/>
</w:rPr>
<w:t>various</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve"> </w:t>
</w:r>
<w:r>
<w:rPr>
<w:b/>
<w:bCs/>
</w:rPr>
<w:t>styles</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve">, </w:t>
</w:r>
<w:r>
<w:rPr>
<w:b/>
<w:bCs/>
<w:i/>
<w:iCs/>
<w:strike w:val="false"/>
<w:dstrike w:val="false"/>
<w:u w:val="none"/>
</w:rPr>
<w:t>among</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve"> them </w:t>
</w:r>
<w:r>
<w:rPr>
<w:color w:val="C9211E"/>
</w:rPr>
<w:t>color</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve"> </w:t>
</w:r>
<w:r>
<w:rPr>
<w:shd w:fill="FFFF00" w:val="clear"/>
</w:rPr>
<w:t>and</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve"> </w:t>
</w:r>
<w:r>
<w:rPr>
<w:u w:val="single"/>
</w:rPr>
<w:t>underlined</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t xml:space="preserve"> text as well as </w:t>
</w:r>
<w:r>
<w:rPr>
<w:strike/>
</w:rPr>
<w:t>strikethrough</w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t>.</w:t>
</w:r>
</w:p>
<w:sectPr>
<w:type w:val="nextPage"/>
<w:pgSz w:w="12240" w:h="15840"/>
<w:pgMar w:left="1134" w:right="1134" w:gutter="0" w:header="0" w:top="1134" w:footer="0" w:bottom="1134"/>
<w:pgNumType w:fmt="decimal"/>
<w:formProt w:val="false"/>
<w:textDirection w:val="lrTb"/>
</w:sectPr>
</w:body>
</w:document>
source 'https://rubygems.org'

# Opens the .docx file and exposes its paragraphs.
gem 'docx'
# The script requires nokogiri directly (XML parsing and XSLT), so it is
# declared here instead of relying on another gem to pull it in.
gem 'nokogiri'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.