Skip to content

Instantly share code, notes, and snippets.

@paweljasinski
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paweljasinski/090273f7fd48867d196e to your computer and use it in GitHub Desktop.
Save paweljasinski/090273f7fd48867d196e to your computer and use it in GitHub Desktop.
explode and reformat docx
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Identity stylesheet: reproduces the input document unchanged, node for node.
  The only visible effect is the serialization requested by xsl:output below
  (UTF-8, indented). Presumably this is the file the explode script refers to
  as copy.xslt, which it selects when the -r option is not given.
  Note: the "fo" namespace prefix is declared but not referenced anywhere in
  this stylesheet.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:fo="http://www.w3.org/1999/XSL/Format">
<!-- Serialize the result as UTF-8 XML with indentation switched on. -->
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<!-- Copy everything: attributes, elements, text, comments and processing
     instructions are all matched by this single template. -->
<xsl:template match="@*|node()|text()|comment()|processing-instruction()">
<xsl:copy>
<!-- Recurse into the attributes and children of the copied node so that they
     are processed by the same template. -->
<xsl:apply-templates select="@*|node()|text()|comment()|processing-instruction()"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
#!/bin/sh
# Unpacks a Word (.docx) document into a directory named after it and
# reformats the XML parts it contains.
#
# Usage: <this script> [-r] document
#   -r        use remove-rsid.xslt instead of copy.xslt as the stylesheet
#   document  foo.docx, foo (".docx" is appended) or foo.xml (treated as
#             foo.docx)
#
# You need an XSLT processor (Transform) in your path, e.g.
# /c/Program Files/Saxonica/SaxonHE9.4N/bin/Transform
#
# Make sure to copy remove-rsid.xslt and copy.xslt next to this script; they
# are looked up in the directory the script itself lives in.

# Start from a known value so a variable inherited from the environment cannot
# switch the stylesheet.
remove_rsid=0
if [ "$1" = "-r" ]; then
  remove_rsid=1
  shift
fi

if [ -z "$1" ]; then
  echo expected name of the word document to be exploded >&2
  exit 1
fi

# Decide on the archive name and its suffix. Only the last path component is
# inspected, so a dot in a directory name (./foo, a.b/foo) is not mistaken for
# a file extension.
base=${1##*/}
case $base in
  *.xml)
    # foo.xml stands for foo.docx
    suffix=docx
    name=${1%.xml}.docx
    ;;
  *.*)
    suffix=${base##*.}
    name=$1
    ;;
  *)
    # no extension given: assume .docx
    suffix=docx
    name=$1.docx
    ;;
esac

# Name of the directory the document is exploded into.
corename=$(basename "$name" ".$suffix")
if [ -z "$corename" ]; then
  echo can not work with empty name >&2
  exit 1
fi

# Directory containing this script, and the same path in mixed Windows form
# (cygpath -m) for the Transform command line.
DIR=$(cd "$(dirname "$0")" && pwd) || exit 1
if ! DOSDIR=$(cygpath -m "$DIR"); then
  echo "cygpath failed for $DIR" >&2
  exit 1
fi

# Scratch directories: FLAT receives the flattened input files, FLATOUT
# (always "$FLAT.out") the transformed output. mktemp picks an unpredictable
# name; /tmp is kept as the parent so the path contains no whitespace.
FLAT=$(mktemp -d /tmp/flat.XXXXXX) || exit 1
FLATOUT=$FLAT.out

if [ "$remove_rsid" = "1" ]; then
  transform=$DOSDIR/remove-rsid.xslt
else
  transform=$DOSDIR/copy.xslt
fi
# Pretty-prints one XML file in place using xmllint.
#
# Arguments: $1 - file name
# Outputs:   the file name on stdout
# Returns:   0 on success; 1 if xmllint fails, in which case the file is left
#            untouched and the partial "$1.new" is removed
_reformat_xml() {
  printf '%s\n' "$1"
  # Write to a sibling file first and replace the original only when xmllint
  # succeeded, so a malformed document is never clobbered.
  if xmllint --format "$1" -o "$1.new"; then
    mv -- "$1.new" "$1"
  else
    rm -f -- "$1.new"
    return 1
  fi
}
# Moves every *.xml and *.rels file below the current directory into the flat
# directory $FLAT. The relative path is encoded in the file name by replacing
# each '/' with 'Ø'; *.rels files additionally get an ".xml" suffix.
#
# Globals:  FLAT (read)  - existing directory receiving the files
#           xmls (write) - space separated flat names of the *.xml files
#           rels (write) - space separated flat names of the renamed *.rels
# Returns:  0 on success, 1 as soon as a file cannot be moved
# Note:     the lists are plain space separated strings, so part names that
#           contain whitespace are not supported.
flaten() {
  xmls=""
  rels=""
  # Read find's output line by line from a here-document: unlike
  # "for f in $(find ...)" this neither word-splits nor glob-expands names
  # such as "[Content_Types].xml", and unlike a pipeline the loop runs in the
  # current shell, so xmls/rels keep their values.
  while IFS= read -r f; do
    [ -n "$f" ] || continue
    # sed inserts the complete (multi-byte) 'Ø'; a byte-oriented tr would emit
    # only its first byte and create invalid file names.
    ff=$(printf '%s\n' "${f#./}" | sed 's|/|Ø|g')
    case $f in
      *.rels)
        # for rels, rename into .xml
        ff=$ff.xml
        rels="$rels $ff"
        ;;
      *)
        xmls="$xmls $ff"
        ;;
    esac
    mv "$f" "$FLAT/$ff" || return 1
  done <<EOF
$(find . -type f \( -name '*.xml' -o -name '*.rels' \))
EOF
}
# Moves the transformed files from the flat directory $FLATOUT back into the
# directory tree rooted at the current directory, undoing the name encoding
# used when flattening: every 'Ø' becomes '/' again and the ".xml" suffix
# added to *.rels files is dropped. The target sub directories must exist.
#
# Globals:  FLATOUT (read) - directory holding the transformed files
#           rels, xmls (read) - space separated lists of flat file names
# Returns:  0 on success, 1 if $FLATOUT cannot be entered, a file cannot be
#           moved, or the original directory cannot be re-entered
expand_dirs() {
  target_dir=$(pwd)
  cd "$FLATOUT" || return 1
  expand_status=0
  # The lists are split on whitespace on purpose, but their entries must not
  # be glob-expanded ("[Content_Types].xml" is a valid pattern), so globbing
  # is switched off for the loops and restored afterwards.
  case $- in
    *f*) expand_noglob=keep ;;
    *) expand_noglob=restore; set -f ;;
  esac
  for f in $rels; do
    # tr -s translates and then squeezes the result: a byte-oriented tr turns
    # the two bytes of 'Ø' into "//", which is collapsed to a single '/'.
    ff=$(printf '%s\n' "${f%.xml}" | tr -s 'Ø' '/')
    mv -- "$f" "$target_dir/$ff" || expand_status=1
  done
  for f in $xmls; do
    ff=$(printf '%s\n' "$f" | tr -s 'Ø' '/')
    mv -- "$f" "$target_dir/$ff" || expand_status=1
  done
  if [ "$expand_noglob" = restore ]; then
    set +f
  fi
  cd "$target_dir" || return 1
  return "$expand_status"
}
# Main flow: unzip the document into ./$corename, flatten its XML parts into
# $FLAT, run the selected stylesheet over that directory into $FLATOUT and
# move the results back into the exploded tree.

# Resolve the archive to an absolute path first: the script changes directory
# before unzipping, so a relative "../$name" breaks for absolute arguments.
case $name in
  /* | [A-Za-z]:[/\\]*) archive=$name ;;
  *) archive=$(pwd)/$name ;;
esac
# Check before touching an existing exploded directory or its backup.
if [ ! -f "$archive" ]; then
  echo "can not find word document: $name" >&2
  exit 1
fi

if [ -e "$corename" ]; then
  # Keep exactly one backup generation of a previous run.
  rm -rf "${corename:?}.bak"
  mv "$corename" "$corename.bak" || exit 1
fi
mkdir "$corename" || exit 1
# Never carry on in the wrong directory: flaten moves every *.xml and *.rels
# file below the current directory away.
cd "$corename" || exit 1
unzip -q "$archive" || exit 1

# Start from empty scratch directories.
rm -rf "${FLAT:?}" "${FLATOUT:?}"
mkdir "$FLAT" "$FLATOUT" || exit 1
if ! flaten; then
  echo "flattening failed; files moved so far are kept in $FLAT" >&2
  exit 1
fi

# Transform is given Windows style paths (cygpath -m); with a directory as
# source it is expected to write one output file per input file into the
# output directory.
dosflat=$(cygpath -m "$FLAT") || exit 1
dosflatout=$(cygpath -m "$FLATOUT") || exit 1
if ! Transform "-xsl:$transform" "-s:$dosflat" "-o:$dosflatout"; then
  # The untransformed parts exist only in $FLAT now, so do not delete them.
  echo "Transform failed; untransformed parts are kept in $FLAT" >&2
  exit 1
fi
if ! expand_dirs; then
  echo "could not move transformed files back; see $FLATOUT" >&2
  exit 1
fi
rm -rf "$FLAT" "$FLATOUT"
<?xml version="1.0" encoding="UTF-8"?>
<!-- Remove unwanted attributes or/and nodes -->
<!--
  Copies the input document except for the nodes matched by the empty
  templates further down. Presumably this is the file the explode script
  refers to as remove-rsid.xslt, which it selects when -r is given.
  The "w" prefix is bound to the WordprocessingML main namespace; the "fo"
  prefix is declared but not referenced in this stylesheet.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:fo="http://www.w3.org/1999/XSL/Format"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<!-- Serialize the result as UTF-8 XML with indentation switched on. -->
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<!-- Copy everything: generic rule for all attributes, elements, text,
     comments and processing instructions. The more specific name patterns
     below take precedence over it for the nodes they match. -->
<xsl:template match="@*|node()|text()|comment()|processing-instruction()">
<xsl:copy>
<xsl:apply-templates select="@*|node()|text()|comment()|processing-instruction()"/>
</xsl:copy>
</xsl:template>
<!-- Empty templates: a matched node produces no output, so the element
     (together with its content) or the attribute is dropped. -->
<!-- w:rsid elements -->
<xsl:template match="w:rsid"/>
<!-- w:rsid* attributes, on whichever element they occur -->
<xsl:template match="@w:rsidDel"/>
<xsl:template match="@w:rsidP"/>
<xsl:template match="@w:rsidR"/>
<xsl:template match="@w:rsidRDefault"/>
<xsl:template match="@w:rsidRPr"/>
<xsl:template match="@w:rsidSect"/>
<xsl:template match="@w:rsidTr"/>
<!-- w:proofErr and w:lang elements (presumably proofing markers and language
     settings; confirm against the WordprocessingML schema) -->
<xsl:template match="w:proofErr"/>
<xsl:template match="w:lang"/>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment