Skip to content

Instantly share code, notes, and snippets.

@amake
Last active November 10, 2015 05:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amake/6f1490e266efdfcf05eb to your computer and use it in GitHub Desktop.
Save amake/6f1490e266efdfcf05eb to your computer and use it in GitHub Desktop.
Convert CLDR data to TMX
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Okapi Rainbow pipeline for converting pairs of CLDR LDML data files
  into TMX (for translation reference, etc.).

  CLDR source data:
    http://www.unicode.org/repos/cldr/trunk/common/main/

  Usage:
    1. Load this pipeline into Okapi Rainbow and set the input files, e.g.:
         Input List 1: en.xml
         Input List 2: ja.xml
    2. Use the okf_xml@cldr.fprm filter config included in this gist as
       the configuration for both files.
    3. Be sure to set languages and encodings appropriately, e.g.:
         Source: en, UTF-8
         Target: ja, UTF-8
    4. Set the TMX output location in the Id-Based Aligner step
       (the tmxOutputPath parameter below; it is aligned.tmx as shipped).

  The pipeline has two steps: RawDocumentToFilterEventsStep followed by
  IdBasedAlignerStep. The aligner's parameters are stored as plain text
  inside its <step> element, one key=value per line, so do not re-indent
  or otherwise reformat that element's content.
-->
<rainbowPipeline version="1"><step class="net.sf.okapi.steps.common.RawDocumentToFilterEventsStep"></step>
<step class="net.sf.okapi.steps.idaligner.IdBasedAlignerStep">#v1
tmxOutputPath=aligned.tmx
generateTMX.b=true
replaceWithSource.b=false
copyToTarget.b=true
storeAsAltTranslation.b=false
suppressTusWithNoTarget.b=true</step>
</rainbowPipeline>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  ITS rules for extracting translatable names from CLDR LDML files
  (the okf_xml@cldr filter configuration).

  Each translateRule marks one kind of LDML element as translatable and
  gives it an ID through itsx:idValue, an XPath expression evaluated
  relative to the selected element. The ID has to be identical for the
  corresponding entry in every locale file, because entries from the two
  input files are paired up by ID. For that reason every ID is built only
  from structural @type attributes, never from the text itself, and must
  be unique within one file.

  Entries carrying @alt (alternate forms) are skipped everywhere; where
  the same element repeats per plural form or year type, the @count /
  @yeartype variants are skipped too, so that one entry per ID remains.
-->
<its:rules xmlns:its="http://www.w3.org/2005/11/its"
           xmlns:itsx="http://www.w3.org/2008/12/its-extensions"
           xmlns:okp="okapi-framework:xmlfilter-options"
           xmlns:xlink="http://www.w3.org/1999/xlink"
           version="1.0">

  <!-- Locale display names. -->
  <its:translateRule selector="//languages/language[not(@alt)]" translate="yes"
                     itsx:idValue="concat('language_', @type)"/>
  <its:translateRule selector="//script[not(@alt)]" translate="yes"
                     itsx:idValue="concat('script_', @type)"/>
  <its:translateRule selector="//territory[not(@alt)]" translate="yes"
                     itsx:idValue="concat('territory_', @type)"/>
  <its:translateRule selector="//variant[not(@alt)]" translate="yes"
                     itsx:idValue="concat('variant_', @type)"/>
  <its:translateRule selector="//key[not(@alt)]" translate="yes"
                     itsx:idValue="concat('key_', @type)"/>
  <its:translateRule selector="//type[not(@alt)]" translate="yes"
                     itsx:idValue="concat('type_', @key, '_', @type)"/>
  <!-- Uses its own 'transformName_' prefix so that these IDs do not share
       the 'type_' ID space of the rule above. -->
  <its:translateRule selector="//transformName[not(@alt)]" translate="yes"
                     itsx:idValue="concat('transformName_', @type)"/>
  <its:translateRule selector="//measurementSystemName[not(@alt)]" translate="yes"
                     itsx:idValue="concat('measurementSystemName_', @type)"/>

  <!-- Calendar data. Ancestor chain used by the IDs below:
         calendar/cyclicNameSets/cyclicNameSet/cyclicNameContext/cyclicNameWidth/cyclicName
         calendar/months/monthContext/monthWidth/month
         calendar/days/dayContext/dayWidth/day
         calendar/quarters/quarterContext/quarterWidth/quarter
         calendar/dayPeriods/dayPeriodContext/dayPeriodWidth/dayPeriod
         calendar/eras/eraNames/era -->

  <!-- All contexts and widths are selected, so both are part of the ID
       (calendar, set, context, width, name); without them the same ID
       would be produced once per context/width combination. -->
  <its:translateRule selector="//cyclicName[not(@alt)]" translate="yes"
                     itsx:idValue="concat('cyclicName_', ../../../../../@type, '_', ../../../@type, '_', ../../@type, '_', ../@type, '_', @type)"/>
  <!-- Wide, format-context month names only. monthWidth is a child of
       monthContext in LDML, so the context must come first in the path;
       the ID is month_<width>_<calendar>_<month>. -->
  <its:translateRule selector="//monthContext[@type='format']/monthWidth[@type='wide']/month[not(@alt) and not(@yeartype)]" translate="yes"
                     itsx:idValue="concat('month_', ../@type, '_', ../../../../@type, '_', @type)"/>
  <!-- Wide day and quarter names in every context; ../../@type is the
       context (format / stand-alone), which keeps the two apart. -->
  <its:translateRule selector="//dayWidth[@type='wide']/day[not(@alt)]" translate="yes"
                     itsx:idValue="concat('day_', ../../@type, '_', @type)"/>
  <its:translateRule selector="//quarterWidth[@type='wide']/quarter[not(@alt)]" translate="yes"
                     itsx:idValue="concat('quarter_', ../../@type, '_', @type)"/>
  <its:translateRule selector="//dayPeriodContext[@type='format']/dayPeriodWidth[@type='wide']/dayPeriod[not(@alt)]" translate="yes"
                     itsx:idValue="concat('dayPeriod_', ../../../../@type, '_', @type)"/>
  <its:translateRule selector="//eraNames/era[not(@alt)]" translate="yes"
                     itsx:idValue="concat('era_', ../../../@type, '_', @type)"/>

  <!-- Relative date field names; ../@type is the type of the parent element. -->
  <its:translateRule selector="//relative[not(@alt)]" translate="yes"
                     itsx:idValue="concat(../@type, '_relative_', @type)"/>

  <!-- Time zone names; the ID is keyed on the @type of the element that
       encloses the exemplarCity / long element. -->
  <its:translateRule selector="//exemplarCity[not(@alt)]" translate="yes"
                     itsx:idValue="concat('zone_', ../@type, '_exemplarCity')"/>
  <its:translateRule selector="//long/generic[not(@alt)]" translate="yes"
                     itsx:idValue="concat('zone_', ../../@type, '_generic')"/>
  <its:translateRule selector="//long/daylight[not(@alt)]" translate="yes"
                     itsx:idValue="concat('zone_', ../../@type, '_daylight')"/>
  <its:translateRule selector="//long/standard[not(@alt)]" translate="yes"
                     itsx:idValue="concat('zone_', ../../@type, '_standard')"/>
  <its:translateRule selector="//long/special[not(@alt)]" translate="yes"
                     itsx:idValue="concat('zone_', ../../@type, '_special')"/>

  <!-- Currency and unit display names (the plain form, not per plural count). -->
  <its:translateRule selector="//currency/displayName[not(@alt) and not(@count)]" translate="yes"
                     itsx:idValue="concat('currency_', ../@type)"/>
  <its:translateRule selector="//unit/displayName[not(@alt) and not(@count)]" translate="yes"
                     itsx:idValue="concat('unit_', ../../@type, '_', ../@type)"/>

  <!-- Everything else is not translatable. NOTE(review): this relies on
       the relative selector "*" matching only the document element, whose
       translate="no" is then inherited by all elements not selected
       above. Do not widen it to "//*": as the last rule it would then
       take precedence over every rule above. -->
  <its:translateRule selector="*" translate="no"/>
</its:rules>
#!/bin/sh
# Sample script to generate a CLDR TMX file for one language pair.
#
# Usage: cldr-gen.sh SRC_LANG TRG_LANG
#
# Reads ${CLDR_HOME}/SRC_LANG.xml and ${CLDR_HOME}/TRG_LANG.xml, writes a
# Rainbow pipeline file (cldr-gen-pipeline.pln) into the current directory
# and runs Rainbow with it. The TMX is written to
# CLDR_<SRC_LANG>_<TRG_LANG>.tmx in the current directory.
#
# Required environment (preparing it is an exercise left to the reader):
#   CLDR_HOME           directory containing the CLDR <lang>.xml files
#   OKAPI_HOME          Okapi installation; lib/rainbow.jar is run from it
#   OKAPI_FILTERS_HOME  directory handed to Rainbow via -pd (expected to
#                       hold the okf_xml@cldr filter configuration)

if [ "$#" -ne 2 ]; then
    echo "Usage: cldr-gen.sh SRC_LANG TRG_LANG" >&2
    exit 1
fi

# Fail early with a clear message instead of running Rainbow against
# paths such as "/lib/rainbow.jar" or "/en.xml".
: "${CLDR_HOME:?CLDR_HOME must be set}"
: "${OKAPI_HOME:?OKAPI_HOME must be set}"
: "${OKAPI_FILTERS_HOME:?OKAPI_FILTERS_HOME must be set}"

SRC_LANG=$1
TRG_LANG=$2

echo "Extracting ${SRC_LANG}-to-${TRG_LANG} TMX"

FILTERS_DIR="${OKAPI_FILTERS_HOME}"
FILTER_CONFIG="okf_xml@cldr"
OUT_FILE="CLDR_${SRC_LANG}_${TRG_LANG}.tmx"
FILE1="${CLDR_HOME}/${SRC_LANG}.xml"
FILE2="${CLDR_HOME}/${TRG_LANG}.xml"

echo "File 1: ${FILE1}"
echo "File 2: ${FILE2}"
echo "Out: ${OUT_FILE}"

PIPELINE="$(pwd)/cldr-gen-pipeline.pln"

# The here-document is deliberately unquoted so that the output path is
# expanded into the pipeline. The <step> content is parameter text, so it
# must stay unindented.
cat <<EOF > "$PIPELINE" || exit 1
<?xml version="1.0" encoding="UTF-8"?>
<rainbowPipeline version="1"><step class="net.sf.okapi.steps.common.RawDocumentToFilterEventsStep"></step>
<step class="net.sf.okapi.steps.idaligner.IdBasedAlignerStep">#v1
tmxOutputPath=$(pwd)/${OUT_FILE}
generateTMX.b=true
replaceWithSource.b=false
copyToTarget.b=true
storeAsAltTranslation.b=false
suppressTusWithNoTarget.b=true</step>
</rainbowPipeline>
EOF

# Rainbow is invoked directly (not through a command string) so that every
# path is passed as a single, properly quoted argument even if it contains
# spaces.
# NOTE(review): -XstartOnFirstThread looks macOS-specific; confirm before
# running this on another platform.
java -XstartOnFirstThread -jar "${OKAPI_HOME}/lib/rainbow.jar" \
    -sl "$SRC_LANG" -tl "$TRG_LANG" \
    -se utf-8 -te utf-8 \
    -pln "$PIPELINE" \
    -pd "$FILTERS_DIR" \
    -np \
    "$FILE1" -fc "$FILTER_CONFIG" \
    "$FILE2" -fc "$FILTER_CONFIG"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment