Skip to content

Instantly share code, notes, and snippets.

@danielnaber
Last active November 29, 2017 13:17
Show Gist options
  • Save danielnaber/fd20cff8d86a4608c22143dcf1fe62b0 to your computer and use it in GitHub Desktop.
Save danielnaber/fd20cff8d86a4608c22143dcf1fe62b0 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# Creates a morfologik spelling dictionary from a Hunspell dictionary.
#
# Usage:   <script> <langCode> <countryCode>      (for example: de AT)
# Must be called from the LanguageTool top-level directory, because the
# paths used further down are relative to it.
echo "Create morfologik spelling dictionary, based on Hunspell dictionary"
echo "This script assumes you have the full LanguageTool build environment"
echo "Please call this script from the LanguageTool top-level directory"
echo ""
if [[ $# -ne 2 ]]; then
  # ${0##*/} strips the directory part like basename does, but without
  # word-splitting a script path that contains spaces.
  SCRIPT=${0##*/}
  # Usage errors are diagnostics, so they go to stderr.
  echo "Usage: $SCRIPT <langCode> <countryCode>" >&2
  echo " For example: $SCRIPT de AT" >&2
  exit 1
fi
# ---------------------------------------------------------------------------
# Configuration.
# Settings written as ${NAME:-default} keep their historical default but can
# be overridden from the environment without editing this file, e.g.
#   REPO="$HOME/.m2/repository" OUTPUT_FILE=/tmp/de_AT.dict <script> de AT
# ---------------------------------------------------------------------------

# Local Maven repository that contains the third-party jars listed below.
REPO=${REPO:-/home/dnaber/.m2/repository}
# LanguageTool version; part of the jar/directory names under */target/.
LT_VERSION=${LT_VERSION:-4.0-SNAPSHOT}
# Additional word list that is merged into the dictionary,
# see https://sourceforge.net/projects/germandict/files/ (conversion to UTF-8 needed):
ADDITIONAL_DICT_FILE=${ADDITIONAL_DICT_FILE:-/tmp/empty}
# Words that are removed from the additional word list again,
# see https://sourceforge.net/p/germandict/code/HEAD/tree/hunspell_words.txt?format=raw (conversion to UTF-8 needed):
UNKNOWN_TO_HUNSPELL=${UNKNOWN_TO_HUNSPELL:-/tmp/empty}
# get frequency data from https://github.com/mozilla-b2g/gaia/tree/master/apps/keyboard/js/imes/latin/dictionaries -
# this is optional, remove the -freq option from the java call further down for not using frequencies.
# NOTE(review): the default points to French data regardless of <langCode> - override it for other languages.
FREQ_FILE=${FREQ_FILE:-/home/dnaber/lt/occurrence_counts/fr_wordlist.xml}
# Source and target encoding handed to recode.
INPUT_ENCODING=utf8
OUTPUT_ENCODING=utf8
# Intermediate word lists.
TEMP_FILE=/tmp/lt-dictionary.dump
FINAL_FILE=/tmp/lt-dictionary.new
# Resulting binary dictionary.
# NOTE(review): the default name is fixed to fr_FR and not derived from the arguments - override it for other languages.
OUTPUT_FILE=${OUTPUT_FILE:-/tmp/fr_FR.dict}

# Java classpath, one jar per line; joined with ':' below.
classpath_jars=(
  "$REPO/com/carrotsearch/hppc/0.7.1/hppc-0.7.1.jar"
  "$REPO/com/beust/jcommander/1.48/jcommander-1.48.jar"
  "$REPO/org/carrot2/morfologik-fsa-builders/2.1.2/morfologik-fsa-builders-2.1.2.jar"
  "$REPO/org/carrot2/morfologik-stemming/2.1.2/morfologik-stemming-2.1.2.jar"
  "$REPO/org/carrot2/morfologik-fsa/2.1.2/morfologik-fsa-2.1.2.jar"
  "$REPO/org/carrot2/morfologik-tools/2.1.2/morfologik-tools-2.1.2.jar"
  "$REPO/commons-cli/commons-cli/1.2/commons-cli-1.2.jar"
  "languagetool-tools/target/languagetool-tools-${LT_VERSION}.jar"
)
# IFS is changed inside the command substitution only, so it does not leak
# into the rest of the script.
CPATH=$(IFS=:; printf '%s' "${classpath_jars[*]}")

# Command line arguments (their presence is validated at the top of the script).
LANG_CODE=$1
COUNTRY_CODE=$2
# e.g. de_AT - base name of the Hunspell files
PREFIX=${LANG_CODE}_${COUNTRY_CODE}
# e.g. de-AT
TOKENIZER_LANG=${LANG_CODE}-${COUNTRY_CODE}
# Directory inside the language module that holds the Hunspell .dic/.aff/.info files.
CONTENT_DIR=languagetool-language-modules/${LANG_CODE}/src/main/resources/org/languagetool/resource/${LANG_CODE}/hunspell
INFO_FILE=${CONTENT_DIR}/${PREFIX}.info
# Dictionary path without extension, in the form "hunspell -d" is given below.
DIC_NO_SUFFIX=${CONTENT_DIR}/${PREFIX}
DIC_FILE=${DIC_NO_SUFFIX}.dic
# Both word lists are read further down; stop early if one of them is missing.
# The paths are quoted so that an empty or space-containing value is reported
# as missing instead of silently passing the test.
for required_file in "$ADDITIONAL_DICT_FILE" "$UNKNOWN_TO_HUNSPELL"; do
  if [[ ! -f "$required_file" ]]; then
    echo "File not found: $required_file" >&2
    # Non-zero status, so callers can detect the failure (a bare "exit"
    # would return the status of the preceding echo, i.e. 0).
    exit 1
  fi
done
echo "Using $CONTENT_DIR/$PREFIX.dic and affix $CONTENT_DIR/$PREFIX.aff..."

# Fail before the (slow) Maven build if one of the external tools is missing.
for required_tool in mvn unmunch recode hunspell; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Required tool not found in PATH: $required_tool" >&2
    exit 1
  fi
done

# Abort when the build fails: continuing would process a missing word list
# or one left over from an earlier run.
if ! mvn clean package -DskipTests; then
  echo "Maven build failed, not creating the dictionary" >&2
  exit 1
fi

# unmunch doesn't properly work for languages with compounds, thus we filter
# the result using hunspell:
unmunch "$DIC_FILE" "$CONTENT_DIR/$PREFIX.aff" \
  | recode "${INPUT_ENCODING}..${OUTPUT_ENCODING}" \
  | grep -v "^#" \
  | hunspell -d "$DIC_NO_SUFFIX" -G -l >"$TEMP_FILE"
echo "Input sizes:"
wc -l "$TEMP_FILE"
wc -l "$ADDITIONAL_DICT_FILE"

# Intermediate files live in a private directory instead of fixed names in
# /tmp, so parallel runs cannot clobber each other.
merge_dir=$(mktemp -d) || {
  echo "Could not create temporary directory" >&2
  exit 1
}
# Clean up the intermediates and the hunspell dump on every exit path.
trap 'rm -f -- "$TEMP_FILE"; rm -rf -- "$merge_dir"' EXIT

sort "$ADDITIONAL_DICT_FILE" >"$merge_dir/additional_dict_file_sorted"
sort "$UNKNOWN_TO_HUNSPELL" >"$merge_dir/unknown_to_hunspell_sorted"
# remove the words that hunspell wouldn't accept (see https://github.com/languagetool-org/languagetool/issues/725#issuecomment-312961626):
# comm -23 keeps only the lines that are unique to the first (sorted) file.
comm -23 "$merge_dir/additional_dict_file_sorted" "$merge_dir/unknown_to_hunspell_sorted" \
  >"$merge_dir/additional_without_hunspell_unknown"
# Merge both word lists and drop duplicates.
sort "$merge_dir/additional_without_hunspell_unknown" "$TEMP_FILE" | uniq >"$FINAL_FILE"
echo "Final size:"
wc -l "$FINAL_FILE"

# Directory of the unpacked standalone distribution produced by the Maven build.
lt_dist_dir=languagetool-standalone/target/LanguageTool-$LT_VERSION/LanguageTool-$LT_VERSION
java -cp "$CPATH:$lt_dist_dir/languagetool.jar:$lt_dist_dir/libs/languagetool-tools.jar" \
  org.languagetool.tools.SpellDictionaryBuilder \
  -i "$FINAL_FILE" -info "$INFO_FILE" -o "$OUTPUT_FILE" -freq "$FREQ_FILE"
# Report the result of the dictionary build, not that of the cleanup.
build_status=$?
exit "$build_status"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment