Last active
November 29, 2017 13:17
-
-
Save danielnaber/fd20cff8d86a4608c22143dcf1fe62b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Build a morfologik spelling dictionary from a Hunspell dictionary.
#
# Usage: <script> <langCode> <countryCode>      (for example: de AT)
#
# Must be called from the LanguageTool top-level directory, with the full
# LanguageTool build environment available. External tools invoked:
# mvn, unmunch, recode, grep, hunspell, sort, comm, wc and java.
#
# The paths in the configuration section below are hard-coded and
# machine-specific; edit them before running the script elsewhere.

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails, so that a broken build or a failed unmunch
# run can never be turned into a (wrong) dictionary.
set -euo pipefail

echo "Create morfologik spelling dictionary, based on Hunspell dictionary"
echo "This script assumes you have the full LanguageTool build environment"
echo "Please call this script from the LanguageTool top-level directory"
echo ""

if [[ $# -ne 2 ]]; then
  SCRIPT=$(basename -- "$0")
  echo "Usage: $SCRIPT <langCode> <countryCode>" >&2
  echo " For example: $SCRIPT de AT" >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
REPO=/home/dnaber/.m2/repository
LT_VERSION=4.0-SNAPSHOT
# see https://sourceforge.net/projects/germandict/files/ (conversion to UTF-8 needed):
ADDITIONAL_DICT_FILE=/tmp/empty
# see https://sourceforge.net/p/germandict/code/HEAD/tree/hunspell_words.txt?format=raw (conversion to UTF-8 needed):
UNKNOWN_TO_HUNSPELL=/tmp/empty
# get frequency data from https://github.com/mozilla-b2g/gaia/tree/master/apps/keyboard/js/imes/latin/dictionaries -
# this is optional, remove "-freq $FREQ_FILE" below for not using frequencies:
# NOTE: the file name contains "fr" regardless of the <langCode> argument;
# adjust it when building a dictionary for another language.
FREQ_FILE=/home/dnaber/lt/occurrence_counts/fr_wordlist.xml
INPUT_ENCODING=utf8
OUTPUT_ENCODING=utf8
TEMP_FILE=/tmp/lt-dictionary.dump
FINAL_FILE=/tmp/lt-dictionary.new
# NOTE: the output name is fixed to fr_FR and does not follow the
# <langCode>/<countryCode> arguments; adjust it for other languages.
OUTPUT_FILE=/tmp/fr_FR.dict

# Class path for the dictionary builder, assembled one jar per line.
CPATH="$REPO/com/carrotsearch/hppc/0.7.1/hppc-0.7.1.jar"
CPATH+=":$REPO/com/beust/jcommander/1.48/jcommander-1.48.jar"
CPATH+=":$REPO/org/carrot2/morfologik-fsa-builders/2.1.2/morfologik-fsa-builders-2.1.2.jar"
CPATH+=":$REPO/org/carrot2/morfologik-stemming/2.1.2/morfologik-stemming-2.1.2.jar"
CPATH+=":$REPO/org/carrot2/morfologik-fsa/2.1.2/morfologik-fsa-2.1.2.jar"
CPATH+=":$REPO/org/carrot2/morfologik-tools/2.1.2/morfologik-tools-2.1.2.jar"
CPATH+=":$REPO/commons-cli/commons-cli/1.2/commons-cli-1.2.jar"
CPATH+=":languagetool-tools/target/languagetool-tools-${LT_VERSION}.jar"

# ---------------------------------------------------------------------------
# Paths derived from the command-line arguments
# ---------------------------------------------------------------------------
LANG_CODE=$1
COUNTRY_CODE=$2
PREFIX=${LANG_CODE}_${COUNTRY_CODE}
# Not referenced anywhere else in this script; kept from the original.
# shellcheck disable=SC2034
TOKENIZER_LANG=${LANG_CODE}-${COUNTRY_CODE}
CONTENT_DIR=languagetool-language-modules/${LANG_CODE}/src/main/resources/org/languagetool/resource/$LANG_CODE/hunspell
INFO_FILE=${CONTENT_DIR}/${PREFIX}.info
DIC_NO_SUFFIX=$CONTENT_DIR/$PREFIX
DIC_FILE=$DIC_NO_SUFFIX.dic
AFF_FILE=$DIC_NO_SUFFIX.aff

# Fail fast (with a non-zero status) when an input file is missing, instead
# of finding out only after the long Maven build.
for required_file in "$ADDITIONAL_DICT_FILE" "$UNKNOWN_TO_HUNSPELL" "$DIC_FILE" "$AFF_FILE"; do
  if [[ ! -f "$required_file" ]]; then
    echo "File not found: $required_file" >&2
    exit 1
  fi
done

echo "Using $CONTENT_DIR/$PREFIX.dic and affix $CONTENT_DIR/$PREFIX.aff..."

# Build LanguageTool; with 'set -e' a failed build stops the script here.
mvn clean package -DskipTests

# Remove the intermediate dump on every exit path from here on.
trap 'rm -f -- "$TEMP_FILE"' EXIT

# unmunch doesn't properly work for languages with compounds, thus we filter
# the result using hunspell.
# With 'pipefail', a failure of any stage aborts the script; this includes
# grep finding no non-comment line at all (i.e. an empty word list).
unmunch "$DIC_FILE" "$AFF_FILE" \
  | recode "$INPUT_ENCODING..$OUTPUT_ENCODING" \
  | grep -v "^#" \
  | hunspell -d "$DIC_NO_SUFFIX" -G -l >"$TEMP_FILE"

echo "Input sizes:"
wc -l "$TEMP_FILE"
wc -l "$ADDITIONAL_DICT_FILE"

# comm requires sorted input.
sort "$ADDITIONAL_DICT_FILE" >/tmp/additional_dict_file_sorted
sort "$UNKNOWN_TO_HUNSPELL" >/tmp/unknown_to_hunspell_sorted
# remove the words that hunspell wouldn't accept (see https://github.com/languagetool-org/languagetool/issues/725#issuecomment-312961626):
comm -23 /tmp/additional_dict_file_sorted /tmp/unknown_to_hunspell_sorted >/tmp/additional_without_hunspell_unknown

# Merge the additional words with the hunspell-filtered list, without duplicates.
sort /tmp/additional_without_hunspell_unknown "$TEMP_FILE" | uniq >"$FINAL_FILE"

echo "Final size:"
wc -l "$FINAL_FILE"

LT_DIST_DIR=languagetool-standalone/target/LanguageTool-$LT_VERSION/LanguageTool-$LT_VERSION
java -cp "$CPATH:$LT_DIST_DIR/languagetool.jar:$LT_DIST_DIR/libs/languagetool-tools.jar" \
  org.languagetool.tools.SpellDictionaryBuilder \
  -i "$FINAL_FILE" -info "$INFO_FILE" -o "$OUTPUT_FILE" -freq "$FREQ_FILE"
# $TEMP_FILE is removed by the EXIT trap installed above.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment