Created
February 26, 2020 10:30
-
-
Save bricksdont/7a9ac764d874b90853eff88d53971033 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# author: Mathias Mueller / mathias.mueller@uzh.ch
# purpose: train word alignment models with fast_align
#
# Pipeline: tokenize -> lowercase -> paste into two-column parallel format ->
# filter overlong sentences -> train forward and reverse fast_align models.
# Outputs <outputdir>/corpus.<l1>-<l2>.{fwd,rev}_params (+ stderr logs).

# abort on any failed command, unset variable, or failed pipeline stage
set -euo pipefail

# usage information
if [[ $# -lt 4 ]]; then
    echo "[ERROR] Too few arguments. Expected 4 command line arguments." 1>&2
    echo "Usage: $0 <language 1> <language 2> <path to training set without language suffix> <output directory for trained model>" 1>&2
    exit 1
fi

# fixed paths (site-specific installations; adjust for your cluster)
CDEC=/mnt/storage/clfiles/resources/paths/CL_applications/mt/cdec/vGithub
MOSES=/mnt/storage/clfiles/resources/applications/mt/moses/v3.0/scripts
FAST_ALIGN=/mnt/storage/clfiles/projects/clresources/resources/applications/aligner/fast_align/vGit/build
TMP=$(mktemp -d)

# remove scratch space on every exit path (success, error, signal),
# not only on the happy path as before
cleanup() { rm -rf -- "$TMP"; }
trap cleanup EXIT

# fixed variables for word alignment (fast_align parallelism)
export OMP_NUM_THREADS="16"

# languages involved
lang1=$1
lang2=$2
# path to training set, including file name without lang suffix
traindir=$3
# output directory for the trained model
outputdir=$4

# make directories
mkdir -p -- "$outputdir"

echo "@ tokenizer.perl" 1>&2
# tokenization: normalize punctuation, then tokenize; one background job per side.
# FIX: the original hard-coded .de/.en suffixes here, so the downstream
# lowercase loop (which uses $lang1/$lang2) only worked for the de-en pair.
for lang in "$lang1" "$lang2"; do
    "$MOSES/tokenizer/normalize-punctuation.perl" -l "$lang" < "$traindir.$lang" \
        | "$MOSES/tokenizer/tokenizer.perl" -threads 4 -l "$lang" -no-escape \
        > "$TMP/$lang1-$lang2.tok.$lang" &
done
wait

# lowercase
echo "@ lowercase.perl" 1>&2
for lang in "$lang1" "$lang2"; do
    "$MOSES/tokenizer/lowercase.perl" < "$TMP/$lang1-$lang2.tok.$lang" \
        > "$TMP/$lang1-$lang2.tok.lc.$lang" &
done
wait

# convert to two-column format (source ||| target per line)
echo "@ paste_files.pl" 1>&2
perl "$CDEC/corpus/paste-files.pl" \
    "$TMP/$lang1-$lang2.tok.lc.$lang1" \
    "$TMP/$lang1-$lang2.tok.lc.$lang2" \
    > "$TMP/corpus.$lang1-$lang2.tmp"

# filter max sentence length: 200 tokens
echo "@ filter-length.pl" 1>&2
perl "$CDEC/corpus/filter-length.pl" -200 "$TMP/corpus.$lang1-$lang2.tmp" \
    > "$TMP/corpus.$lang1-$lang2"

# train forward model.
# Commands are built as arrays so quoting survives (SC2086); do not run in the
# background, as that would double the number of threads used.
FWD_CMD=("$FAST_ALIGN/fast_align" -i "$TMP/corpus.$lang1-$lang2" -d -v -o \
         -p "$outputdir/corpus.$lang1-$lang2.fwd_params")
echo "Executing ${FWD_CMD[*]} > $outputdir/corpus.$lang1-$lang2.fwd_align 2> $outputdir/corpus.$lang1-$lang2.fwd_err" 1>&2
"${FWD_CMD[@]}" \
    > "$outputdir/corpus.$lang1-$lang2.fwd_align" \
    2> "$outputdir/corpus.$lang1-$lang2.fwd_err"

# train reverse model (-r: reverse estimation direction)
REV_CMD=("$FAST_ALIGN/fast_align" -i "$TMP/corpus.$lang1-$lang2" -r -d -v -o \
         -p "$outputdir/corpus.$lang1-$lang2.rev_params")
echo "Executing ${REV_CMD[*]} > $outputdir/corpus.$lang1-$lang2.rev_align 2> $outputdir/corpus.$lang1-$lang2.rev_err" 1>&2
"${REV_CMD[@]}" \
    > "$outputdir/corpus.$lang1-$lang2.rev_align" \
    2> "$outputdir/corpus.$lang1-$lang2.rev_err"

# no need for the alignments from training, comment out if you need them
# (rm -f, not rm -r: these are regular files)
rm -f -- "$outputdir/corpus.$lang1-$lang2.fwd_align"
rm -f -- "$outputdir/corpus.$lang1-$lang2.rev_align"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment