Skip to content

Instantly share code, notes, and snippets.

@bricksdont
Created February 26, 2020 10:30
Show Gist options
  • Save bricksdont/7a9ac764d874b90853eff88d53971033 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# author: Mathias Mueller / mathias.mueller@uzh.ch
# purpose: train word alignment models with fast_align
# Strict mode: abort on unhandled errors, unset variables and failed
# pipeline stages instead of silently continuing with bad data.
set -euo pipefail
# usage information
if [ $# -lt 4 ]
then
  echo "[ERROR] Too few arguments. Expected 4 command line arguments." 1>&2
  echo "Usage: $0 <language 1> <language 2> <path to training set without language suffix> <output directory for trained model>" 1>&2
  exit 1
fi
# fixed paths (cluster install locations; read-only constants)
readonly CDEC=/mnt/storage/clfiles/resources/paths/CL_applications/mt/cdec/vGithub
readonly MOSES=/mnt/storage/clfiles/resources/applications/mt/moses/v3.0/scripts
readonly FAST_ALIGN=/mnt/storage/clfiles/projects/clresources/resources/applications/aligner/fast_align/vGit/build
# private scratch directory; the EXIT trap guarantees it is removed on
# every exit path, not only on successful completion
TMP=$(mktemp -d)
trap 'rm -rf -- "$TMP"' EXIT
# fixed variables for word alignment (fast_align is OpenMP-parallel)
export OMP_NUM_THREADS="16"
# languages involved
lang1=$1
lang2=$2
# path to training set, including file name without lang suffix
traindir=$3
# output directory for the trained model
outputdir=$4
# make directories ("--" stops option parsing in case the path starts with "-")
mkdir -p -- "$outputdir"
echo "@ tokenizer.perl" 1>&2
# tokenization: normalize punctuation, then tokenize; one background job
# per language so both sides run in parallel.
# BUGFIX: the original hard-coded the "de"/"en" suffixes and -l flags even
# though the languages are command line arguments; use $lang1/$lang2 so the
# script works for any language pair (identical behavior for de/en).
# The tokenizer command is kept as an array so its arguments survive
# quoting (no word-splitting surprises).
tokenize_command=("$MOSES/tokenizer/tokenizer.perl" -threads 4)
for lang in "$lang1" "$lang2"; do
  "$MOSES/tokenizer/normalize-punctuation.perl" -l "$lang" < "$traindir.$lang" \
    | "${tokenize_command[@]}" -l "$lang" -no-escape > "$TMP/$lang1-$lang2.tok.$lang" & # 2> $TMP/train.log &
done
# lowercase
echo "@ lowercase.perl" 1>&2
wait
for lang in "$lang1" "$lang2"; do
  "$MOSES/tokenizer/lowercase.perl" < "$TMP/$lang1-$lang2.tok.$lang" > "$TMP/$lang1-$lang2.tok.lc.$lang" & # 2>> $TMP/train.log &
done
# convert to two-column format (source ||| target), one sentence pair per line
echo "@ paste_files.pl" 1>&2
wait  # barrier: both lowercasing background jobs must have finished
perl "$CDEC/corpus/paste-files.pl" "$TMP/$lang1-$lang2.tok.lc.$lang1" "$TMP/$lang1-$lang2.tok.lc.$lang2" > "$TMP/corpus.$lang1-$lang2.tmp" # 2>> $TMP/train.log
# filter max sentence length: 200 tokens
# (no wait needed here: paste-files.pl above runs in the foreground)
echo "@ filter-length.pl" 1>&2
perl "$CDEC/corpus/filter-length.pl" -200 "$TMP/corpus.$lang1-$lang2.tmp" > "$TMP/corpus.$lang1-$lang2" # 2>> $TMP/train.log
# train forward model (lang1 -> lang2)
# Commands are built as arrays, not whitespace-split strings, so each
# argument survives quoting intact (avoids SC2086 word-splitting/globbing).
fwd_cmd=("$FAST_ALIGN/fast_align" -i "$TMP/corpus.$lang1-$lang2" -d -v -o -p "$outputdir/corpus.$lang1-$lang2.fwd_params")
echo "Executing ${fwd_cmd[*]} > $outputdir/corpus.$lang1-$lang2.fwd_align 2> $outputdir/corpus.$lang1-$lang2.fwd_err" 1>&2
# do not run in background as this would double the number of threads used
"${fwd_cmd[@]}" > "$outputdir/corpus.$lang1-$lang2.fwd_align" 2> "$outputdir/corpus.$lang1-$lang2.fwd_err"
# train reverse model (lang2 -> lang1, selected with -r)
rev_cmd=("$FAST_ALIGN/fast_align" -i "$TMP/corpus.$lang1-$lang2" -r -d -v -o -p "$outputdir/corpus.$lang1-$lang2.rev_params")
echo "Executing ${rev_cmd[*]} > $outputdir/corpus.$lang1-$lang2.rev_align 2> $outputdir/corpus.$lang1-$lang2.rev_err" 1>&2
"${rev_cmd[@]}" > "$outputdir/corpus.$lang1-$lang2.rev_align" 2> "$outputdir/corpus.$lang1-$lang2.rev_err"
# cleanup: remove the scratch directory
# ${TMP:?} aborts if TMP is somehow empty/unset, so this can never expand
# to a bare "rm -rf" of an unintended path
rm -rf -- "${TMP:?}"
# no need for the alignments from training, comment out if you need them
# (these are plain files, so -f is the right flag, not -r)
rm -f -- "$outputdir/corpus.$lang1-$lang2.fwd_align"
rm -f -- "$outputdir/corpus.$lang1-$lang2.rev_align"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment