Skip to content

Instantly share code, notes, and snippets.

@bricksdont
Created February 26, 2020 10:31
Show Gist options
  • Save bricksdont/0d1718c7c3fc05714b582afe4c3b5005 to your computer and use it in GitHub Desktop.
Save bricksdont/0d1718c7c3fc05714b582afe4c3b5005 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# author: Mathias Mueller / mathias.mueller@uzh.ch
# purpose: apply word alignment models trained with fast_align
#
# Usage:
#   <script> <language 1> <language 2> <source txt file> <target txt file> \
#            <directory of trained model> <output file path>
#
# Steps:
#   1. copy the (already tokenized) source and target files to a temp dir
#   2. lowercase both sides with the Moses lowercase.perl script
#   3. paste both sides into one file with cdec's paste-files.pl
#   4. run fast_align's force_align.py with the model files found in
#      <directory of trained model>; its stdout goes to <output file path>
#
# Environment:
#   KEEP_TMP  if set to a non-empty value, the temporary working directory
#             is kept (and its path reported on stderr) for debugging.
#
# Exit status: 0 on success, non-zero if arguments are missing or any
# step fails.

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf '[ERROR] %s\n' "$*" >&2
  exit 1
}

# usage information
if (( $# < 6 )); then
  echo "[ERROR] Too few arguments. Expected 6 command line arguments." 1>&2
  echo "Usage: $0 <language 1> <language 2> <source txt file> <target txt file> <directory of trained model> <output file path>" 1>&2
  exit 1
fi

# fixed paths
CDEC=/mnt/storage/clfiles/resources/paths/CL_applications/mt/cdec/vGithub
MOSES=/mnt/storage/clfiles/resources/applications/mt/moses/v3.0/scripts
FAST_ALIGN=/mnt/storage/clfiles/resources/applications/aligner/fast_align/vGit/build

# fixed variables for word alignment
export OMP_NUM_THREADS="4"

# languages involved
lang1=$1
lang2=$2

# path to input files
sourcetok=$3
targettok=$4

# path to trained fast_align model
modeldir=$5

# path where alignments should be written
outputpath=$6

[[ -r "$sourcetok" ]] || die "Cannot read source file: $sourcetok"
[[ -r "$targettok" ]] || die "Cannot read target file: $targettok"
[[ -d "$modeldir" ]] || die "Model directory does not exist: $modeldir"

TMP=$(mktemp -d) || die "Could not create temporary directory."

# Remove the temporary directory on every exit path unless KEEP_TMP is set.
cleanup() {
  if [[ -n "${KEEP_TMP:-}" ]]; then
    printf 'Keeping temporary directory %s\n' "$TMP" >&2
  else
    rm -rf -- "${TMP:?}"
  fi
}
trap cleanup EXIT

# common prefix of all intermediate files
prefix="$TMP/$lang1-$lang2"

# Tokenization is already supplied, so the inputs are copied as they are.
# (To tokenize here instead, the inputs could be piped through
# "$MOSES/tokenizer/tokenizer.perl -q -l <lang>".)
cat -- "$sourcetok" > "$prefix.tok.$lang1"
cat -- "$targettok" > "$prefix.tok.$lang2"

# lowercase both sides in parallel; remember the PIDs so that a failure of
# either job is detected instead of being silently ignored
pids=()
for lang in "$lang1" "$lang2"; do
  "$MOSES/tokenizer/lowercase.perl" < "$prefix.tok.$lang" > "$prefix.tok.lc.$lang" &
  pids+=("$!")
done
for pid in "${pids[@]}"; do
  wait "$pid" || die "Lowercasing failed."
done

# convert to two-column format
perl "$CDEC/corpus/paste-files.pl" "$prefix.tok.lc.$lang1" "$prefix.tok.lc.$lang2" \
  > "$TMP/test.$lang1-$lang2"

# force-align with existing model; the command is kept in an array so that
# paths containing whitespace are passed as single arguments
cmd=(
  "$FAST_ALIGN/force_align.py"
  "$modeldir/corpus.$lang1-$lang2.fwd_params"
  "$modeldir/corpus.$lang1-$lang2.fwd_err"
  "$modeldir/corpus.$lang1-$lang2.rev_params"
  "$modeldir/corpus.$lang1-$lang2.rev_err"
)

printf 'Executing %s < %s > %s\n' "${cmd[*]}" "$TMP/test.$lang1-$lang2" "$outputpath" >&2

"${cmd[@]}" < "$TMP/test.$lang1-$lang2" > "$outputpath"

# A possible follow-up step (not performed here) is to combine sentences and
# alignments into a single *.in file with "$CDEC/corpus/paste-files.pl".
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment