Created
February 26, 2020 10:31
-
-
Save bricksdont/0d1718c7c3fc05714b582afe4c3b5005 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# author: Mathias Mueller / mathias.mueller@uzh.ch
# purpose: apply word alignment models trained with fast_align
#
# Usage:
#   <script> <language 1> <language 2> <source txt file> <target txt file> \
#            <directory of trained model> <output file path>
#
# Steps:
#   1. copy both (already tokenized) input files into a temporary directory
#   2. lowercase both files in parallel with the Moses lowercase.perl script
#   3. convert them to two-column format with cdec's paste-files.pl
#   4. force-align with force_align.py, reading the fwd/rev params and err
#      files from the model directory, and write the result to the output path
#
# Environment:
#   KEEP_TMP  if set to a non-empty value, the temporary working directory is
#             kept for debugging; otherwise it is removed when the script exits.
#
# Exit status: 0 on success, 1 on wrong usage or if any step fails.

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  echo "[ERROR] $*" 1>&2
  exit 1
}

# usage information
if (( $# < 6 )); then
  echo "[ERROR] Too few arguments. Expected 6 command line arguments." 1>&2
  echo "Usage: $0 <language 1> <language 2> <source txt file> <target txt file> <directory of trained model> <output file path>" 1>&2
  exit 1
fi

# fixed paths
readonly CDEC=/mnt/storage/clfiles/resources/paths/CL_applications/mt/cdec/vGithub
readonly MOSES=/mnt/storage/clfiles/resources/applications/mt/moses/v3.0/scripts
readonly FAST_ALIGN=/mnt/storage/clfiles/resources/applications/aligner/fast_align/vGit/build

TMP=$(mktemp -d) || die "Could not create a temporary directory."
readonly TMP

# cleanup: remove the temporary directory on every exit path
# (set KEEP_TMP=1 to keep it for debugging)
cleanup() {
  if [[ -z "${KEEP_TMP:-}" ]]; then
    rm -rf -- "${TMP:?}"
  fi
}
trap cleanup EXIT

# fixed variables for word alignment
export OMP_NUM_THREADS="4"

# languages involved
lang1=$1
lang2=$2

# path to input files
sourcetok=$3
targettok=$4

# path to trained fast_align model
modeldir=$5

# path where alignments should be written
outputpath=$6

# common prefixes of the intermediate files
prefix="$TMP/${lang1}-${lang2}"
pasted="$TMP/test.${lang1}-${lang2}"
model="$modeldir/corpus.${lang1}-${lang2}"

# tokenization is already supplied
#tokenize_command="$MOSES/tokenizer/tokenizer.perl -q"
#cat $sourcetxt | $tokenize_command -l $lang1 > $TMP/$lang1-$lang2.tok.$lang1 & # 2> $TMP/apply.log &
#cat $targettxt | $tokenize_command -l $lang2 > $TMP/$lang1-$lang2.tok.$lang2 & # 2> $TMP/apply.log &
cat -- "$sourcetok" > "$prefix.tok.$lang1" \
  || die "Could not read source file: $sourcetok"
cat -- "$targettok" > "$prefix.tok.$lang2" \
  || die "Could not read target file: $targettok"

# lowercase (both languages in parallel)
pids=()
for lang in "$lang1" "$lang2"; do
  "$MOSES/tokenizer/lowercase.perl" \
    < "$prefix.tok.$lang" > "$prefix.tok.lc.$lang" & # 2>> $TMP/apply.log &
  pids+=("$!")
done

# Wait for each job by PID: a bare `wait` would discard the exit statuses.
for pid in "${pids[@]}"; do
  wait "$pid" || die "Lowercasing failed."
done

# convert to two-column format
perl "$CDEC/corpus/paste-files.pl" "$prefix.tok.lc.$lang1" "$prefix.tok.lc.$lang2" \
  > "$pasted" || die "paste-files.pl failed." # 2>> $TMP/apply.log

# force-align with existing model
# The command is kept in an array so that paths containing whitespace survive.
cmd=(
  "$FAST_ALIGN/force_align.py"
  "$model.fwd_params" "$model.fwd_err"
  "$model.rev_params" "$model.rev_err"
)

echo "Executing ${cmd[*]} < $pasted > $outputpath" 1>&2

"${cmd[@]}" < "$pasted" > "$outputpath" || die "force_align.py failed."

# combine sentences and alignments into single *.in file
#$CDEC/corpus/paste-files.pl $TMP/$lang1-$lang2.tok.lc.$lang1 $TMP/$lang1-$lang2.tok.lc.$lang2 $outputdir/test.$lang1-$lang2.gdfa > $outputdir/test.$lang1-$lang2.in
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment