Skip to content

Instantly share code, notes, and snippets.

@shijieyao
Created June 13, 2018 16:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shijieyao/389e9656758d58fdf164d07a0dcb1dba to your computer and use it in GitHub Desktop.
#!/bin/bash
# Train and evaluate a Nematus NMT model for one <lang>/<type> pair.
#
# Usage: <script> <lang> <type> <experiment_id>
#
# Expects:
#   - a Python virtualenv in ./venv with Nematus' dependencies installed
#   - data under ./data/languages/<lang>-<type>/{train,dev,test}.{in,out}
#   - ./validate.sh (external validation script copied into the model dir)
#   - ./nematus/{nmt.py,translate.py} and ./data/build_dictionary.py
set -euo pipefail

source venv/bin/activate

# Fail fast with a usage message if any positional argument is missing.
lang=${1:?usage: $0 <lang> <type> <experiment_id>}
type=${2:?usage: $0 <lang> <type> <experiment_id>}
experiment_id=${3:?usage: $0 <lang> <type> <experiment_id>}

# Theano device; change to "cpu" if you do not want to compute on GPU.
device=gpu0

basedir=.
modeldir=${basedir}/models/${lang}-${type}-${experiment_id}
datadir=${basedir}/data/languages/${lang}-${type}

# -p creates parents, so one call makes both modeldir and modeldir/data.
mkdir -p "${modeldir}/data"

# Theano compilation cache directory (kept local to the working dir).
base_compiledir=theano
mkdir -p "${base_compiledir}"

cp "${basedir}/validate.sh" "${modeldir}/."

echo "Copying data sets"
cp "${datadir}"/train.* "${modeldir}/data/."
cp "${datadir}"/test.* "${modeldir}/data/."
cp "${datadir}"/dev.* "${modeldir}/data/."

echo "Building Dictionaries"
python "${basedir}/data/build_dictionary.py" \
  "${modeldir}/data/train.in" "${modeldir}/data/train.out"

# Model / training hyperparameters.
dim_word=300
dim=100
batch_size=60
maxlen=150
optimizer="adadelta"
dispFreq=100
max_epochs=1000

# Vocabulary sizes = dictionary line count minus one.
# NOTE(review): the "-1" presumably discounts a structural line of the
# JSON dictionary file — preserved from the original; confirm against
# build_dictionary.py's output format.
n_words_src=$(( $(wc -l < "${modeldir}/data/train.in.json") - 1 ))
n_words_trg=$(( $(wc -l < "${modeldir}/data/train.out.json") - 1 ))

# Validation frequency is measured in updates; convert epochs -> updates
# via (#training examples / batch size).
train_lines=$(wc -l < "${modeldir}/data/train.in")

validate_every_n_epochs=1   # increase to make training faster
valid_freq=$(( train_lines / batch_size * validate_every_n_epochs ))

burn_in_for_n_epochs=0      # increase to make training faster
validBurnIn=$(( train_lines * burn_in_for_n_epochs / batch_size ))

echo "Starting training"
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=${device},base_compiledir=${base_compiledir} \
python "${basedir}/nematus/nmt.py" \
--model "${modeldir}/model.npz" \
--datasets "${modeldir}/data/train.in" "${modeldir}/data/train.out" \
--valid_datasets "${modeldir}/data/dev.in" "${modeldir}/data/dev.out" \
--dictionaries "${modeldir}/data/train.in.json" "${modeldir}/data/train.out.json" \
--dim_word "${dim_word}" \
--dim "${dim}" \
--n_words_src "${n_words_src}" \
--n_words "${n_words_trg}" \
--maxlen "${maxlen}" \
--optimizer "${optimizer}" \
--batch_size "${batch_size}" \
--dispFreq "${dispFreq}" \
--max_epochs "${max_epochs}" \
--external_validation_script "${modeldir}/validate.sh" \
--weight_normalisation \
--reload \
--no_reload_training_progress \
--use_dropout \
--enc_depth 2 \
--dec_depth 2 \
--patience 10 \
--validBurnIn "${validBurnIn}" \
--validFreq "${valid_freq}" &>> "${modeldir}/training.log"
echo "End of training"

echo "Lemmatizing test set"
# Decode the test set with the best checkpoint (beam size 12, normalized
# scores, single process).
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=${device},on_unused_input=warn,base_compiledir=${base_compiledir} \
python "${basedir}/nematus/translate.py" \
-m "${modeldir}/best_model/model.npz" \
-i "${modeldir}/data/test.in" \
-o "${modeldir}/best_model/test-hypothesis" \
-k 12 -n -p 1
echo "Done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment