Last active
February 18, 2022 07:38
-
-
Save cbaziotis/8047f2077f1fad74709c7f82d87893a5 to your computer and use it in GitHub Desktop.
Fairseq preprocess for multilingual denoising (e.g., mBART)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#-----------------------------------------------------------------------------------------------------------------------
# This script contains the preprocessing pipeline for some predefined datasets.
# 1. It learns a joint sentencepiece model on a sub-set of the data (training-side)
# 2. It tokenizes all the data with the sentencepiece model
# 3. It binarizes them for training with fairseq
#
# Usage: preprocess.sh <dataset-key>   (see the dataset dispatch below for keys)
#-----------------------------------------------------------------------------------------------------------------------
set -o pipefail  # a pipeline fails if any stage fails (e.g. the vocab->dict conversion)

# Dataset key, e.g. flores_neen, wmt14_fren. ${1-} avoids an error under `set -u` callers.
DATASET=${1-}
#DATASET=flores_neen

# main paths — everything is resolved relative to the script's own directory
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
MAIN_PATH=$(readlink -f "$DIR")
TOOLS_PATH=$MAIN_PATH/tools
DATA_PATH=$MAIN_PATH/data
VOCAB_PATH=$MAIN_PATH/vocab
BIN_PATH=$MAIN_PATH/data-bin
TOK_PATH=$MAIN_PATH/tok

# create paths (quoted: paths may contain spaces)
mkdir -p "$MAIN_PATH" "$TOOLS_PATH" "$DATA_PATH" "$BIN_PATH" "$TOK_PATH" "$VOCAB_PATH"

# sentencepiece defaults; the per-dataset sections below may override these
SPM_SUBSAMPLE=5000000        # max sentences per language used to train the SPM
SPM_COVERAGE=1               # character_coverage passed to spm_train
SPM_DATA=spm.$DATASET.train  # default temp file for sub-sampled SPM training data
#---------------------------------------------------------------------------
# define dataset fname paths
#---------------------------------------------------------------------------
# Each dataset sets: VOCAB_SIZE, L1/L2, TRAIN_RAW_*/VALID_RAW_* paths, and
# SPM_DATA (either a temp sub-sample file it creates, or a comma-joined list
# of existing files to be fed straight to spm_train --input).
case "$DATASET" in
  flores_sien)
    VOCAB_SIZE=20000
    SPM_COVERAGE=0.9995  # <1.0: allow spm_train to drop ultra-rare characters
    L1=si
    L2=en
    TRAIN_RAW_L1=$DATA_PATH/flores/mono/mono.norm.sample5000000.$L1
    TRAIN_RAW_L2=$DATA_PATH/flores/mono/mono.sample5000000.$L2
    VALID_RAW_L2=$DATA_PATH/flores/wiki_si_en_bpe5000/valid.$L2
    VALID_RAW_L1=$DATA_PATH/flores/wiki_si_en_bpe5000/valid.$L1
    # The pre-sampled mono files already maintain a 1:1 ratio between languages
    # echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
    # shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_si_en_bpe5000/train.$L1 >$SPM_DATA
    # shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_si_en_bpe5000/train.$L2 >>$SPM_DATA
    SPM_DATA="${TRAIN_RAW_L1},${TRAIN_RAW_L2}"
    ;;
  flores_neen)
    VOCAB_SIZE=20000
    SPM_COVERAGE=0.9995  # <1.0: allow spm_train to drop ultra-rare characters
    L1=ne
    L2=en
    TRAIN_RAW_L1=$DATA_PATH/flores/mono/mono.norm.sample5000000.$L1
    TRAIN_RAW_L2=$DATA_PATH/flores/mono/mono.sample5000000.$L2
    VALID_RAW_L2=$DATA_PATH/flores/wiki_ne_en_bpe5000/valid.$L2
    VALID_RAW_L1=$DATA_PATH/flores/wiki_ne_en_bpe5000/valid.$L1
    # The pre-sampled mono files already maintain a 1:1 ratio between languages
    # echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
    # shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_ne_en_bpe5000/train.$L1 >$SPM_DATA
    # shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_ne_en_bpe5000/train.$L2 >>$SPM_DATA
    SPM_DATA="${TRAIN_RAW_L1},${TRAIN_RAW_L2}"
    ;;
  wmt14_fren)
    VOCAB_SIZE=60000
    L1=fr
    L2=en
    TRAIN_RAW_L1=$DATA_PATH/wmt_unmt/mono/fr/all.fr.normal
    TRAIN_RAW_L2=$DATA_PATH/wmt_unmt/mono/en/all.en.normal
    VALID_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.fr
    VALID_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.en
    # Sub-sample equal amounts per language into a temp SPM training file (1:1 ratio)
    echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
    shuf -n "$SPM_SUBSAMPLE" "$TRAIN_RAW_L1" >"$SPM_DATA"
    shuf -n "$SPM_SUBSAMPLE" "$TRAIN_RAW_L2" >>"$SPM_DATA"
    ;;
  wmt16_deen)
    VOCAB_SIZE=60000
    L1=de
    L2=en
    TRAIN_RAW_L1=$DATA_PATH/wmt_unmt/mono/de/all.de.normal
    TRAIN_RAW_L2=$DATA_PATH/wmt_unmt/mono/en/all.en.normal
    VALID_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2016-ende-ref.de
    VALID_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2016-deen-ref.en
    TEST_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.de
    TEST_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.en
    # Sub-sample equal amounts per language into a temp SPM training file (1:1 ratio)
    echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
    shuf -n "$SPM_SUBSAMPLE" "$TRAIN_RAW_L1" >"$SPM_DATA"
    shuf -n "$SPM_SUBSAMPLE" "$TRAIN_RAW_L2" >>"$SPM_DATA"
    ;;
  wmt16_roen)
    VOCAB_SIZE=60000
    L1=ro
    L2=en
    TRAIN_RAW_L1=$DATA_PATH/wmt_unmt/mono/ro/all.ro.normal
    TRAIN_RAW_L2=$DATA_PATH/wmt_unmt/mono/en/all.en.normal
    VALID_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newsdev2016-enro-ref.ro
    VALID_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newsdev2016-roen-ref.en
    # Sub-sample equal amounts per language into a temp SPM training file (1:1 ratio)
    echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
    shuf -n "$SPM_SUBSAMPLE" "$TRAIN_RAW_L1" >"$SPM_DATA"
    shuf -n "$SPM_SUBSAMPLE" "$TRAIN_RAW_L2" >>"$SPM_DATA"
    ;;
  *)
    # Abort here: without this exit the script used to fall through with
    # L1/L2/VOCAB_SIZE unset and fail confusingly much later.
    echo "unknown dataset key: '$DATASET'" >&2
    exit 1
    ;;
esac
#---------------------------------------------------------------------------
# For logging purposes
#---------------------------------------------------------------------------
echo ""
echo "---------------------------------"
echo "MAIN_PATH: $MAIN_PATH"
echo "TOOLS_PATH: $TOOLS_PATH"
echo "DATA_PATH: $DATA_PATH"
echo "BIN_PATH: $BIN_PATH"
echo "TOK_PATH: $TOK_PATH"
echo "VOCAB_PATH: $VOCAB_PATH"
echo "---------------------------------"
echo "DATASET: $DATASET"
echo "VOCAB_SIZE: $VOCAB_SIZE"
echo "SPM_DATA: $SPM_DATA"
echo "---------------------------------"
#read -n 1 -s -r -p "If this looks good, press any key to continue..."

# Paths of tokenized data per language
NAME=mono.$DATASET.$VOCAB_SIZE        # dataset tag, e.g. mono.flores_neen.20000
SPM=$VOCAB_PATH/$DATASET.$VOCAB_SIZE  # sentencepiece model prefix (-> $SPM.model/.vocab)
mkdir -p "$TOK_PATH/$NAME"            # quoted: paths may contain spaces

# Path with binarized data
PROC_PATH=$BIN_PATH/pretraining.$NAME
TRAIN_TOK_L1=$TOK_PATH/$NAME/train.$L1
TRAIN_TOK_L2=$TOK_PATH/$NAME/train.$L2
VALID_TOK_L1=$TOK_PATH/$NAME/valid.$L1
VALID_TOK_L2=$TOK_PATH/$NAME/valid.$L2
#---------------------------------------------------------------------------
# 1. Train the sentencepiece model (SPM)
#---------------------------------------------------------------------------
if [ ! -f "$SPM.model" ]; then
  echo "Training SPM..."
  spm_train --input="$SPM_DATA" \
    --vocab_size="$VOCAB_SIZE" \
    --character_coverage="$SPM_COVERAGE" \
    --max_sentence_length=256 \
    --model_prefix="$SPM" \
    --model_type=unigram
  # --model_type=unigram --input_sentence_size=10000000 --shuffle_input_sentence=true
  # convert SPM vocab to fairseq dict for later use:
  # drop the 3 special tokens (rows 1-3) and append a dummy frequency of 100
  cut -f1 "$SPM.vocab" | tail -n +4 | sed "s/$/ 100/g" >"$SPM.dict.txt"
else
  echo "$SPM.model already trained."
fi

# Remove the temporary sub-sampled SPM training file, if one was created.
# NOTE: for the flores datasets SPM_DATA is a comma-joined list of the
# *permanent* training files, not a temp file — the -f guard makes sure we
# never try to rm that (the old unconditional rm always errored there).
if [ -f "$SPM_DATA" ]; then
  rm -- "$SPM_DATA"
fi
#---------------------------------------------------------------------------
# 2. Use the SPM to tokenize the data
#---------------------------------------------------------------------------
# Encode one raw text file into sentencepiece pieces with the trained model.
# Skips the work if the output already exists (cheap resume).
#   $1 - raw input file
#   $2 - tokenized output file
encode_with_spm() {
  echo " - $2"
  if [ ! -f "$2" ]; then
    spm_encode --model="$SPM.model" --output_format=piece <"$1" >"$2"
  fi
  echo "done!"
}

echo "Tokenizing..."
encode_with_spm "$TRAIN_RAW_L1" "$TRAIN_TOK_L1"
encode_with_spm "$TRAIN_RAW_L2" "$TRAIN_TOK_L2"
encode_with_spm "$VALID_RAW_L1" "$VALID_TOK_L1"
encode_with_spm "$VALID_RAW_L2" "$VALID_TOK_L2"
#---------------------------------------------------------------------------
# 3. Binarize the tokenized data
#---------------------------------------------------------------------------
echo "Binarizing..."
# One monolingual (--only-source) fairseq dataset per language; both share the
# SPM-derived dictionary so the two sides live in the same joint vocabulary.
fairseq-preprocess --only-source \
  --trainpref "$TRAIN_TOK_L1" \
  --validpref "$VALID_TOK_L1" \
  --destdir "$PROC_PATH/$L1" \
  --srcdict "$SPM.dict.txt" \
  --bpe sentencepiece \
  --workers 10
fairseq-preprocess --only-source \
  --trainpref "$TRAIN_TOK_L2" \
  --validpref "$VALID_TOK_L2" \
  --destdir "$PROC_PATH/$L2" \
  --srcdict "$SPM.dict.txt" \
  --bpe sentencepiece \
  --workers 10

# Ship the shared dictionary and the SPM model/vocab alongside the binarized
# data, so downstream training/decoding can find them next to the dataset.
cp "$SPM.dict.txt" "$PROC_PATH/dict.txt"
cp "$SPM.model" "$PROC_PATH/spm.model"
cp "$SPM.vocab" "$PROC_PATH/spm.vocab"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment