Fairseq preprocessing for multilingual denoising (e.g., mBART)
#!/bin/bash
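# Stop at the first failing command; 'pipefail' also surfaces failures that
# happen inside a pipeline (a defensive default for a multi-step pipeline).
set -eo pipefail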
#-----------------------------------------------------------------------------------------------------------------------
# This script contains the preprocessing pipeline for some predefined datasets.
# 1. It learns a joint sentencepiece model on a subset of the (training-side) data
# 2. It tokenizes all the data with the sentencepiece model
# 3. It binarizes the tokenized data for training with fairseq
#-----------------------------------------------------------------------------------------------------------------------
DATASET=$1
#DATASET=flores_neen
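# Fail fast with a usage message when no dataset key is given
# (the valid keys are the ones handled in the dispatch below).
if [ -z "$DATASET" ]; then
echo "usage: $0 DATASET (one of: flores_sien, flores_neen, wmt14_fren, wmt16_deen, wmt16_roen)" >&2
exit 1
fi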
# main paths
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
MAIN_PATH=$(readlink -f "$DIR")
TOOLS_PATH=$MAIN_PATH/tools
DATA_PATH=$MAIN_PATH/data
VOCAB_PATH=$MAIN_PATH/vocab
BIN_PATH=$MAIN_PATH/data-bin
TOK_PATH=$MAIN_PATH/tok
# create paths
mkdir -p "$MAIN_PATH" "$TOOLS_PATH" "$DATA_PATH" "$BIN_PATH" "$TOK_PATH" "$VOCAB_PATH"
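# SentencePiece settings: how many sentences to sub-sample per language for
# SPM training, the character coverage (lowered below for the non-Latin
# scripts), and the default file that holds the sub-sampled SPM training data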
SPM_SUBSAMPLE=5000000
SPM_COVERAGE=1
SPM_DATA=spm.$DATASET.train
#---------------------------------------------------------------------------
# define dataset fname paths
#---------------------------------------------------------------------------
if [ "$DATASET" == "flores_sien" ]; then
VOCAB_SIZE=20000
SPM_COVERAGE=0.9995
L1=si
L2=en
TRAIN_RAW_L1=$DATA_PATH/flores/mono/mono.norm.sample5000000.$L1
TRAIN_RAW_L2=$DATA_PATH/flores/mono/mono.sample5000000.$L2
VALID_RAW_L2=$DATA_PATH/flores/wiki_si_en_bpe5000/valid.$L2
VALID_RAW_L1=$DATA_PATH/flores/wiki_si_en_bpe5000/valid.$L1
# The commented-out sub-sampling below also maintains a 1:1 ratio between the languages
# echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
# shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_si_en_bpe5000/train.$L1 >$SPM_DATA
# shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_si_en_bpe5000/train.$L2 >>$SPM_DATA
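# spm_train accepts a comma-separated list of input files, so the full raw
# files are passed directly; the shuf-based sub-sampling above is kept for reference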
SPM_DATA="${TRAIN_RAW_L1},${TRAIN_RAW_L2}"
elif [ "$DATASET" == "flores_neen" ]; then
VOCAB_SIZE=20000
SPM_COVERAGE=0.9995
L1=ne
L2=en
TRAIN_RAW_L1=$DATA_PATH/flores/mono/mono.norm.sample5000000.$L1
TRAIN_RAW_L2=$DATA_PATH/flores/mono/mono.sample5000000.$L2
VALID_RAW_L2=$DATA_PATH/flores/wiki_ne_en_bpe5000/valid.$L2
VALID_RAW_L1=$DATA_PATH/flores/wiki_ne_en_bpe5000/valid.$L1
# The commented-out sub-sampling below also maintains a 1:1 ratio between the languages
# echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
# shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_ne_en_bpe5000/train.$L1 >$SPM_DATA
# shuf -n $SPM_SUBSAMPLE $DATA_PATH/flores/wiki_ne_en_bpe5000/train.$L2 >>$SPM_DATA
SPM_DATA="${TRAIN_RAW_L1},${TRAIN_RAW_L2}"
elif [ "$DATASET" == "wmt14_fren" ]; then
VOCAB_SIZE=60000
L1=fr
L2=en
TRAIN_RAW_L1=$DATA_PATH/wmt_unmt/mono/fr/all.fr.normal
TRAIN_RAW_L2=$DATA_PATH/wmt_unmt/mono/en/all.en.normal
VALID_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.fr
VALID_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.en
echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
shuf -n $SPM_SUBSAMPLE $TRAIN_RAW_L1 >$SPM_DATA
shuf -n $SPM_SUBSAMPLE $TRAIN_RAW_L2 >>$SPM_DATA
elif [ "$DATASET" == "wmt16_deen" ]; then
VOCAB_SIZE=60000
L1=de
L2=en
TRAIN_RAW_L1=$DATA_PATH/wmt_unmt/mono/de/all.de.normal
TRAIN_RAW_L2=$DATA_PATH/wmt_unmt/mono/en/all.en.normal
VALID_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2016-ende-ref.de
VALID_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2016-deen-ref.en
TEST_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.de
TEST_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newstest2013-ref.en
echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
shuf -n $SPM_SUBSAMPLE $TRAIN_RAW_L1 >$SPM_DATA
shuf -n $SPM_SUBSAMPLE $TRAIN_RAW_L2 >>$SPM_DATA
elif [ "$DATASET" == "wmt16_roen" ]; then
VOCAB_SIZE=60000
L1=ro
L2=en
TRAIN_RAW_L1=$DATA_PATH/wmt_unmt/mono/ro/all.ro.normal
TRAIN_RAW_L2=$DATA_PATH/wmt_unmt/mono/en/all.en.normal
VALID_RAW_L1=$DATA_PATH/wmt_unmt/para/evaluation_sets/newsdev2016-enro-ref.ro
VALID_RAW_L2=$DATA_PATH/wmt_unmt/para/evaluation_sets/newsdev2016-roen-ref.en
echo "Sub-sampling $SPM_SUBSAMPLE sentences from the data of each language to train sentencepiece..."
shuf -n $SPM_SUBSAMPLE $TRAIN_RAW_L1 >$SPM_DATA
shuf -n $SPM_SUBSAMPLE $TRAIN_RAW_L2 >>$SPM_DATA
else
echo "unknown dataset key: '$DATASET'"
exit 1
fi
#---------------------------------------------------------------------------
# For logging purposes
#---------------------------------------------------------------------------
echo ""
echo "---------------------------------"
echo "MAIN_PATH: $MAIN_PATH"
echo "TOOLS_PATH: $TOOLS_PATH"
echo "DATA_PATH: $DATA_PATH"
echo "BIN_PATH: $BIN_PATH"
echo "TOK_PATH: $TOK_PATH"
echo "VOCAB_PATH: $VOCAB_PATH"
echo "---------------------------------"
echo "DATASET: $DATASET"
echo "VOCAB_SIZE: $VOCAB_SIZE"
echo "SPM_DATA: $SPM_DATA"
echo "---------------------------------"
#read -n 1 -s -r -p "If this looks good, press any key to continue..."
# Paths of tokenized data per language
NAME=mono.$DATASET.$VOCAB_SIZE
SPM=$VOCAB_PATH/$DATASET.$VOCAB_SIZE
mkdir -p $TOK_PATH/$NAME
# Path with binarized data
PROC_PATH=$BIN_PATH/pretraining.$NAME
TRAIN_TOK_L1=$TOK_PATH/$NAME/train.$L1
TRAIN_TOK_L2=$TOK_PATH/$NAME/train.$L2
VALID_TOK_L1=$TOK_PATH/$NAME/valid.$L1
VALID_TOK_L2=$TOK_PATH/$NAME/valid.$L2
#---------------------------------------------------------------------------
# 1. Train the sentencepiece model (SPM)
#---------------------------------------------------------------------------
if [ ! -f "$SPM.model" ]; then
echo "Training SPM..."
spm_train --input=$SPM_DATA \
--vocab_size=$VOCAB_SIZE \
--character_coverage=$SPM_COVERAGE \
--max_sentence_length=256 \
--model_prefix=$SPM \
--model_type=unigram
# --model_type=unigram --input_sentence_size=10000000 --shuffle_input_sentence=true
# convert the SPM vocab to a fairseq dictionary for later use: skip the
# first 3 SPM special tokens (<unk>, <s>, </s>), which fairseq adds on its
# own, and append a dummy frequency to every remaining piece
cut -f1 $SPM.vocab | tail -n +4 | sed "s/$/ 100/g" >$SPM.dict.txt
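# a dict.txt line then looks like "▁the 100"; the 100 is a dummy count,
# present only because fairseq's dictionary format requires one per token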
else
echo "$SPM.model already trained."
fi
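# Optional sanity check (commented out): encode one validation sentence and
# eyeball the pieces before tokenizing millions of lines.
# head -n 1 "$VALID_RAW_L1" | spm_encode --model="$SPM.model" --output_format=piece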
# remove the temporary sub-sampled SPM training file, if one was created
# (for the flores datasets SPM_DATA points at the raw files themselves)
if [ -f "$SPM_DATA" ]; then rm "$SPM_DATA"; fi
#---------------------------------------------------------------------------
# 2. Use the SPM to tokenize the data
#---------------------------------------------------------------------------
echo "Tokenizing..."
echo " - $TRAIN_TOK_L1"
if [ ! -f "$TRAIN_TOK_L1" ]; then spm_encode --model=$SPM.model --output_format=piece <$TRAIN_RAW_L1 >$TRAIN_TOK_L1; fi
echo "done!"
echo " - $TRAIN_TOK_L2"
if [ ! -f "$TRAIN_TOK_L2" ]; then spm_encode --model=$SPM.model --output_format=piece <$TRAIN_RAW_L2 >$TRAIN_TOK_L2; fi
echo "done!"
echo " - $VALID_TOK_L1"
if [ ! -f "$VALID_TOK_L1" ]; then spm_encode --model=$SPM.model --output_format=piece <$VALID_RAW_L1 >$VALID_TOK_L1; fi
echo "done!"
echo " - $VALID_TOK_L2"
if [ ! -f "$VALID_TOK_L2" ]; then spm_encode --model=$SPM.model --output_format=piece <$VALID_RAW_L2 >$VALID_TOK_L2; fi
echo "done!"
#---------------------------------------------------------------------------
# 3. Binarize the tokenized data
#---------------------------------------------------------------------------
echo "Binarizing..."
fairseq-preprocess --only-source \
--trainpref $TRAIN_TOK_L1 \
--validpref $VALID_TOK_L1 \
--destdir $PROC_PATH/$L1 \
--srcdict $SPM.dict.txt \
--bpe sentencepiece \
--workers 10
fairseq-preprocess --only-source \
--trainpref $TRAIN_TOK_L2 \
--validpref $VALID_TOK_L2 \
--destdir $PROC_PATH/$L2 \
--srcdict $SPM.dict.txt \
--bpe sentencepiece \
--workers 10
cp "$SPM.dict.txt" "$PROC_PATH/dict.txt"
cp "$SPM.model" "$PROC_PATH/spm.model"
cp "$SPM.vocab" "$PROC_PATH/spm.vocab"
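#---------------------------------------------------------------------------
# Example follow-up (a sketch, not part of this pipeline): the layout above,
# one sub-directory per language plus a shared dict.txt, matches what
# fairseq's multilingual_denoising task expects for mBART-style pretraining.
# Exact flag names and hyper-parameters depend on your fairseq version.
#---------------------------------------------------------------------------
# fairseq-train $PROC_PATH \
#     --task multilingual_denoising \
#     --langs "$L1,$L2" \
#     --arch mbart_base \
#     ... # add model/optimization flags here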