Skip to content

Instantly share code, notes, and snippets.

@rpryzant
Created April 6, 2017 18:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rpryzant/ba0a587a9e2adbff6166f7b9356d5b9f to your computer and use it in GitHub Desktop.
Save rpryzant/ba0a587a9e2adbff6166f7b9356d5b9f to your computer and use it in GitHub Desktop.
Conjoining nmt datasets
#! /usr/bin/env bash
# Processes the v2 subtitles/aspec corpora into one combined corpus
#
# Usage: <script> CORPUS1 CORPUS2 TARGET
#   CORPUS1, CORPUS2 - names of the corpus directories located under $LOC
#   TARGET           - target language code, e.g. ja, zh, fr
#
# Requirements
# - Processed subtitlesv2 corpus
# - ASPEC

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so that later steps never run on missing or half-written files.
set -euo pipefail

# All three arguments are needed to build the paths below; an empty value
# would silently collapse a corpus path to $LOC itself.
if (( $# < 3 )) || [[ -z "$1" || -z "$2" || -z "$3" ]]; then
  printf 'Usage: %s CORPUS1 CORPUS2 TARGET\n' "${0##*/}" >&2
  exit 2
fi

CORPUS1=$1
CORPUS2=$2
TARGET=$3 # target language = [ja, zh, fr]
# Root directory that contains every corpus (note the trailing slash: paths
# below are built by plain concatenation).
LOC="/scr/rpryzant/chinese_english_corpora/"
A_PATH="${LOC}${CORPUS1}"
B_PATH="${LOC}${CORPUS2}"
# Combined output lives next to the input corpora.
OUTPUT_DIR="${LOC}${CORPUS1}_${CORPUS2}_processed"
mkdir -p -- "$OUTPUT_DIR"
######### SET UP CORPUS 1 / SET UP CORPUS 2
#######################################
# Shuffles one parallel corpus and splits it into dev/test/train files.
# Globals:   TARGET (read) - target language code / file name
# Arguments: $1 - corpus directory holding the line-aligned files
#                 `en` and `$TARGET`
# Outputs:   inside $1: corpus.shuf, corpus.shuf.{en,$TARGET} and
#            {dev,test,train}.{en,$TARGET}
# Returns:   non-zero if the directory or an input file is unavailable
# Note:      changes the working directory; on success it ends in the
#            parent of $1.
#######################################
prepare_corpus() {
  local corpus_dir=$1
  local -r dev_size=1000
  local -r test_size=1000
  local lang

  cd "$corpus_dir" || return 1

  # paste would only warn about a missing file and shuf would still succeed,
  # so check the inputs explicitly.
  if [[ ! -r en || ! -r "${TARGET}" ]]; then
    printf 'error: %s must contain readable files "en" and "%s"\n' \
      "$corpus_dir" "${TARGET}" >&2
    return 1
  fi

  # Shuffle corpus (sentence pairs are joined by a tab so they stay aligned)
  paste en "${TARGET}" | shuf > corpus.shuf

  # Remove examples that are the empty string for either en or TARGET
  sed -i -e '/^$/d' -e '/^\t.*$/d' -e '/^.*\t$/d' corpus.shuf

  # Separate into en/TARGET
  cut -f1 corpus.shuf > corpus.shuf.en
  cut -f2 corpus.shuf > "corpus.shuf.${TARGET}"

  # Split into dev/test/train without overlap:
  #   lines 1..dev_size                      -> dev
  #   lines dev_size+1..dev_size+test_size   -> test
  #   everything after that                  -> train
  # (`tail -n +K` starts AT line K, hence the "+ 1" offsets.)
  for lang in en "${TARGET}"; do
    head -n "${dev_size}" "corpus.shuf.${lang}" > "dev.${lang}"
    head -n "$((dev_size + test_size))" "corpus.shuf.${lang}" \
      | tail -n "+$((dev_size + 1))" > "test.${lang}"
    tail -n "+$((dev_size + test_size + 1))" "corpus.shuf.${lang}" \
      > "train.${lang}"
  done

  cd ..
}

######### SET UP CORPUS 1
prepare_corpus "$A_PATH" || exit 1
######### SET UP CORPUS 2
prepare_corpus "$B_PATH" || exit 1
######### SET UP SHARED CORPUS
# Everything from here on is written relative to the output directory, so
# stop if it cannot be entered instead of writing into the wrong place.
cd "$OUTPUT_DIR" || exit 1
# Learn BPE across both corpora
# One joint model is trained on the train splits only (en and TARGET sides of
# both corpora); dev/test are not part of the input list.
spm_train \
  --input="${A_PATH}/train.en,${B_PATH}/train.en,${A_PATH}/train.${TARGET},${B_PATH}/train.${TARGET}" \
  --model_prefix=bpe \
  --vocab_size=32000 \
  --model_type=bpe \
  || { printf 'error: spm_train failed\n' >&2; exit 1; }
# Apply BPE to both corpora
# For every split, corpus and language, read <corpus dir>/<split>.<lang> and
# write <split>.<corpus name>.bpe.<lang> into the current directory.
bpe_corpus_names=("$CORPUS1" "$CORPUS2")
bpe_corpus_dirs=("$A_PATH" "$B_PATH")
for data in train dev test; do
  for idx in "${!bpe_corpus_names[@]}"; do
    for lang in "${TARGET}" en; do
      spm_encode --model=bpe.model --output_format=piece \
        < "${bpe_corpus_dirs[idx]}/${data}.${lang}" \
        > "${data}.${bpe_corpus_names[idx]}.bpe.${lang}" \
        || exit 1
    done
  done
done
# Create shuffled combined dataset
for data in train dev test; do
  # Concatenate corpus 1 + corpus 2 per language, glue TARGET (column 1) to
  # en (column 2) with a tab, and shuffle whole pairs so alignment is kept.
  paste \
    <(cat "${data}.${CORPUS1}.bpe.${TARGET}" "${data}.${CORPUS2}.bpe.${TARGET}") \
    <(cat "${data}.${CORPUS1}.bpe.en" "${data}.${CORPUS2}.bpe.en") \
    | shuf > "${data}.combined.bpe.shuf"
  # Split the shuffled pairs back into one file per language
  cut -f1 "${data}.combined.bpe.shuf" > "${data}.combined.bpe.${TARGET}"
  cut -f2 "${data}.combined.bpe.shuf" > "${data}.combined.bpe.en"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment