rpryzant/process.sh

## process.sh
#! /usr/bin/env bash
# Processes the v2 subtitles/aspec corpora into one combined corpus
# Requirements
# - Processed subtitlesv2 corpus
# - ASPEC

# Usage: process.sh CORPUS1 CORPUS2 TARGET
#   CORPUS1, CORPUS2 - corpus directory names, relative to LOC
#   TARGET           - target language = [ja, zh, fr]
# Refuse to run with missing/empty arguments: otherwise every path below
# collapses to LOC itself and later steps would run on the wrong directory.
if (( $# < 3 )) || [[ -z "$1" || -z "$2" || -z "$3" ]]; then
  printf 'Usage: %s CORPUS1 CORPUS2 TARGET\n' "${0##*/}" >&2
  exit 2
fi

CORPUS1=$1
CORPUS2=$2
TARGET=$3    # target language = [ja, zh, fr]
LOC="/scr/rpryzant/chinese_english_corpora/"


# Input directories of the two corpora and the directory for the combined output.
A_PATH="${LOC}${CORPUS1}"
B_PATH="${LOC}${CORPUS2}"
OUTPUT_DIR="${LOC}${CORPUS1}_${CORPUS2}_processed"

# Quoted so names containing spaces or glob characters stay one argument.
mkdir -p -- "$OUTPUT_DIR" || {
  printf 'cannot create output directory: %s\n' "$OUTPUT_DIR" >&2
  exit 1
}


######### SET UP CORPUS 1 AND CORPUS 2

#######################################
# Shuffles one parallel corpus and splits it into dev/test/train files.
# Reads the line-aligned files "en" and "<target>" in the corpus directory
# and writes, in that same directory:
#   corpus.shuf                      shuffled, tab-joined sentence pairs
#   corpus.shuf.{en,<target>}        the shuffled pairs, one file per side
#   {dev,test,train}.{en,<target>}   the three splits
# Arguments:
#   $1 - corpus directory
#   $2 - target language suffix (name of the non-English file)
# Returns:
#   0 on success; non-zero if the directory or an input file is missing,
#   or if any processing step fails.
#######################################
prepare_corpus() {
  local corpus_dir=$1
  local target=$2
  local heldout=1000   # sentence pairs held out for dev, and again for test
  local lang total

  # Subshell: the cd and pipefail stay local, so the caller's working
  # directory and shell options are left untouched.
  (
    set -o pipefail
    cd "$corpus_dir" || exit 1
    if [[ ! -r en || ! -r "$target" ]]; then
      printf 'prepare_corpus: need readable "en" and "%s" in %s\n' \
        "$target" "$corpus_dir" >&2
      exit 1
    fi

    # Shuffle both sides together so the pairs stay aligned, then remove
    # examples whose en or target side is the empty string.
    paste en "$target" \
      | shuf \
      | sed -e '/^$/d' -e '/^\t.*$/d' -e '/^.*\t$/d' \
      > corpus.shuf || exit 1

    # Separate the tab-joined pairs back into en/target.
    cut -f1 corpus.shuf > corpus.shuf.en || exit 1
    cut -f2 corpus.shuf > "corpus.shuf.${target}" || exit 1

    total=$(wc -l < corpus.shuf) || exit 1
    if (( total <= 2 * heldout )); then
      printf 'prepare_corpus: warning: %s has only %d pairs; train will be empty\n' \
        "$corpus_dir" "$total" >&2
    fi

    # Split into train/dev/test:
    #   lines 1..1000 are dev, 1001..2000 are test, 2001.. are train.
    # "tail -n +K" starts AT line K, hence the +1 offsets; without them one
    # pair is shared between dev/test and another between test/train.
    for lang in en "$target"; do
      head -n "$heldout" "corpus.shuf.${lang}" > "dev.${lang}" || exit 1
      head -n "$((2 * heldout))" "corpus.shuf.${lang}" \
        | tail -n "+$((heldout + 1))" > "test.${lang}" || exit 1
      tail -n "+$((2 * heldout + 1))" "corpus.shuf.${lang}" \
        > "train.${lang}" || exit 1
    done
  )
}

prepare_corpus "$A_PATH" "$TARGET" || {
  printf 'failed to prepare corpus 1: %s\n' "$A_PATH" >&2
  exit 1
}
prepare_corpus "$B_PATH" "$TARGET" || {
  printf 'failed to prepare corpus 2: %s\n' "$B_PATH" >&2
  exit 1
}


######### SET UP SHARED CORPUS
# Everything below writes relative paths (bpe.model, *.bpe.*), so abort if
# the output directory cannot be entered instead of writing somewhere else.
cd "$OUTPUT_DIR" || {
  printf 'cannot cd to output directory: %s\n' "$OUTPUT_DIR" >&2
  exit 1
}

# Learn one BPE model across both corpora and both languages.
# The --input list is quoted so it is always passed as a single argument.
spm_train \
  --input="${A_PATH}/train.en,${B_PATH}/train.en,${A_PATH}/train.${TARGET},${B_PATH}/train.${TARGET}" \
  --model_prefix=bpe \
  --vocab_size=32000 \
  --model_type=bpe \
  || { printf 'spm_train failed\n' >&2; exit 1; }

# Apply BPE to both corpora: every split, both language sides.
# Stop on the first failure so no truncated .bpe file is used downstream.
for data in train dev test; do
  for lang in "$TARGET" en; do
    spm_encode --model=bpe.model --output_format=piece \
      < "${A_PATH}/${data}.${lang}" \
      > "${data}.${CORPUS1}.bpe.${lang}" \
      || { printf 'spm_encode failed: %s %s.%s\n' "$CORPUS1" "$data" "$lang" >&2; exit 1; }
    spm_encode --model=bpe.model --output_format=piece \
      < "${B_PATH}/${data}.${lang}" \
      > "${data}.${CORPUS2}.bpe.${lang}" \
      || { printf 'spm_encode failed: %s %s.%s\n' "$CORPUS2" "$data" "$lang" >&2; exit 1; }
  done
done

# Create shuffled combined dataset.
# Each side is the concatenation corpus1 + corpus2; the sides are pasted
# together before shuffling so sentence pairs stay aligned, then split
# back into one file per language (column 1 = target, column 2 = en).
for data in train dev test; do
  paste \
    <(cat "${data}.${CORPUS1}.bpe.${TARGET}" "${data}.${CORPUS2}.bpe.${TARGET}") \
    <(cat "${data}.${CORPUS1}.bpe.en" "${data}.${CORPUS2}.bpe.en") \
    | shuf > "${data}.combined.bpe.shuf" \
    || { printf 'failed to combine %s split\n' "$data" >&2; exit 1; }
  cut -f1 "${data}.combined.bpe.shuf" > "${data}.combined.bpe.${TARGET}"
  cut -f2 "${data}.combined.bpe.shuf" > "${data}.combined.bpe.en"
done
	#! /usr/bin/env bash
	# Processes the v2 subtitles/aspec corpora into one combined corpus
	# Requirements
	# - Processed subtitlesv2 corpus
	# - ASPEC

	# Usage: process.sh CORPUS1 CORPUS2 TARGET
	#   CORPUS1, CORPUS2 - corpus directory names, relative to LOC
	#   TARGET           - target language = [ja, zh, fr]
	# Refuse to run with missing/empty arguments: otherwise every path below
	# collapses to LOC itself and later steps would run on the wrong directory.
	if (( $# < 3 )) || [[ -z "$1" || -z "$2" || -z "$3" ]]; then
	  printf 'Usage: %s CORPUS1 CORPUS2 TARGET\n' "${0##*/}" >&2
	  exit 2
	fi

	CORPUS1=$1
	CORPUS2=$2
	TARGET=$3 # target language = [ja, zh, fr]
	LOC="/scr/rpryzant/chinese_english_corpora/"


	# Input directories of the two corpora and the directory for the combined output.
	A_PATH="${LOC}${CORPUS1}"
	B_PATH="${LOC}${CORPUS2}"
	OUTPUT_DIR="${LOC}${CORPUS1}_${CORPUS2}_processed"

	# Quoted so names containing spaces or glob characters stay one argument.
	mkdir -p -- "$OUTPUT_DIR" || {
	  printf 'cannot create output directory: %s\n' "$OUTPUT_DIR" >&2
	  exit 1
	}


	######### SET UP CORPUS 1 AND CORPUS 2

	#######################################
	# Shuffles one parallel corpus and splits it into dev/test/train files.
	# Reads the line-aligned files "en" and "<target>" in the corpus directory
	# and writes, in that same directory:
	#   corpus.shuf                      shuffled, tab-joined sentence pairs
	#   corpus.shuf.{en,<target>}        the shuffled pairs, one file per side
	#   {dev,test,train}.{en,<target>}   the three splits
	# Arguments:
	#   $1 - corpus directory
	#   $2 - target language suffix (name of the non-English file)
	# Returns:
	#   0 on success; non-zero if the directory or an input file is missing,
	#   or if any processing step fails.
	#######################################
	prepare_corpus() {
	  local corpus_dir=$1
	  local target=$2
	  local heldout=1000   # sentence pairs held out for dev, and again for test
	  local lang total

	  # Subshell: the cd and pipefail stay local, so the caller's working
	  # directory and shell options are left untouched.
	  (
	    set -o pipefail
	    cd "$corpus_dir" || exit 1
	    if [[ ! -r en || ! -r "$target" ]]; then
	      printf 'prepare_corpus: need readable "en" and "%s" in %s\n' \
	        "$target" "$corpus_dir" >&2
	      exit 1
	    fi

	    # Shuffle both sides together so the pairs stay aligned, then remove
	    # examples whose en or target side is the empty string.
	    # (Real pipes here: an escaped "\|" would be passed to paste as a
	    # literal file name instead of connecting the commands.)
	    paste en "$target" \
	      | shuf \
	      | sed -e '/^$/d' -e '/^\t.*$/d' -e '/^.*\t$/d' \
	      > corpus.shuf || exit 1

	    # Separate the tab-joined pairs back into en/target.
	    cut -f1 corpus.shuf > corpus.shuf.en || exit 1
	    cut -f2 corpus.shuf > "corpus.shuf.${target}" || exit 1

	    total=$(wc -l < corpus.shuf) || exit 1
	    if (( total <= 2 * heldout )); then
	      printf 'prepare_corpus: warning: %s has only %d pairs; train will be empty\n' \
	        "$corpus_dir" "$total" >&2
	    fi

	    # Split into train/dev/test:
	    #   lines 1..1000 are dev, 1001..2000 are test, 2001.. are train.
	    # "tail -n +K" starts AT line K, hence the +1 offsets; without them one
	    # pair is shared between dev/test and another between test/train.
	    for lang in en "$target"; do
	      head -n "$heldout" "corpus.shuf.${lang}" > "dev.${lang}" || exit 1
	      head -n "$((2 * heldout))" "corpus.shuf.${lang}" \
	        | tail -n "+$((heldout + 1))" > "test.${lang}" || exit 1
	      tail -n "+$((2 * heldout + 1))" "corpus.shuf.${lang}" \
	        > "train.${lang}" || exit 1
	    done
	  )
	}

	prepare_corpus "$A_PATH" "$TARGET" || {
	  printf 'failed to prepare corpus 1: %s\n' "$A_PATH" >&2
	  exit 1
	}
	prepare_corpus "$B_PATH" "$TARGET" || {
	  printf 'failed to prepare corpus 2: %s\n' "$B_PATH" >&2
	  exit 1
	}






	######### SET UP SHARED CORPUS
	# Everything below writes relative paths (bpe.model, *.bpe.*), so abort if
	# the output directory cannot be entered instead of writing somewhere else.
	cd "$OUTPUT_DIR" || {
	  printf 'cannot cd to output directory: %s\n' "$OUTPUT_DIR" >&2
	  exit 1
	}

	# Learn one BPE model across both corpora and both languages.
	# The --input list is quoted so it is always passed as a single argument.
	spm_train \
	  --input="${A_PATH}/train.en,${B_PATH}/train.en,${A_PATH}/train.${TARGET},${B_PATH}/train.${TARGET}" \
	  --model_prefix=bpe \
	  --vocab_size=32000 \
	  --model_type=bpe \
	  || { printf 'spm_train failed\n' >&2; exit 1; }

	# Apply BPE to both corpora: every split, both language sides.
	# Stop on the first failure so no truncated .bpe file is used downstream.
	for data in train dev test; do
	  for lang in "$TARGET" en; do
	    spm_encode --model=bpe.model --output_format=piece \
	      < "${A_PATH}/${data}.${lang}" \
	      > "${data}.${CORPUS1}.bpe.${lang}" \
	      || { printf 'spm_encode failed: %s %s.%s\n' "$CORPUS1" "$data" "$lang" >&2; exit 1; }
	    spm_encode --model=bpe.model --output_format=piece \
	      < "${B_PATH}/${data}.${lang}" \
	      > "${data}.${CORPUS2}.bpe.${lang}" \
	      || { printf 'spm_encode failed: %s %s.%s\n' "$CORPUS2" "$data" "$lang" >&2; exit 1; }
	  done
	done

	# Create shuffled combined dataset.
	# Each side is the concatenation corpus1 + corpus2; the sides are pasted
	# together before shuffling so sentence pairs stay aligned, then split
	# back into one file per language (column 1 = target, column 2 = en).
	# (A real pipe into shuf: an escaped "\|" would hand "|" and "shuf" to
	# paste as file names.)
	for data in train dev test; do
	  paste \
	    <(cat "${data}.${CORPUS1}.bpe.${TARGET}" "${data}.${CORPUS2}.bpe.${TARGET}") \
	    <(cat "${data}.${CORPUS1}.bpe.en" "${data}.${CORPUS2}.bpe.en") \
	    | shuf > "${data}.combined.bpe.shuf" \
	    || { printf 'failed to combine %s split\n' "$data" >&2; exit 1; }
	  cut -f1 "${data}.combined.bpe.shuf" > "${data}.combined.bpe.${TARGET}"
	  cut -f2 "${data}.combined.bpe.shuf" > "${data}.combined.bpe.en"
	done