Last active
October 3, 2017 18:34
-
-
Save entn-at/486562edfa01f99a07a03d9c905a1a50 to your computer and use it in GitHub Desktop.
NIST SRE 2010 proof-of-concept recipe (C5 core extended) based on Xvectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Copyright 2017  Ewald Enzinger
#           2017  David Snyder
#           2017  Johns Hopkins University (Author: Daniel Garcia-Romero)
#           2017  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# This example demonstrates a "proof-of-concept" NIST SRE 2010 recipe using xvectors.

# Source from the current directory explicitly rather than via PATH lookup.
. ./cmd.sh
. ./path.sh
set -e

# Where filterbank features and VAD decisions are written (shared directory).
fbankdir=$(pwd)/fbank
vaddir=$(pwd)/fbank
# Trials file for the SRE10 evaluation condition being scored.
trials=data/sre10_test/trials
# Directory in which the xvector DNN is trained and stored.
nnet_dir=exp/xvector_nnet_1a
# Resume control: stages with a number below $stage are skipped.
stage=0
if [ "$stage" -le 0 ]; then
  # Path to some, but not all of the training corpora.
  # Prepare the SRE 2010 evaluation data (enrollment and test sides).
  local/make_sre_2010_test.pl /export/corpora5/SRE/SRE2010/eval/ data/
  local/make_sre_2010_train.pl /export/corpora5/SRE/SRE2010/eval/ data/
  # Prepare prior SRE corpora (used later for PLDA training).
  local/make_sre.sh data
  # Prepare Fisher English parts 1 and 2, then merge them into data/fisher.
  local/make_fisher.sh /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2004T19 data/fisher1
  local/make_fisher.sh /export/corpora3/LDC/LDC2005S13 /export/corpora3/LDC/LDC2005T19 data/fisher2
  utils/combine_data.sh data/fisher data/fisher1 data/fisher2
  # Prepare SWB for UBM and i-vector extractor training.
  local/make_swbd2_phase2.pl /export/corpora5/LDC/LDC99S79 \
    data/swbd2_phase2_train
  local/make_swbd2_phase3.pl /export/corpora5/LDC/LDC2002S06 \
    data/swbd2_phase3_train
  local/make_swbd_cellular1.pl /export/corpora5/LDC/LDC2001S13 \
    data/swbd_cellular1_train
  local/make_swbd_cellular2.pl /export/corpora5/LDC/LDC2004S07 \
    data/swbd_cellular2_train
  # Combine all SWB corpora into one dataset.
  utils/combine_data.sh data/swbd \
    data/swbd_cellular1_train data/swbd_cellular2_train \
    data/swbd2_phase2_train data/swbd2_phase3_train

  # Sanity-check and clean up the SRE data dir before feature extraction.
  utils/validate_data_dir.sh --no-text --no-feats data/sre
  utils/fix_data_dir.sh data/sre
fi
if [ "$stage" -le 1 ]; then
  # Make filterbanks and compute the energy-based VAD for each dataset.
  for name in sre swbd fisher sre10_test sre10_train; do
    steps/make_fbank.sh --fbank-config conf/fbank.conf --nj 40 --cmd "$train_cmd" \
      "data/${name}" exp/make_fbank "$fbankdir"
    utils/fix_data_dir.sh "data/${name}"
    sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
      "data/${name}" exp/make_vad "$vaddir"
    utils/fix_data_dir.sh "data/${name}"
  done
  # Pool SWB, Fisher, and SRE into a single list for xvector DNN training.
  utils/combine_data.sh data/swbd_fsh_sre data/swbd data/fisher data/sre
  utils/fix_data_dir.sh data/swbd_fsh_sre
fi
# In this section, we augment the SWBD, Fisher, and SRE data with reverberation,
# noise, music, and babble, and combined it with the clean data.
# The combined list will be used to train the xvector DNN. The SRE
# subset will be used to train the PLDA model.
if [ "$stage" -le 2 ]; then
  utils/data/get_utt2num_frames.sh --nj 40 --cmd "$train_cmd" data/swbd_fsh_sre
  # Convert frame counts to durations in seconds (10 ms frame shift),
  # as required by the reverberation/augmentation scripts below.
  frame_shift=0.01
  awk -v frame_shift="$frame_shift" '{print $1, $2*frame_shift;}' data/swbd_fsh_sre/utt2num_frames > data/swbd_fsh_sre/reco2dur

  if [ ! -d "RIRS_NOISES" ]; then
    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
    unzip rirs_noises.zip
  fi

  # Make a version with reverberated speech
  rvb_opts=()
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")

  # Make a reverberated version of the SWBD+SRE list. Note that we don't add any
  # additive noise here.
  python steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 0 \
    --isotropic-noise-addition-probability 0 \
    --num-replications 1 \
    --source-sampling-rate 8000 \
    data/swbd_fsh_sre data/swbd_fsh_sre_reverb
  # Reverberation does not change speech/non-speech segmentation, so the clean
  # VAD decisions are reused for the reverberated copies.
  cp data/swbd_fsh_sre/vad.scp data/swbd_fsh_sre_reverb/
  utils/copy_data_dir.sh --utt-suffix "-reverb" data/swbd_fsh_sre_reverb data/swbd_fsh_sre_reverb.new
  rm -rf data/swbd_fsh_sre_reverb
  mv data/swbd_fsh_sre_reverb.new data/swbd_fsh_sre_reverb

  # Prepare the MUSAN corpus, which consists of music, speech, and noise
  # suitable for augmentation.
  local/make_musan.sh /home/ubuntu/export/data/musan data

  # Get the duration of the MUSAN recordings. This will be used by the
  # script augment_data_dir.py.
  for name in speech noise music; do
    utils/data/get_utt2dur.sh "data/musan_${name}"
    mv "data/musan_${name}/utt2dur" "data/musan_${name}/reco2dur"
  done

  # Augment with musan_noise
  python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/swbd_fsh_sre data/swbd_fsh_sre_noise
  # Augment with musan_music
  python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/swbd_fsh_sre data/swbd_fsh_sre_music
  # Augment with musan_speech
  python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/swbd_fsh_sre data/swbd_fsh_sre_babble

  # Combine reverb, noise, music, and babble into one directory.
  utils/combine_data.sh data/swbd_fsh_sre_aug data/swbd_fsh_sre_reverb data/swbd_fsh_sre_noise data/swbd_fsh_sre_music data/swbd_fsh_sre_babble

  # Take a random subset of the augmentations (128k is somewhat larger than twice
  # the size of the SWBD+SRE list)
  utils/subset_data_dir.sh data/swbd_fsh_sre_aug 128000 data/swbd_fsh_sre_aug_128k
  utils/fix_data_dir.sh data/swbd_fsh_sre_aug_128k

  # Make filterbanks for the augmented data. Note that we do not compute a new
  # vad.scp file here. Instead, we use the vad.scp from the clean version of
  # the list.
  steps/make_fbank.sh --fbank-config conf/fbank.conf --nj 40 --cmd "$train_cmd" \
    data/swbd_fsh_sre_aug_128k exp/make_fbank "$fbankdir"

  # Combine the clean and augmented SWBD+SRE list. This is now roughly
  # double the size of the original clean list.
  utils/combine_data.sh data/swbd_fsh_sre_combined data/swbd_fsh_sre_aug_128k data/swbd_fsh_sre

  # Filter out the clean + augmented portion of the SRE list. This will be used to
  # train the PLDA model later in the script.
  cp -r data/swbd_fsh_sre_combined data/sre_combined
  utils/filter_scp.pl data/sre/spk2utt data/swbd_fsh_sre_combined/spk2utt | utils/spk2utt_to_utt2spk.pl > data/sre_combined/utt2spk
  utils/fix_data_dir.sh data/sre_combined
fi
# Now we prepare the features to generate examples for xvector training.
if [ "$stage" -le 3 ]; then
  # This script applies CMVN and removes nonspeech frames. Note that this is somewhat
  # wasteful, as it roughly doubles the amount of training data on disk. After
  # creating training examples, this can be removed.
  local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \
    data/swbd_fsh_sre_combined data/swbd_fsh_sre_combined_no_sil exp/swbd_sre_combined_no_sil
  utils/fix_data_dir.sh data/swbd_fsh_sre_combined_no_sil

  utils/data/get_utt2num_frames.sh --nj 40 --cmd "$train_cmd" data/swbd_fsh_sre_combined_no_sil
  utils/fix_data_dir.sh data/swbd_fsh_sre_combined_no_sil

  # Now, we need to remove features that are too short after removing silence
  # frames. We want atleast 5s (500 frames) per utterance.
  min_len=500
  mv data/swbd_fsh_sre_combined_no_sil/utt2num_frames data/swbd_fsh_sre_combined_no_sil/utt2num_frames.bak
  awk -v min_len="${min_len}" '$2 > min_len {print $1, $2}' data/swbd_fsh_sre_combined_no_sil/utt2num_frames.bak > data/swbd_fsh_sre_combined_no_sil/utt2num_frames
  utils/filter_scp.pl data/swbd_fsh_sre_combined_no_sil/utt2num_frames data/swbd_fsh_sre_combined_no_sil/utt2spk > data/swbd_fsh_sre_combined_no_sil/utt2spk.new
  mv data/swbd_fsh_sre_combined_no_sil/utt2spk.new data/swbd_fsh_sre_combined_no_sil/utt2spk
  utils/fix_data_dir.sh data/swbd_fsh_sre_combined_no_sil

  # We also want several utterances per speaker. Now we'll throw out speakers
  # with fewer than 8 utterances.
  min_num_utts=8
  # spk2utt lines are "<spk> <utt1> <utt2> ...", so NF-1 is the utterance count.
  awk '{print $1, NF-1}' data/swbd_fsh_sre_combined_no_sil/spk2utt > data/swbd_fsh_sre_combined_no_sil/spk2num
  awk -v min_num_utts="${min_num_utts}" '$2 >= min_num_utts {print $1, $2}' data/swbd_fsh_sre_combined_no_sil/spk2num | utils/filter_scp.pl - data/swbd_fsh_sre_combined_no_sil/spk2utt > data/swbd_fsh_sre_combined_no_sil/spk2utt.new
  mv data/swbd_fsh_sre_combined_no_sil/spk2utt.new data/swbd_fsh_sre_combined_no_sil/spk2utt
  utils/spk2utt_to_utt2spk.pl data/swbd_fsh_sre_combined_no_sil/spk2utt > data/swbd_fsh_sre_combined_no_sil/utt2spk
  utils/filter_scp.pl data/swbd_fsh_sre_combined_no_sil/utt2spk data/swbd_fsh_sre_combined_no_sil/utt2num_frames > data/swbd_fsh_sre_combined_no_sil/utt2num_frames.new
  mv data/swbd_fsh_sre_combined_no_sil/utt2num_frames.new data/swbd_fsh_sre_combined_no_sil/utt2num_frames

  # Now we're ready to create training examples.
  utils/fix_data_dir.sh data/swbd_fsh_sre_combined_no_sil
fi
if [ "$stage" -le 6 ]; then
  # Train the xvector DNN on the combined (clean + augmented) list.
  # Note: the outer $stage is forwarded so the inner script can resume too.
  local/nnet3/xvector/run_xvector.sh --stage "$stage" --train-stage -1 \
    --data data/swbd_fsh_sre_combined_no_sil --nnet-dir "$nnet_dir" \
    --egs-dir "$nnet_dir/egs"
fi
if [ "$stage" -le 7 ]; then
  # Extract xvectors for the SRE10 enrollment and test sides.
  sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
    "$nnet_dir" data/sre10_test \
    exp/xvectors_sre10_test

  sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
    "$nnet_dir" data/sre10_train \
    exp/xvectors_sre10_train

  # Extract xvectors for SRE data. We'll use this for things like LDA or PLDA.
  sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 12G" --nj 40 \
    "$nnet_dir" data/sre_combined \
    exp/xvectors_sre_combined
fi
if [ "$stage" -le 8 ]; then
  # Compute the mean vector for centering the evaluation xvectors.
  $train_cmd exp/xvectors_sre_combined/log/compute_mean.log \
    ivector-mean scp:exp/xvectors_sre_combined/xvector.scp \
    exp/xvectors_sre_combined/mean.vec || exit 1;

  # This script uses LDA to decrease the dimensionality prior to PLDA.
  lda_dim=150
  $train_cmd exp/xvectors_sre_combined/log/lda.log \
    ivector-compute-lda --total-covariance-factor=0.0 --dim="$lda_dim" \
    "ark:ivector-subtract-global-mean scp:exp/xvectors_sre_combined/xvector.scp ark:- |" \
    ark:data/sre_combined/utt2spk exp/xvectors_sre_combined/transform.mat || exit 1;

  # Train an out-of-domain PLDA model on length-normalized, LDA-projected,
  # mean-subtracted xvectors.
  $train_cmd exp/xvectors_sre_combined/log/plda.log \
    ivector-compute-plda ark:data/sre_combined/spk2utt \
    "ark:ivector-subtract-global-mean scp:exp/xvectors_sre_combined/xvector.scp ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    exp/xvectors_sre_combined/plda || exit 1;
fi
if [ "$stage" -le 9 ]; then
  # Get results using the out-of-domain PLDA model. Enrollment xvectors are
  # speaker-averaged (ivector-mean over spk2utt); both sides are centered,
  # LDA-projected, and length-normalized before scoring.
  $train_cmd exp/scores/log/sre10_test_scoring.log \
    ivector-plda-scoring --normalize-length=true \
    --num-utts=ark:exp/xvectors_sre10_train/num_utts.ark \
    "ivector-copy-plda --smoothing=0.0 exp/xvectors_sre_combined/plda - |" \
    "ark:ivector-mean ark:data/sre10_train/spk2utt scp:exp/xvectors_sre10_train/xvector.scp ark:- | ivector-subtract-global-mean exp/xvectors_sre_combined/mean.vec ark:- ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "ark:ivector-subtract-global-mean exp/xvectors_sre_combined/mean.vec scp:exp/xvectors_sre10_test/xvector.scp ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "cat '$trials' | cut -d\ --fields=1,2 |" exp/scores/sre10_test_scores || exit 1;

  # Compute the equal error rate from the trial scores.
  eer=$(compute-eer <(python local/prepare_for_eer.py "$trials" exp/scores/sre10_test_scores) 2> /dev/null)
  echo "EER: $eer"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment