Last active
July 26, 2017 18:17
-
-
Save rrajasek95/8355070a870d5eea94f4c3fce210168d to your computer and use it in GitHub Desktop.
Data Processing of recordings for Transcriber Qualification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Convert every *.mp3 file found (recursively) under a folder into a .wav
# file with the same base name, written next to the original.
#
# Usage: <this script> FOLDER
#
# Relies on f_mp3towav, which is loaded from functions.sh in the current
# working directory.
source functions.sh

folder=$1

# "${folder:-.}" keeps the old behaviour of searching the current directory
# when no folder is given. IFS= and -r stop read from trimming whitespace or
# interpreting backslashes in file names.
find "${folder:-.}" -name "*.mp3" | while IFS= read -r file; do
    # Drop the .mp3 extension; the .wav file is created beside the source.
    basename=${file%.mp3}
    echo "Converting $basename"
    f_mp3towav "$file" "$basename.wav"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
filename:
author: rishi
date_created: 22/6/17

Write a forced-alignment JSGF grammar for every word that has a folder
under "words".

Reads "subset_words.dict" (one "word phoneme phoneme ..." entry per line)
and, for each entry whose word names an entry of the "words" directory,
writes "words/<word>/<word>-align.jsgf".
'''
import sys
import os

# Grammar skeleton: leading silence, the phoneme sequence, optional trailing
# silence.
JSGF_TEMPLATE = '#JSGF V1.0;\ngrammar forcing;\npublic <{}> = sil {} [ sil ];\n'


def build_grammar(word, phonemes):
    '''Return the JSGF grammar text forcing `phonemes` for the rule `word`.'''
    return JSGF_TEMPLATE.format(word, phonemes)


def parse_dict_line(line):
    '''Split a dictionary line into a lower-cased (word, phonemes) pair.

    Returns None for blank lines and for entries that carry no phonemes, so
    that malformed input is skipped instead of raising IndexError.
    '''
    tokens = line.strip().lower().split(maxsplit=1)
    if len(tokens) < 2:
        return None
    return tokens[0], tokens[1]


def main():
    '''Generate one grammar file per dictionary word that has a folder.'''
    folder_list = set(os.listdir("words"))
    with open('subset_words.dict', 'r') as f:
        for line in f:
            entry = parse_dict_line(line)
            if entry is None:
                continue
            word, phonemes = entry
            if word not in folder_list:
                continue
            with open('words/{0}/{0}-align.jsgf'.format(word), 'w') as w:
                w.write(build_grammar(word, phonemes))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# For every word listed in directory $1, gather per-phoneme figures from all
# of that word's "*-align.txt" files into "words/<word>/<word>-normalign.txt".
#
# Usage: <this script> WORDS_DIR
#
# Each output line holds, for every alignment row whose text starts with one
# of the word's phonemes: the first field, -log(1 - field 5) and
# log(field 3 - field 2 + 1), followed by the alignment file name.
# NOTE(review): fields 2/3/5 are presumably start frame, end frame and a
# score from the pocketsphinx backtrace - confirm against a real align file.

# "${1:-.}" keeps the old behaviour of listing the current directory when no
# argument is given.
ls "${1:-.}" | while IFS= read -r word; do
    # Get the dict entry for the word
    line=$(grep "^$word\b" subset_words.dict)
    if [ -n "$line" ]; then
        # Get the phonemes: lower-case the entry, drop the word itself and
        # join the phonemes with "|" into an anchored alternation.
        # $line is left unquoted on purpose so that whitespace and line breaks
        # collapse to single spaces before cut/tr see them.
        phoneme_regex="^("$(echo $line | tr A-Z a-z | cut -d' ' -f2- | tr " " "|")")"

        normalign="words/$word/$word-normalign.txt"
        # Start from an empty file. Truncating replaces the former rm + touch
        # pair and does not print an error when the file does not exist yet.
        : > "$normalign"

        # For each align file, get the numerical data
        find "words/$word" -name "*-align.txt" | while IFS= read -r f; do
            echo "$f"
            awk -v pregex="$phoneme_regex" '$0 ~ pregex {printf "%2s %6.3f %6.3f ", $1, -log(1-$5), log($3-$2+1)} END {print FILENAME}' "$f" | tee -a "$normalign"
        done
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# For every utterance folder under "sentences" that has a
# "<utterance>-align.jsgf" grammar, gather per-phoneme figures from all of its
# "*-align.txt" files into "sentences/<utterance>/<utterance>-alignments.txt".
#
# Each output line holds, for every alignment row whose text starts with one
# of the utterance's phonemes: the first field, field 5 as an integer and
# (field 3 - field 2) / 100, followed by the alignment file name.
# NOTE(review): fields 2/3/5 are presumably start frame, end frame and a
# score from the pocketsphinx backtrace - confirm against a real align file.

ls sentences | while IFS= read -r utterance; do
    jsgf_file="$utterance-align.jsgf"
    path="sentences/$utterance/$jsgf_file"
    if [ -f "$path" ]; then
        # Pull the phoneme sequence out of "... = sil <phonemes> [ sil ];".
        # Anchoring on "= sil " keeps an utterance name that itself contains
        # "sil" (e.g. "silly") from being mistaken for the start of the
        # phoneme list.
        phonemes=$(sed -n 's/.*= sil \(.*\) \[ sil \];.*/\1/p' "$path")

        # Without phonemes the regex would be "^()" and match every line.
        [ -n "$phonemes" ] || continue

        # One alternative per distinct phoneme. $phonemes is left unquoted on
        # purpose so that runs of whitespace collapse to single spaces.
        phoneme_regex=$(echo $phonemes | tr " " "\n" | sort -u | tr "\n" "|")
        # Drop the trailing "|" left by the final newline.
        phoneme_regex="^(${phoneme_regex%|})"

        alignments="sentences/$utterance/$utterance-alignments.txt"
        # Start from an empty file. Truncating replaces the former rm + touch
        # pair and does not print an error when the file does not exist yet.
        : > "$alignments"

        find "sentences/$utterance" -name "*-align.txt" | while IFS= read -r f; do
            echo "$f"
            awk -v pregex="$phoneme_regex" '$0 ~ pregex {printf "%2s %5d %4.2f ", $1, $5, ($3-$2)/100.0} END {print FILENAME}' "$f" >> "$alignments"
        done
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
filename:
author: rishi
date_created: 23/6/17

Write a forced-alignment JSGF grammar for every utterance folder under
"sentences".

A folder name is a "-"-separated list of words with apostrophes removed
(e.g. "were-late" for "we're late"). Each word is mapped back to an entry of
"subset_words.dict"; when every word resolves, the concatenated phonemes are
written to "sentences/<folder>/<folder>-align.jsgf". Folders with
unresolvable words are reported on stdout and skipped.
'''
import sys
import os

# Grammar skeleton: leading silence, the phoneme sequence, optional trailing
# silence.
JSGF_TEMPLATE = "#JSGF V1.0;\ngrammar forcing;\npublic <{}> = sil {} [ sil ];\n"

# Folder-name words that must be replaced by one or more dictionary words.
WORD_REPLACEMENTS = {
    "oclock": ("o'clock",),
    "schoolbag": ("school", "bag"),
    "beanstalk": ("bean", "stalk"),
    "colourful": ("colorful",),
    "kungfu": ("kung", "fu"),
    "sandcastle": ("sand", "castle"),
}

# (suffix as it appears in the folder name, number of trailing characters to
# cut, suffix as it appears in the dictionary).
CONTRACTION_SUFFIXES = (
    ("s", 1, "'s"),     # probable possessive
    ("re", 2, "'re"),   # we're, they're
    ("ll", 2, "'ll"),   # we'll, i'll
    ("nt", 1, "'t"),    # isn't
    ("ve", 2, "'ve"),   # we've
)


def resolve_word(word, word_phonemes_map):
    '''Map a folder-name word to the dictionary words that spell it.

    Returns a list of keys of `word_phonemes_map`, or None when the word
    cannot be resolved. Unlike a bare suffix check, a word that merely ends
    in a contraction suffix but has no matching dictionary entry is reported
    as unresolved instead of being dropped silently.
    '''
    if word in word_phonemes_map:
        return [word]

    if word in WORD_REPLACEMENTS:
        parts = list(WORD_REPLACEMENTS[word])
        if all(part in word_phonemes_map for part in parts):
            return parts
        return None

    for suffix, cut, replacement in CONTRACTION_SUFFIXES:
        if word.endswith(suffix):
            candidate = word[:-cut] + replacement
            if candidate in word_phonemes_map:
                return [candidate]

    return None


def load_phonemes(path):
    '''Read a "word phonemes..." dictionary into a lower-cased dict.

    Blank lines and entries without phonemes are skipped.
    '''
    word_phonemes_map = {}
    with open(path, 'r') as f:
        for line in f:
            token = line.strip().lower().split(maxsplit=1)
            if len(token) == 2:
                word_phonemes_map[token[0]] = token[1]
    return word_phonemes_map


def build_grammar(utterance, phoneme_string):
    '''Return the JSGF grammar text forcing `phoneme_string` for `utterance`.'''
    return JSGF_TEMPLATE.format(utterance, phoneme_string)


def main():
    '''Generate one grammar file per fully resolvable utterance folder.'''
    folder_list = set(os.listdir("sentences"))
    # discard() rather than remove(): the csv is not guaranteed to be there.
    folder_list.discard("voice_tag_sentence.csv")

    word_phonemes_map = load_phonemes('subset_words.dict')

    for folder in sorted(folder_list):
        missing_words = []
        word_list = []
        for word in folder.split('-'):
            resolved = resolve_word(word, word_phonemes_map)
            if resolved is None:
                missing_words.append(word)
            else:
                word_list.extend(resolved)

        if missing_words:
            print("Words missing:", missing_words)
            continue

        utterance = " ".join(word_list)
        phoneme_string = " ".join(word_phonemes_map[word] for word in word_list)
        with open('sentences/{0}/{0}-align.jsgf'.format(folder), 'w') as w:
            w.write(build_grammar(utterance, phoneme_string))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# For every word listed in directory $1 that has a
# "words/<word>/<word>-normalign.txt" file, compute per-column means,
# standard deviations and standard scores and write them to
# "words/<word>/<word>-standards.txt".
#
# Usage: <this script> WORDS_DIR
#
# Input lines are read as repeating triples "<label> <value> <value>" followed
# by a final field (the alignment file name). Lines without a space carry no
# triples and are filtered out by grep.

# "${1:-.}" keeps the old behaviour of listing the current directory when no
# argument is given.
ls "${1:-.}" | while IFS= read -r word; do
    normalign="words/$word/$word-normalign.txt"
    if [ -f "$normalign" ]; then
        grep ' ' "$normalign" | awk '
            {
                # Track the shortest record so the report only covers columns
                # that are present in every utterance.
                if (!mnf || NF < mnf) { mnf = NF }

                for (f = 1; f < NF; f++) {
                    i[NR, f] = $f
                    # Columns 1, 4, 7, ... are labels; the rest are numeric.
                    if ((f - 1) % 3) {
                        m[f] += $f                      # running sum
                        d[f] = $f - a[f]                # delta to running mean
                        a[f] += d[f] / NR               # updated running mean
                        m2[f] += d[f] * ($f - a[f])     # sum of squared deviations
                    }
                }
                # The last field is kept under index 0.
                i[NR, 0] = $NF
            }
            END {
                print "Means and standard deviations of acoustic scores and durations for each phoneme:"
                for (f = 1; f < mnf; f++) {
                    if ((f - 1) % 3) {
                        printf "%5.3f %5.3f ", m[f] / NR, sqrt(m2[f] / NR)
                    } else {
                        # Labels are taken from the last record read.
                        printf "%s ", $f
                    }
                }
                print "\n\nStandard scores of acoustic scores and durations for each scored utterance:"
                for (r = 1; r <= NR; r++) {
                    for (f = 1; f < mnf; f++) {
                        if ((f - 1) % 3) {
                            sd = sqrt(m2[f] / NR)
                            # A zero deviation means every value equals the mean;
                            # report 0 instead of aborting on division by zero.
                            z = (sd > 0) ? (i[r, f] - (m[f] / NR)) / sd : 0
                            printf "%+6.3f ", z
                        } else {
                            printf "%s ", i[r, f]
                        }
                    }
                    print i[r, 0]
                }
            }
        ' > "words/$word/$word-standards.txt"
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
filename:
author: rishi
date_created: 23/6/17

Build "subset_words.dict": the entries of "../cmudict.dict" needed for the
recordings under "words" and "sentences".

The wanted words are the entry names of the "words" directory plus every
"-"-separated word of every entry name of the "sentences" directory, with
apostrophes restored where the dictionary knows the contracted form.
Stress digits (0, 1, 2) are removed from the phonemes, and a "sil SIL" entry
is appended for the silence used by the alignment grammars.
'''
import os

# Deletes the stress markers that cmudict attaches to vowels.
STRESS_DIGITS = str.maketrans("", "", "012")

# Folder-name words that stand for one or more dictionary words.
WORD_REPLACEMENTS = {
    "oclock": ("o'clock",),
    "schoolbag": ("school", "bag"),
    "beanstalk": ("bean", "stalk"),
    "colourful": ("colorful",),
    "kungfu": ("kung", "fu"),
    "sandcastle": ("sand", "castle"),
}

# (suffix as it appears in the folder name, number of trailing characters to
# cut, suffix as it appears in the dictionary).
CONTRACTION_SUFFIXES = (
    ("s", 1, "'s"),     # probable possessive
    ("re", 2, "'re"),   # we're, they're
    ("ll", 2, "'ll"),   # we'll, i'll
    ("nt", 1, "'t"),    # isn't
    ("ve", 2, "'ve"),   # we've
)


def strip_stress(phones):
    '''Return `phones` with the stress digits 0, 1 and 2 removed.'''
    return phones.translate(STRESS_DIGITS)


def parse_entry(line):
    '''Parse one dictionary line into (word, stress-free phonemes).

    A trailing " # comment" is ignored so it cannot leak into the phonemes.
    Returns None for blank lines and entries without phonemes.
    '''
    fields = line.split(" #", 1)[0].strip().split(maxsplit=1)
    if len(fields) < 2:
        return None
    word, phones = fields
    return word, strip_stress(phones)


def sentence_forms(word, known_words):
    '''Return the dictionary spellings to look up for a folder-name word.

    `known_words` holds the lower-cased dictionary words. Words that cannot
    be mapped are returned unchanged; empty segments yield nothing.
    '''
    if not word:
        return []
    if word in known_words:
        return [word]
    if word in WORD_REPLACEMENTS:
        return list(WORD_REPLACEMENTS[word])
    for suffix, cut, replacement in CONTRACTION_SUFFIXES:
        if word.endswith(suffix):
            candidate = word[:-cut] + replacement
            if candidate in known_words:
                return [candidate]
    return [word]


def main():
    '''Read the full dictionary and write the subset used by the recordings.'''
    word_list = []
    known_words = set()
    with open('../cmudict.dict', 'r') as dictionary_file:
        for line in dictionary_file:
            entry = parse_entry(line)
            if entry is None:
                continue
            word_list.append(entry)
            known_words.add(entry[0].lower())

    words = set(os.listdir("words"))
    for sentence in os.listdir("sentences"):
        for word in sentence.split('-'):
            words.update(sentence_forms(word, known_words))

    subset_word_list = [(word, phones) for (word, phones) in word_list if word in words]
    subset_word_list.append(("sil", "SIL"))  # Silence used by the grammars

    with open('subset_words.dict', 'w') as dictionary_file:
        for (word, phones) in subset_word_list:
            dictionary_file.write(word + " " + phones + "\n")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# Decode an MP3 file to a WAV file with mpg123.
#   $1  path of the MP3 input
#   $2  path of the WAV output
# Both arguments are quoted so paths containing spaces stay intact.
function f_mp3towav {
    mpg123 -w "$2" "$1"
}
# Download a URL into a folder under a given file name, creating the folder
# (and any missing parents) first.
#   $1  file name to save as
#   $2  URL to fetch
#   $3  destination folder
function f_download_audio_to_folder {
    # local keeps these names from overwriting same-named variables (such as
    # "folder") in scripts that source this file.
    local filename=$1
    local url=$2
    local folder=$3

    # mkdir -p is a no-op when the folder already exists.
    mkdir -p "$folder"
    wget -O "$folder/$filename" "$url"
}
# Force-align an audio file against a JSGF grammar with pocketsphinx.
#   $1  audio file to align
#   $2  JSGF grammar file
#   $3  phoneme dictionary
# The combined stdout/stderr of pocketsphinx_continuous is shown on the
# terminal and also saved to "<audio>-align.txt".
function f_force_align {
    local audio=$1
    local align_jsgf=$2
    local phoneme_dict=$3

    # Announce the file being processed (previously echoed the misspelled,
    # always-empty variable "audo").
    echo "$audio"
    pocketsphinx_continuous -infile "$audio" -jsgf "$align_jsgf" -dict "$phoneme_dict" -backtrace yes -fsgusefiller no -bestpath no -wbeam 1e-56 -beam 1e-57 2>&1 | tee "$audio-align.txt"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment