Skip to content

Instantly share code, notes, and snippets.

@Mte90
Created March 17, 2020 19:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mte90/116e5d8a17973b7bd9bd9050662736dd to your computer and use it in GitHub Desktop.
Save Mte90/116e5d8a17973b7bd9bd9050662736dd to your computer and use it in GitHub Desktop.
Clips dataset parser and cleaner
#!/usr/bin/env bash
# Unpack the .txt and .wav members of one archive into ./clips-mitads/.
#   $1 - archive base name, without the .rar extension; the archive is
#        looked up in the current directory.
extract() {
    local archive="./$1.rar"
    echo "Estraggo $1.rar"
    # "e" extracts without recreating archived paths; -o+ overwrites
    # files that already exist in the destination.
    unrar e "$archive" '*.txt' '*.wav' './clips-mitads/' -o+
}
# Create the extraction target; -p makes this a no-op when it already exists.
mkdir -p './clips-mitads'
extract "LETTO"
extract "RTV"
extract "DIALOGICO"
extract "ORTOFONICO"
# extract "TELEFONICO" -- left out on purpose: that archive has no wav files
# Stop here if the directory cannot be entered, so the deletions below can
# never run against the directory the script was started from.
cd ./clips-mitads/ || exit 1
rm -f ./*_tempi.txt
# Delete wav files whose base name ends in "_" or a letter followed by a
# letter (Italian alphabet, lower case). find matches the names itself, so
# nothing depends on parsing `ls` output, and the dot before "wav" is literal.
find . -maxdepth 1 -type f \
    -name '*[_abcdefghilmnopqrstuvz][abcdefghilmnopqrstuvz].wav' -delete
# Each count selects the extension named in its message (the earlier negated
# tests reported the wav total as txt and the txt total as wav).
counts=$(find './' -type f -name "*.txt" | wc -l)
echo "File txt da convertire $counts"
counts=$(find './' -type f -name "*.wav" | wc -l)
echo "File wav totali $counts"
#!/usr/bin/env python
# Based on https://github.com/vslovik/LingITII/blob/master/scripts/cleaner.py
"""
Convert transcribed text into csv
Usage examples:
python generate_csv.py DGmtA01F.txt
"""
import sys
import re
import os
# Ordered (marker, replacement) pairs applied to the transcript text.
# Annotation tags and punctuation are removed, a few interjection tags are
# kept as plain words, and some markers become a space so that neighbouring
# words stay separated. The order is the one the script has always used.
# NOTE(review): removing '#' presumably turns a turn label such as "p1#1:"
# into the "p11:" form matched by TURN_LABEL -- confirm against the corpus.
_REPLACEMENTS = (
    ('<sp>', ''),
    ('<lp>', ''),
    ('<inspiration>', ''),
    ('<tongue-click>', ''),
    ('<creacky-voice>', ''),
    ('<NOISE>', ''),
    ('[screaming]', ''),
    (',', ''),
    ('!', ''),
    ('?', ''),
    ('*', ''),
    ('{', ''),
    ('}', ''),
    ('[whispering]', ''),
    ('<laugh>', ''),
    ('#', ''),
    ('<eeh>', ''),
    ('<mh>', ''),
    ('<mhmh>', ''),
    ('<ahah>', ' '),
    ('<ah>', 'ah'),
    ('<eh>', 'eh'),
    ('<ehm>', 'ehm'),
    ('<oh>', 'oh'),
    ('+', ' '),
    ('[dialect]', ' '),
    ('<oo>', ' '),
    ('/', ' '),
    ('\r', ''),
)

# A turn label: "p", two word characters, a colon and a space.
TURN_LABEL = re.compile(r'p(\w{2}): ')


def transcript_to_csv(raw_text, filename):
    """Convert the content of one transcript file into CSV rows.

    Leading lines that start with an upper-case character are treated as a
    header and dropped. The remaining text is cleaned with _REPLACEMENTS,
    whitespace is collapsed, and every turn label is replaced by the start
    of a new row of the form ``<filename>#<two label chars>.wav,``.

    Args:
        raw_text: full text of the transcript file.
        filename: transcript base name (no directory, no extension), used
            as the prefix of every wav name.

    Returns:
        The rows joined by newlines, one row per turn, without blank rows.
        An empty string when the transcript has no content after its header.
    """
    lines = [line for line in raw_text.split('\n') if line.strip() != '']

    # Index of the first non-header line; when every line is a header the
    # whole list is skipped instead of keeping its last line.
    first_turn = next(
        (idx for idx, line in enumerate(lines) if not line[0].isupper()),
        len(lines),
    )

    # Join with a space so the last word of one line and the first word of
    # the next are not fused; the extra blanks are collapsed below.
    text = ' '.join(lines[first_turn:])
    for marker, replacement in _REPLACEMENTS:
        text = text.replace(marker, replacement)
    text = re.sub(r'\s+', ' ', text)

    # A callable replacement inserts the file name verbatim, so backslashes
    # in it are never interpreted as regex template escapes.
    text = TURN_LABEL.sub(
        lambda match: '\n' + filename + '#' + match.group(1) + '.wav,',
        text,
    )

    # Keep one row per line: drop blank rows (the text starts with a newline
    # after the substitution) and trim the padding around each row.
    rows = [row.strip() for row in text.split('\n')]
    return '\n'.join(row for row in rows if row != '')


def main(argv=None):
    """Print the CSV rows for the transcript file named on the command line.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('Usage: python generate_csv.py <transcript.txt>')

    path = args[0]
    filename = os.path.splitext(os.path.basename(path))[0]
    # The file is read with the platform default encoding, as before.
    with open(path) as txt_file:
        raw_text = txt_file.read()
    print(transcript_to_csv(raw_text, filename))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment