Skip to content

Instantly share code, notes, and snippets.

@EzequielAdrianM
Last active February 23, 2018 19:35
Show Gist options
  • Save EzequielAdrianM/a81f329fb843e4b1ec896de3d2479669 to your computer and use it in GitHub Desktop.
Save EzequielAdrianM/a81f329fb843e4b1ec896de3d2479669 to your computer and use it in GitHub Desktop.
Import Voxforge to Deepgram Kur
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import datetime
import json
import re
import shutil
import sys
import tarfile
import unicodedata
import wave
from os import makedirs, path

import pandas
import time_uuid
from bs4 import BeautifulSoup
from six.moves import urllib
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.python.platform import gfile
def filename_of(x):
    """Return the last component (the file name) of the path or URL *x*.

    This is the second element of ``os.path.split``: an empty string when
    *x* ends with a separator, and *x* itself when it has no separator.
    """
    return path.split(x)[1]
def get_duration_wav(wav_filename):
    """Return the duration of a WAV file in seconds, rounded to milliseconds.

    The duration is the frame count divided by the frame rate, both read
    from the file header through the ``wave`` module. A frame holds one
    sample for every channel, so the result does not depend on the number
    of channels.

    Args:
        wav_filename: Path of the WAV file to inspect.

    Returns:
        The duration in seconds as a float rounded to 3 decimal places.

    Raises:
        wave.Error: If ``wave`` cannot parse the file.
    """
    wav = wave.open(wav_filename, 'r')
    # try/finally rather than ``with``: wave objects are not context
    # managers on Python 2, and the handle must be released on errors too.
    try:
        frames = wav.getnframes()
        rate = wav.getframerate()
    finally:
        wav.close()
    return round(frames / float(rate), 3)
# Index page listing the Spanish 16 kHz / 16-bit Voxforge archives. For English use
# 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'.
_VOXFORGE_ES_URL = 'http://www.repository.voxforge1.org/downloads/es/Trunk/Audio/Main/16kHz_16bit'

# Acute-accented vowels folded to their plain form in the transcriptions.
_ACCENT_FOLDS = (
    (u'á', u'a'),
    (u'é', u'e'),
    (u'í', u'i'),
    (u'ó', u'o'),
    (u'ú', u'u'),
)


def _write_prompts(extracted_dir, dataset_dir, out_file, min_duration_s, max_duration_s):
    """Write one JSONL record per usable prompt of an extracted archive.

    Reads ``etc/PROMPTS`` under *extracted_dir*. Each line is split on
    spaces: the last path component of the first field names the clip and
    the remaining fields form the sentence. For every clip whose WAV exists
    under ``wav/`` and whose duration lies strictly between the two bounds,
    a record is written to *out_file* and the WAV is moved into
    *dataset_dir* under a fresh UUID.
    """
    prompts_file = path.join(extracted_dir, 'etc', 'PROMPTS')
    with codecs.open(prompts_file, 'r', 'utf-8') as prompts:
        for line in prompts:
            fields = line.split(' ')
            clip_id = fields[0].split('/')[-1]
            sentence = ' '.join(fields[1:])
            sentence = sentence.lower().replace('\n', '').replace('\r', '')
            for accented, plain in _ACCENT_FOLDS:
                sentence = sentence.replace(accented, plain)

            wav_file = path.join(extracted_dir, 'wav', clip_id + '.wav')
            if not path.isfile(wav_file):
                continue
            # Filter on the measured duration. A file-size estimate assumes
            # 16 kHz / 16-bit mono and, with Python 2 integer division,
            # silently drops every clip shorter than two seconds.
            wav_duration = get_duration_wav(wav_file)
            if not (min_duration_s < wav_duration < max_duration_s):
                continue

            clip_uuid = str(time_uuid.TimeUUID.with_utcnow(randomize=True))
            # json.dumps escapes quotes and backslashes in the prompt so the
            # record stays valid JSON; ensure_ascii=False keeps characters
            # such as 'ñ' readable in the UTF-8 output file.
            record = (u'{"text": ' + json.dumps(sentence, ensure_ascii=False)
                      + u', "duration_s": ' + str(wav_duration)
                      + u', "uuid": "' + clip_uuid + u'"}\n')
            out_file.write(record)
            shutil.move(wav_file, path.join(dataset_dir, clip_uuid + '.wav'))


def download_and_preprocess(data_dir, voxforge_url=_VOXFORGE_ES_URL,
                            min_duration_s=1, max_duration_s=20):
    """Download Voxforge archives and turn them into train/test JSONL sets.

    Every ``.tgz`` linked from *voxforge_url* is downloaded into
    ``<data_dir>/archive`` (skipped when already present), extracted, and
    converted into records in ``<data_dir>/test.jsonl`` or
    ``<data_dir>/train.jsonl``. The first ``total // 100`` archives feed the
    test set and the rest the train set. Kept WAV files are moved to
    ``<data_dir>/test`` or ``<data_dir>/train`` under UUID names, and each
    extracted directory is deleted once it has been processed.

    Args:
        data_dir: Root output directory; created when missing.
        voxforge_url: URL of the HTML index page that links the archives.
        min_duration_s: Clips this short or shorter (seconds) are dropped.
        max_duration_s: Clips this long or longer (seconds) are dropped.

    Both JSONL files are opened in write mode, so earlier contents are
    overwritten on every run.
    """
    if not path.isdir(data_dir):
        makedirs(data_dir)
    archive_dir = path.join(data_dir, "archive")
    if not path.isdir(archive_dir):
        makedirs(archive_dir)

    print("Downloading Voxforge data set into {} if not already present...".format(archive_dir))
    voxforge_url = voxforge_url.rstrip('/')
    html_page = urllib.request.urlopen(voxforge_url)
    try:
        soup = BeautifulSoup(html_page, 'html.parser')
    finally:
        html_page.close()

    # href=True skips anchors that carry no href attribute at all.
    refs = [link['href'] for link in soup.find_all('a', href=True)
            if ".tgz" in link['href']]
    total = len(refs)
    number_of_test = total // 100

    with codecs.open(path.join(data_dir, "test.jsonl"), "w", "utf-8") as test_text, \
            codecs.open(path.join(data_dir, "train.jsonl"), "w", "utf-8") as train_text:
        print("{} files to download".format(total))
        for i, archive_name in enumerate(refs):
            download_url = voxforge_url + '/' + archive_name
            # maybe_download returns the local path of the archive.
            archive = base.maybe_download(filename_of(download_url), archive_dir, download_url)

            if i < number_of_test:
                dataset_dir = path.join(data_dir, "test")
                out_file = test_text
            else:
                dataset_dir = path.join(data_dir, "train")
                out_file = train_text

            # The archive is expected to unpack into a directory named after
            # the archive file minus its final extension.
            extracted_dir = path.join(
                dataset_dir, '.'.join(filename_of(archive).split(".")[:-1]))
            if not gfile.Exists(extracted_dir):
                print('Processing file {} {} ({}/{})...'.format(
                    i + 1, filename_of(archive), i + 1, total))
                # NOTE(review): extractall trusts the member paths stored in
                # the downloaded archive; confirm the source is trusted.
                with tarfile.open(archive) as tar:
                    tar.extractall(dataset_dir)

            _write_prompts(extracted_dir, dataset_dir, out_file,
                           min_duration_s, max_duration_s)
            # Drop the extracted tree to save disk space; kept WAVs were moved out.
            shutil.rmtree(extracted_dir)
# Script entry point: runs when the module is loaded and uses the first
# command-line argument as the data directory (IndexError when it is missing).
download_and_preprocess(sys.argv[1])
# 1. Downloads the whole Spanish Voxforge dataset.
# 2. Extracts everything that was downloaded into the 'archive' folder.
# 3. Most of the dataset goes to train, the rest to test.
# 4. Extracts each speaker's sentences.
# 5. Gets the duration of the WAV files in seconds.
# 6. Renames the WAV files with UUIDs.
# 7. Discards clips whose audio is too short or too long, along with their transcriptions.
# 8. Deletes the extracted content to save space.
# 9. Creates the JSONL files.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment