Import Voxforge to Deepgram Kur
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import sys
import tarfile
import time_uuid
import shutil
import wave
from six.moves import urllib
from os import makedirs, path
from bs4 import BeautifulSoup
from tensorflow.python.platform import gfile
from tensorflow.contrib.learn.python.learn.datasets import base
"""Lambda function returns the filename of a path""" | |
filename_of = lambda x: path.split(x)[1] | |
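# Worked example (hypothetical path):
#   filename_of('/downloads/es-20121231.tgz') -> 'es-20121231.tgz'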
# Header-based duration (in seconds) for mono WAV files, an alternative
# to estimating duration from file size
def get_duration_wav(wav_filename):
    f = wave.open(wav_filename, 'r')
    frames = f.getnframes()
    rate = f.getframerate()
    duration = frames / float(rate)
    f.close()
    return round(duration, 3)
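# Worked example: a 16 kHz file containing 32000 frames gives
# 32000 / 16000.0 = 2.0 seconds.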
def download_and_preprocess(data_dir):
    # Conditionally download data to data_dir
    if not path.isdir(data_dir):
        makedirs(data_dir)
    archive_dir = data_dir + "/archive"
    if not path.isdir(archive_dir):
        makedirs(archive_dir)
    print("Downloading Voxforge data set into {} if not already present...".format(archive_dir))
    voxforge_url = 'http://www.repository.voxforge1.org/downloads/es/Trunk/Audio/Main/16kHz_16bit'  # Spanish
    #voxforge_url = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'  # English
    html_page = urllib.request.urlopen(voxforge_url)
    soup = BeautifulSoup(html_page, 'html.parser')
    # List all .tgz archive links on the index page
    refs = [l['href'] for l in soup.find_all('a') if ".tgz" in l['href']]
    total = len(refs)
    number_of_test = total // 100  # roughly 1% of the archives go to the test set
    test_text = codecs.open(data_dir + "/test.jsonl", "w", "utf-8")
    train_text = codecs.open(data_dir + "/train.jsonl", "w", "utf-8")
    # Download files
    print("{} files to download".format(total))
    for i, file in enumerate(refs):
        archive = archive_dir + '/' + file
        download_url = voxforge_url + '/' + file
        base.maybe_download(filename_of(download_url), archive_dir, download_url)
        # Conditionally extract data to dataset_dir
        if i < number_of_test:
            dataset_dir = path.join(data_dir, "test")
        else:
            dataset_dir = path.join(data_dir, "train")
        # Directory the archive unpacks into (archive name minus the extension)
        extracted_dir = path.join(dataset_dir, '.'.join(filename_of(archive).split(".")[:-1]))
        if not gfile.Exists(extracted_dir):
            print('Processing file {} ({}/{})...'.format(filename_of(archive), i + 1, total))
            tar = tarfile.open(archive)
            tar.extractall(dataset_dir)
            tar.close()
            # Generate data set entries from the speaker's PROMPTS transcript file
            prompts_file = path.join(extracted_dir, 'etc/PROMPTS')
            with codecs.open(prompts_file, 'r', 'utf-8') as f:
                for line in f:
                    id = line.split(' ')[0].split('/')[-1]
                    uuid = str(time_uuid.TimeUUID.with_utcnow(randomize=True))
                    sentence = ' '.join(line.split(' ')[1:])
                    sentence = sentence.lower().replace('\n', '').replace('\r', '')
                    # Strip Spanish accents so transcripts are plain ASCII
                    sentence = sentence.replace(u'á', 'a').replace(u'é', 'e').replace(u'í', 'i').replace(u'ó', 'o').replace(u'ú', 'u')
                    wav_file = path.join(extracted_dir, "wav", id + ".wav")
                    if path.isfile(wav_file):
                        wav_filesize = path.getsize(wav_file)
                        wav_duration = get_duration_wav(wav_file)
                        # Skip audio shorter than 1 s or longer than 20 s
                        # (16 kHz * 16 bit mono = 32000 bytes per second)
                        if 1 < (wav_filesize / 32000) < 20:
                            json_sentence = '{"text": "' + sentence + '", "duration_s": ' + str(wav_duration) + ', "uuid": "' + uuid + '"}\n'
                            if i < number_of_test:
                                test_text.write(json_sentence)
                            else:
                                train_text.write(json_sentence)
                            shutil.move(wav_file, dataset_dir + '/' + uuid + '.wav')
            # Remove the extracted archive contents to save disk space
            shutil.rmtree(extracted_dir)
    test_text.close()
    train_text.close()
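# Note: json_sentence above is assembled by string concatenation, so a stray
# double quote inside a transcript would produce an invalid JSON line. A
# minimal sketch of a safer alternative using the stdlib (same fields, same
# in-scope variables assumed):
#   import json
#   json_sentence = json.dumps({'text': sentence,
#                               'duration_s': wav_duration,
#                               'uuid': uuid}) + '\n'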
if __name__ == '__main__':
    download_and_preprocess(sys.argv[1])
# 1. Downloads the full Spanish Voxforge dataset
# 2. Extracts everything downloaded into the 'archive' folder
# 3. Most of the dataset goes to train, the rest to test.
# 4. Extracts each speaker's sentences
# 5. Gets the duration of the WAVs in seconds.
# 6. Renames the WAVs with UUIDs.
# 7. Removes transcriptions that are too short or too long.
# 8. Deletes the extracted content to save space.
# 9. Creates the JSONL files.
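# Usage sketch (the script name is whatever you saved this gist as):
#   python import_voxforge.py ./voxforge_es
# Produces ./voxforge_es/train.jsonl and ./voxforge_es/test.jsonl plus the
# renamed WAVs, one JSON object per line, e.g. (values illustrative):
#   {"text": "hola mundo", "duration_s": 2.345, "uuid": "..."}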