Skip to content

Instantly share code, notes, and snippets.

@mustafaxfe
Created January 24, 2019 18:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mustafaxfe/d20be114ca7cea5c47ea5cc85653c761 to your computer and use it in GitHub Desktop.
Save mustafaxfe/d20be114ca7cea5c47ea5cc85653c761 to your computer and use it in GitHub Desktop.
Creating DeepSpeech data format
# First step: load the chapter audio and parse its sync-map JSON.
# NOTE(review): the audio path is hard-coded to teneke/bolum1 while the JSON
# path is built from book_name/chapter (both defined outside this snippet) --
# confirm the two always refer to the same chapter.
book = AudioSegment.from_mp3("/content/gdrive/My Drive/TASR/kitaplar/teneke/bolum1/Teneke_Yasar_Kemal_01.mp3")
# Explicit UTF-8 so the transcript text does not depend on the platform's
# default encoding; json.load reads straight from the file object.
with open(
    '/content/gdrive/My Drive/TASR/kitaplar/{0}/{1}_output/tuned.json'.format(book_name, chapter),
    encoding='utf-8',
) as f:
    syncmap = json.load(f)
syncmap  # notebook-style display of the parsed data; a no-op when run as a script
# Second step: cut the audio into one clip per sync-map fragment.
# Each entry of `sentences` is {"audio": <slice of book>, "text": <first line
# of the fragment>}.
sentences = []
for fragment in syncmap['fragments']:
    # Fragment bounds are multiplied by 1000 before slicing `book`
    # (presumably seconds -> milliseconds; confirm against the sync-map format).
    begin_ms = float(fragment['begin']) * 1000
    end_ms = float(fragment['end']) * 1000
    # Keep only fragments strictly longer than 400 (in the scaled unit).
    if end_ms - begin_ms > 400:
        sentences.append({"audio": book[begin_ms:end_ms], "text": fragment['lines'][0]})
# Third step: show how many clips were kept and create the empty metadata table.
len(sentences)  # notebook-style display; has no effect when run as a script
csv_columns = [
    'filename',
    'text',
    'up_votes',
    'down_votes',
    'age',
    'gender',
    'accent',
    'duration',
]
df = pd.DataFrame(columns=csv_columns)
# Fourth step: export every clip as an MP3 and record one metadata row per clip.
out_dir = "/content/gdrive/My Drive/TASR/kitaplar/teneke_out_{}/".format(chapter)
# makedirs(exist_ok=True) replaces the isdir check + mkdir pair and does not
# fail when the directory already exists.
os.makedirs(out_dir, exist_ok=True)

columns = ['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'duration']
rows = []
for idx, sentence in enumerate(sentences):
    # NOTE(review): str.lower() applies locale-independent Unicode rules; if the
    # transcripts are Turkish, 'I' becomes 'i' (not dotless 'ı') and 'İ' becomes
    # 'i' + U+0307 -- confirm this matches the target alphabet.
    text = sentence['text'].lower()
    filename = "sample-" + str(idx) + ".mp3"
    # export audio segment
    sentence['audio'].export(out_dir + filename, format="mp3")
    rows.append({
        'filename': filename,
        'text': text,
        'up_votes': 0,
        'down_votes': 0,
        'age': 0,
        'gender': "female",
        'accent': '',
        'duration': '',
    })

# Build all new rows in a single DataFrame: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
if rows:
    new_rows = pd.DataFrame(rows, columns=columns)
    # Skipping the concat for an empty `df` avoids pandas' empty-frame dtype
    # warning; ignore_index gives the rows a clean 0..n-1 index.
    df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
# Last step: write the metadata table into the chapter's output directory,
# without the DataFrame index column.
csv_path = "/content/gdrive/My Drive/TASR/kitaplar/teneke_out_{}/sample.csv".format(chapter)
df.to_csv(csv_path, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment