# mustafaxfe/deepspeech_data.py

## deepspeech_data.py
# Build a DeepSpeech/Common-Voice style dataset (one mp3 clip per sentence
# plus a sample.csv index) from one audiobook chapter and its sync map.
#
# NOTE(review): AudioSegment, json, os, pd, book_name and chapter are not
# defined in the visible source -- presumably they come from earlier notebook
# cells / imports; confirm before running this as a standalone script.
# NOTE(review): the audio path below is hard-coded to one chapter while the
# sync-map path is built from book_name/chapter -- verify they match.

# Fragments that are not strictly longer than this (in milliseconds) are dropped.
MIN_FRAGMENT_MS = 400

# Column order of the generated CSV.
CSV_COLUMNS = ['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'duration']

# str.lower() is locale-independent: it maps "I" to "i" and U+0130 (dotted
# capital I) to "i" + U+0307.  Turkish needs "I" -> U+0131 (dotless i) and
# U+0130 -> "i", so those two letters are translated before lowercasing.
_TURKISH_UPPER_I = str.maketrans("I\u0130", "\u0131i")


def _turkish_lower(text):
    """Return *text* lowercased using the Turkish dotted/dotless I rules."""
    return text.translate(_TURKISH_UPPER_I).lower()


# First step: load the chapter audio and the sync map.
book = AudioSegment.from_mp3("/content/gdrive/My Drive/TASR/kitaplar/teneke/bolum1/Teneke_Yasar_Kemal_01.mp3")

with open('/content/gdrive/My Drive/TASR/kitaplar/{0}/{1}_output/tuned.json'.format(book_name, chapter), encoding="utf-8") as f:
    syncmap = json.load(f)

# Second step: cut one audio clip per sufficiently long fragment.
# 'begin'/'end' are multiplied by 1000 before slicing the audio, i.e. they
# are presumably seconds converted to the milliseconds used for slicing.
sentences = []
for fragment in syncmap['fragments']:
    begin_ms = float(fragment['begin']) * 1000
    end_ms = float(fragment['end']) * 1000
    if end_ms - begin_ms <= MIN_FRAGMENT_MS:
        continue
    if not fragment['lines']:
        # No transcript for this fragment: nothing usable to pair the audio with.
        continue
    sentences.append({"audio": book[begin_ms:end_ms], "text": fragment['lines'][0]})

# Third/fourth step: make sure the output directory exists, then export every
# clip and collect its CSV row.
out_dir = "/content/gdrive/My Drive/TASR/kitaplar/teneke_out_{}/".format(chapter)
os.makedirs(out_dir, exist_ok=True)

rows = []
for idx, sentence in enumerate(sentences):
    filename = "sample-" + str(idx) + ".mp3"
    sentence['audio'].export(out_dir + filename, format="mp3")
    rows.append({
        'filename': filename,
        'text': _turkish_lower(sentence['text']),
        'up_votes': 0,
        'down_votes': 0,
        'age': 0,
        'gender': "female",
        'accent': '',
        'duration': '',
    })

# Build the frame once; DataFrame.append was removed in pandas 2.0 and
# appending row by row copies the whole frame on every iteration.
df = pd.DataFrame(rows, columns=CSV_COLUMNS)

# Last step: write the index file next to the clips.
df.to_csv(out_dir + "sample.csv", index=False)