Created
May 17, 2019 02:09
-
-
Save thisismattmiller/8a894a1296f6e4db7d1c4c4bafcb07bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import random | |
from pydub import AudioSegment | |
import base64 | |
import os | |
def match_target_amplitude(sound, target_dBFS):
    """Return ``sound`` gain-adjusted so that its loudness equals ``target_dBFS``.

    Args:
        sound: Audio object exposing a ``dBFS`` attribute and an
            ``apply_gain(gain_in_dB)`` method (this script passes a pydub
            ``AudioSegment``).
        target_dBFS: Desired loudness, in dBFS.

    Returns:
        The result of ``sound.apply_gain`` for the gain needed to reach the
        target, or ``sound`` itself when it is completely silent.
    """
    current_dBFS = sound.dBFS
    # pydub reports -inf dBFS for a silent segment. The required gain would
    # then be +inf, which cannot be applied meaningfully, so leave it alone.
    if current_dBFS == float("-inf"):
        return sound
    return sound.apply_gain(target_dBFS - current_dBFS)
# Tunable constants for clip extraction.
MAX_CLIPS_PER_GRAM = 100  # cap on sampled files AND on exported clips per n-gram
CLIP_PADDING_MS = 50      # extra audio kept on each side of the matched words
TARGET_DBFS = -15.0       # loudness every exported clip is normalized to


def _load_json(path):
    """Parse the JSON file at ``path`` and close the handle afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def _match_at(words, items, start):
    """Return the run of ``items`` spelling ``words`` from ``start``, else None.

    ``words`` must already be lower-cased. Each item is compared through
    ``item['alternatives'][0]['content']``, case-insensitively, and every word
    has to match the item at the same offset for the run to count.
    """
    window = items[start:start + len(words)]
    if len(window) < len(words):
        # Not enough items left in the transcript to hold the whole n-gram.
        return None
    for word, item in zip(words, window):
        if item['alternatives'][0]['content'].lower() != word:
            return None
    return window


def _encode_clip_as_base64_mp3(clip, tmp_path):
    """Export ``clip`` as MP3 via ``tmp_path`` and return it base64-encoded.

    The temporary file is removed even when exporting or reading fails.
    """
    try:
        # export() returns an open handle on the output file; close it so the
        # file can be reopened for reading and deleted afterwards.
        exported = clip.export(tmp_path, format="mp3")
        exported.close()
        with open(tmp_path, "rb") as handle:
            return base64.b64encode(handle.read()).decode('utf-8')
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


data = _load_json('do_the_damn_gram.json')

# loop through each n-gram
for g in data:
    g_count = 0
    gram_words = [word.lower() for word in g.split(' ')]
    out_stem = g.replace(' ', '_')

    # pick a random MAX_CLIPS_PER_GRAM files, or all of them if there are fewer
    candidates = data[g]['files']
    if len(candidates) > MAX_CLIPS_PER_GRAM:
        files = random.sample(candidates, MAX_CLIPS_PER_GRAM)
    else:
        files = candidates

    all_file_meta = []

    # loop through each mp3 that was recorded as having an occurrence of the n-gram
    for f in files:
        if len(all_file_meta) >= MAX_CLIPS_PER_GRAM:
            # Cap reached: nothing further would be exported for this n-gram.
            break

        transcript_path = (
            f.replace('lc-gov-audio-data/', 'lc-gov-audio-transcripts-ner/')
            .replace('.mp3', '.json')
        )
        metadata = _load_json(transcript_path)

        # The audio is decoded lazily, only once a match is found in this file.
        mp3 = None

        # keep just the spoken items, no punctuation
        no_punctuation_items = [
            x for x in metadata['results']['items'] if x['type'] != 'punctuation'
        ]

        for idx in range(len(no_punctuation_items)):
            if len(all_file_meta) >= MAX_CLIPS_PER_GRAM:
                break

            found_items = _match_at(gram_words, no_punctuation_items, idx)
            if found_items is None:
                continue

            print(g, len(all_file_meta))
            print(found_items)
            print('-------')

            # load the mp3
            if mp3 is None:
                mp3 = AudioSegment.from_mp3(f)
                print(len(mp3))

            first_start = float(found_items[0]['start_time'])
            # Pad the clip on both sides. The start is clamped at 0 because a
            # negative slice start is interpreted relative to the END of the
            # audio, which would yield an empty or wrong clip.
            st = max(0, first_start * 1000 - CLIP_PADDING_MS)
            sp = float(found_items[-1]['end_time']) * 1000 + CLIP_PADDING_MS

            # Word offsets in ms, measured from the first word's start time
            # (not from the padded start of the clip).
            timing = []
            for a_found_item in found_items:
                start = int((float(a_found_item['start_time']) - first_start) * 1000)
                print(a_found_item)
                print(start)
                timing.append(start)

            print(st, sp)
            print('----')

            clip = match_target_amplitude(mp3[st:sp], TARGET_DBFS)

            g_count += 1
            encoded_mp3 = _encode_clip_as_base64_mp3(clip, f"{out_stem}_{g_count}.mp3")

            all_file_meta.append({
                'word': g,
                'mp3': encoded_mp3,
                'timing': timing,
                'parts': found_items.copy(),
                'file': f.replace('lc-gov-audio-data/', '').replace('.mp3', ''),
            })

    # One JSON file per n-gram, written even when no clip was produced.
    with open(f"{out_stem}.json", 'w') as out_file:
        json.dump(all_file_meta, out_file, indent=2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment