Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/8a894a1296f6e4db7d1c4c4bafcb07bd to your computer and use it in GitHub Desktop.
Save thisismattmiller/8a894a1296f6e4db7d1c4c4bafcb07bd to your computer and use it in GitHub Desktop.
import json
import random
from pydub import AudioSegment
import base64
import os
def match_target_amplitude(sound, target_dBFS):
    """Return *sound* with its gain shifted so its loudness equals *target_dBFS*.

    Args:
        sound: An object exposing a ``dBFS`` attribute and an
            ``apply_gain(db)`` method (a pydub ``AudioSegment`` in this script).
        target_dBFS: Desired loudness, in dBFS.

    Returns:
        Whatever ``sound.apply_gain`` returns, or *sound* itself, unchanged,
        when it is completely silent.
    """
    current_dBFS = sound.dBFS
    # A silent segment reports -inf dBFS; the required gain would then be
    # +inf, which cannot be applied meaningfully, so leave the audio alone.
    if current_dBFS == float("-inf"):
        return sound
    return sound.apply_gain(target_dBFS - current_dBFS)
# Upper bound on both the files sampled and the clips kept for one n-gram.
_MAX_CLIPS_PER_GRAM = 100
# Milliseconds of audio kept before the first word and after the last word.
_CLIP_PADDING_MS = 50
# Loudness, in dBFS, that every exported clip is normalised to.
_TARGET_DBFS = -15.0


def _spoken_items(transcript):
    """Return the entries of transcript['results']['items'] that are not punctuation."""
    return [
        item for item in transcript['results']['items']
        if item['type'] != 'punctuation'
    ]


def _find_ngram_runs(items, gram_words):
    """Yield every run of consecutive *items* whose words equal *gram_words*.

    Comparison is case-insensitive and uses each item's first alternative
    (``item['alternatives'][0]['content']``). Only complete runs are yielded,
    so a partial match at the very end of *items* is ignored.
    """
    wanted = [word.lower() for word in gram_words]
    width = len(wanted)
    for start in range(len(items) - width + 1):
        window = items[start:start + width]
        if all(
            item['alternatives'][0]['content'].lower() == word
            for item, word in zip(window, wanted)
        ):
            yield window


def _clip_bounds_ms(found_items, padding_ms=_CLIP_PADDING_MS):
    """Return (start, end) in milliseconds for the padded clip around *found_items*.

    The start is clamped at 0: a negative slice start would be interpreted
    relative to the END of the audio and produce an empty or wrong clip.
    """
    first_start = float(found_items[0]['start_time']) * 1000
    last_end = float(found_items[-1]['end_time']) * 1000
    return max(0.0, first_start - padding_ms), last_end + padding_ms


def _encode_clip_base64(clip, scratch_path):
    """Export *clip* as MP3 via *scratch_path* and return it as base64 text.

    The scratch file is removed even when exporting or reading fails.
    """
    try:
        handle = clip.export(scratch_path, format="mp3")
        # pydub hands back the still-open output file; close it so the
        # descriptor is not leaked (and the file can be removed on Windows).
        if hasattr(handle, 'close'):
            handle.close()
        with open(scratch_path, "rb") as fh:
            return base64.b64encode(fh.read()).decode('utf-8')
    finally:
        if os.path.exists(scratch_path):
            os.remove(scratch_path)


def _collect_clips(gram, files):
    """Cut up to _MAX_CLIPS_PER_GRAM clips of *gram* out of the given mp3 *files*.

    For each mp3 path the matching transcript is read from the parallel
    'lc-gov-audio-transcripts-ner/' tree (same name, '.json' extension).
    Returns a list of dicts with the keys 'word', 'mp3' (base64 text),
    'timing' (per-word offsets in ms from the first word's start), 'parts'
    (the matched transcript items) and 'file'.
    """
    gram_words = gram.split(' ')
    base_name = gram.replace(' ', '_')
    clips = []

    # loop through each mp3 that was recorded as having an occurrence of the n-gram
    for audio_path in files:
        if len(clips) >= _MAX_CLIPS_PER_GRAM:
            break  # cap reached, no point reading further transcripts

        transcript_path = audio_path.replace(
            'lc-gov-audio-data/', 'lc-gov-audio-transcripts-ner/'
        ).replace('.mp3', '.json')
        with open(transcript_path, encoding='utf-8') as fh:
            transcript = json.load(fh)

        audio = None  # decoded lazily: only files with a real match are loaded
        for found_items in _find_ngram_runs(_spoken_items(transcript), gram_words):
            if len(clips) >= _MAX_CLIPS_PER_GRAM:
                break

            print(gram, len(clips))
            print(found_items)
            print('-------')

            # load the mp3
            if audio is None:
                audio = AudioSegment.from_mp3(audio_path)
                print(len(audio))

            start_ms, end_ms = _clip_bounds_ms(found_items)

            # Offsets are measured from the first word's start time,
            # not from the padded start of the clip.
            origin = float(found_items[0]['start_time'])
            timing = []
            for item in found_items:
                offset = int((float(item['start_time']) - origin) * 1000)
                print(item)
                print(offset)
                timing.append(offset)

            print(start_ms, end_ms)
            print('----')

            clip = match_target_amplitude(audio[start_ms:end_ms], _TARGET_DBFS)
            scratch_path = f"{base_name}_{len(clips) + 1}.mp3"
            clips.append({
                'word': gram,
                'mp3': _encode_clip_base64(clip, scratch_path),
                'timing': timing,
                'parts': list(found_items),
                'file': audio_path.replace('lc-gov-audio-data/', '').replace('.mp3', ''),
            })

    return clips


def main():
    """Build one '<n-gram>.json' file of base64 audio clips per n-gram.

    Reads 'do_the_damn_gram.json' from the working directory; it maps each
    n-gram to a dict whose 'files' entry lists the mp3 paths to search.
    """
    with open('do_the_damn_gram.json', encoding='utf-8') as fh:
        data = json.load(fh)

    # loop through each n-gram
    for gram in data:
        # pick a random 100 files, or all of them if there are fewer
        candidates = data[gram]['files']
        if len(candidates) > _MAX_CLIPS_PER_GRAM:
            candidates = random.sample(candidates, _MAX_CLIPS_PER_GRAM)

        clips = _collect_clips(gram, candidates)
        with open(f"{gram.replace(' ','_')}.json", 'w') as fh:
            json.dump(clips, fh, indent=2)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment