thisismattmiller/gist:8a894a1296f6e4db7d1c4c4bafcb07bd

## gistfile1.txt
import json
import random
from pydub import AudioSegment
import base64
import os


def match_target_amplitude(sound, target_dBFS):
    """Return a copy of *sound* whose average loudness equals *target_dBFS*.

    Args:
        sound: Object exposing a ``dBFS`` attribute and an ``apply_gain(db)``
            method (in this script, a pydub ``AudioSegment``).
        target_dBFS: Desired loudness in dBFS, e.g. ``-15.0``.

    Returns:
        The result of ``sound.apply_gain(...)``, or *sound* itself when it is
        completely silent and therefore cannot be normalised.
    """
    current_dBFS = sound.dBFS
    # pydub reports -inf dBFS for silent (or empty) audio; the required gain
    # would then be +inf, which is meaningless, so leave the sound untouched.
    if current_dBFS == float('-inf'):
        return sound
    change_in_dBFS = target_dBFS - current_dBFS
    return sound.apply_gain(change_in_dBFS)

# Tunable limits for one extraction run.
MAX_CLIPS_PER_GRAM = 100  # cap on sampled files and on exported clips per n-gram
CLIP_PADDING_MS = 50  # extra audio kept before and after the matched words
TARGET_DBFS = -15.0  # loudness every exported clip is normalised to

# `data` is keyed by n-gram; each value carries a 'files' list of mp3 paths.
with open('do_the_damn_gram.json') as gram_file:
	data = json.load(gram_file)


# loop through each n-gram
for g in data:
	g_count = 0

	# Lower-cased words of the n-gram, computed once per n-gram.
	gram_words = [word.lower() for word in g.split(' ')]
	gram_len = len(gram_words)
	clip_stem = g.replace(' ', '_')

	# pick a random sample of the files, or all of them if there are few enough
	files = data[g]['files']
	if len(files) > MAX_CLIPS_PER_GRAM:
		files = random.sample(files, MAX_CLIPS_PER_GRAM)

	all_file_meta = []

	# loop through each mp3 that was recorded as having an occurrence of the n-gram
	for f in files:

		# Nothing more will be kept once the cap is reached, so stop reading files.
		if len(all_file_meta) >= MAX_CLIPS_PER_GRAM:
			break

		transcript_path = f.replace('lc-gov-audio-data/','lc-gov-audio-transcripts-ner/').replace('.mp3','.json')
		with open(transcript_path) as transcript_file:
			metadata = json.load(transcript_file)

		# The audio is decoded lazily, only if this transcript really contains a match.
		mp3 = None

		# keep only the spoken items, dropping punctuation entries
		no_punctuation_items = [x for x in metadata['results']['items'] if x['type'] != 'punctuation']

		# Slide a window of gram_len items over the transcript; windows that
		# would run past the end are never generated.
		for idx in range(len(no_punctuation_items) - gram_len + 1):

			if len(all_file_meta) >= MAX_CLIPS_PER_GRAM:
				break

			found_items = no_punctuation_items[idx:idx + gram_len]
			spoken_words = [item['alternatives'][0]['content'].lower() for item in found_items]

			# Case-insensitive, whole-word match of every n-gram word in order.
			if spoken_words != gram_words:
				continue

			meta = {'word':g, 'mp3':None,'timing':[],'parts':list(found_items), 'file':f.replace('lc-gov-audio-data/','').replace('.mp3','')}

			print(g, len(all_file_meta))
			print(found_items)
			print('-------')

			# load the mp3
			if mp3 is None:
				mp3 = AudioSegment.from_mp3(f)

			print(len(mp3))

			first_start = float(found_items[0]['start_time'])

			# Clip bounds in milliseconds, padded on both sides. The start is
			# clamped at 0 because pydub treats a negative slice start as
			# "from the end", which would yield the wrong (or an empty) clip.
			st = max(0.0, first_start * 1000 - CLIP_PADDING_MS)
			sp = float(found_items[-1]['end_time']) * 1000 + CLIP_PADDING_MS

			# Offsets (ms) of each word relative to the first word's start
			# (not to the padded clip start).
			for a_found_item in found_items:
				start = int((float(a_found_item['start_time']) - first_start) * 1000)
				print(a_found_item)
				print(start)
				meta['timing'].append(start)

			print(st,sp)
			print('----')
			clip = match_target_amplitude(mp3[st:sp], TARGET_DBFS)

			# Round-trip through a temporary mp3 file to obtain the encoded
			# bytes, then embed them as base64 text in the metadata.
			g_count += 1
			tmp_name = f"{clip_stem}_{g_count}.mp3"
			clip.export(tmp_name, format="mp3")
			try:
				with open(tmp_name, "rb") as tmp_file:
					encoded = base64.b64encode(tmp_file.read())
			finally:
				# Always clean up the temporary clip, even if reading failed.
				os.remove(tmp_name)
			meta['mp3'] = encoded.decode('utf-8')

			all_file_meta.append(meta)

	# One JSON file per n-gram holding every collected clip.
	with open(f"{clip_stem}.json", 'w') as out_file:
		json.dump(all_file_meta, out_file, indent=2)
	import json
	import random
	from pydub import AudioSegment
	import base64
	import os


	# NOTE(review): this repeats the match_target_amplitude defined earlier in
	# the file; the body had lost its indentation and is restored here.
	def match_target_amplitude(sound, target_dBFS):
		"""Return a copy of *sound* whose average loudness equals *target_dBFS*.

		Args:
			sound: Object exposing a ``dBFS`` attribute and an
				``apply_gain(db)`` method (in this script, a pydub
				``AudioSegment``).
			target_dBFS: Desired loudness in dBFS, e.g. ``-15.0``.

		Returns:
			The result of ``sound.apply_gain(...)``, or *sound* itself when
			it is completely silent and therefore cannot be normalised.
		"""
		current_dBFS = sound.dBFS
		# pydub reports -inf dBFS for silent (or empty) audio; the required
		# gain would then be +inf, so leave the sound untouched.
		if current_dBFS == float('-inf'):
			return sound
		change_in_dBFS = target_dBFS - current_dBFS
		return sound.apply_gain(change_in_dBFS)

	# NOTE(review): from here on the file repeats the extraction script that
	# appears earlier (same statements, in the same order). The nested
	# indentation had been flattened; it is restored below. Confirm whether
	# this second copy is intentional.

	# Tunable limits for one extraction run.
	MAX_CLIPS_PER_GRAM = 100  # cap on sampled files and on exported clips per n-gram
	CLIP_PADDING_MS = 50  # extra audio kept before and after the matched words
	TARGET_DBFS = -15.0  # loudness every exported clip is normalised to

	# `data` is keyed by n-gram; each value carries a 'files' list of mp3 paths.
	with open('do_the_damn_gram.json') as gram_file:
		data = json.load(gram_file)


	# loop through each n-gram
	for g in data:
		g_count = 0

		# Lower-cased words of the n-gram, computed once per n-gram.
		gram_words = [word.lower() for word in g.split(' ')]
		gram_len = len(gram_words)
		clip_stem = g.replace(' ', '_')

		# pick a random sample of the files, or all of them if there are few enough
		files = data[g]['files']
		if len(files) > MAX_CLIPS_PER_GRAM:
			files = random.sample(files, MAX_CLIPS_PER_GRAM)

		all_file_meta = []

		# loop through each mp3 that was recorded as having an occurrence of the n-gram
		for f in files:

			# Nothing more will be kept once the cap is reached, so stop reading files.
			if len(all_file_meta) >= MAX_CLIPS_PER_GRAM:
				break

			transcript_path = f.replace('lc-gov-audio-data/','lc-gov-audio-transcripts-ner/').replace('.mp3','.json')
			with open(transcript_path) as transcript_file:
				metadata = json.load(transcript_file)

			# The audio is decoded lazily, only if this transcript really contains a match.
			mp3 = None

			# keep only the spoken items, dropping punctuation entries
			no_punctuation_items = [x for x in metadata['results']['items'] if x['type'] != 'punctuation']

			# Slide a window of gram_len items over the transcript; windows
			# that would run past the end are never generated.
			for idx in range(len(no_punctuation_items) - gram_len + 1):

				if len(all_file_meta) >= MAX_CLIPS_PER_GRAM:
					break

				found_items = no_punctuation_items[idx:idx + gram_len]
				spoken_words = [item['alternatives'][0]['content'].lower() for item in found_items]

				# Case-insensitive, whole-word match of every n-gram word in order.
				if spoken_words != gram_words:
					continue

				meta = {'word':g, 'mp3':None,'timing':[],'parts':list(found_items), 'file':f.replace('lc-gov-audio-data/','').replace('.mp3','')}

				print(g, len(all_file_meta))
				print(found_items)
				print('-------')

				# load the mp3
				if mp3 is None:
					mp3 = AudioSegment.from_mp3(f)

				print(len(mp3))

				first_start = float(found_items[0]['start_time'])

				# Clip bounds in milliseconds, padded on both sides. The start
				# is clamped at 0 because pydub treats a negative slice start
				# as "from the end", which would yield the wrong (or an empty) clip.
				st = max(0.0, first_start * 1000 - CLIP_PADDING_MS)
				sp = float(found_items[-1]['end_time']) * 1000 + CLIP_PADDING_MS

				# Offsets (ms) of each word relative to the first word's start
				# (not to the padded clip start).
				for a_found_item in found_items:
					start = int((float(a_found_item['start_time']) - first_start) * 1000)
					print(a_found_item)
					print(start)
					meta['timing'].append(start)

				print(st,sp)
				print('----')
				clip = match_target_amplitude(mp3[st:sp], TARGET_DBFS)

				# Round-trip through a temporary mp3 file to obtain the encoded
				# bytes, then embed them as base64 text in the metadata.
				g_count += 1
				tmp_name = f"{clip_stem}_{g_count}.mp3"
				clip.export(tmp_name, format="mp3")
				try:
					with open(tmp_name, "rb") as tmp_file:
						encoded = base64.b64encode(tmp_file.read())
				finally:
					# Always clean up the temporary clip, even if reading failed.
					os.remove(tmp_name)
				meta['mp3'] = encoded.decode('utf-8')

				all_file_meta.append(meta)

		# One JSON file per n-gram holding every collected clip.
		with open(f"{clip_stem}.json", 'w') as out_file:
			json.dump(all_file_meta, out_file, indent=2)