Skip to content

Instantly share code, notes, and snippets.

@0187773933
Last active April 25, 2024 23:57
Show Gist options
  • Save 0187773933/1d5f4f127ec5986c1026788c1aff3e40 to your computer and use it in GitHub Desktop.
Runs Google MediaPipe YAMNet audio classification on an MP3 file
import numpy as np
from pprint import pprint
import librosa
import tensorflow as tf
# https://storage.googleapis.com/mediapipe-models/audio_classifier/yamnet/float32/latest/yamnet.tflite
# https://github.com/tensorflow/models/blob/master/research/audioset/yamnet/yamnet.py
# https://research.google.com/audioset/ontology/index.html
# https://storage.googleapis.com/mediapipe-tasks/audio_classifier/yamnet_label_list.txt
# https://github.com/tensorflow/models/blob/master/research/audioset/yamnet/params.py#L25
sample_rate = 16000.0
stft_window_seconds = 0.025
stft_hop_seconds = 0.010
mel_bands = 64
mel_min_hz = 125.0
mel_max_hz = 7500.0
log_offset = 0.001
# patch_window_seconds = 0.96 # or 0.975 ?
patch_window_seconds = 0.975
patch_hop_seconds = 0.48
interpreter = tf.lite.Interpreter( model_path='./yamnet.tflite' )
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
def read_text( file_path ):
with open( file_path ) as f:
return f.read().splitlines()
def process_audio_segment( segment ):
waveform = segment.astype( np.float32 )
interpreter.set_tensor( input_details[ 0 ][ "index" ] , waveform )
interpreter.invoke()
return interpreter.get_tensor( output_details[ 0 ][ "index" ] ).flatten()
if __name__ == "__main__":
model_labels = read_text( "./yamnet_label_list.txt" )
# classify
audio_path = './Tradition.mp3'
audio_data, sr = librosa.load( audio_path , sr=sample_rate )
samples_per_patch = ( patch_window_seconds * sample_rate )
samples_per_patch_50_percent_overlap = int( samples_per_patch / 2 )
results = []
duration_in_samples = len( audio_data )
range_end = int( duration_in_samples - ( samples_per_patch + 1 ) )
for start in range( 0 , range_end , samples_per_patch_50_percent_overlap ): # 50% overlap
segment = audio_data[ start: ( start + int( samples_per_patch ) ) ]
probabilities = process_audio_segment( segment )
results.append( probabilities )
total_results = len( results )
for i , result in enumerate( results ):
print( f"\nSection [{i+1}] of {total_results}" )
labeled_probabilities = list( zip( model_labels , result ) )
labeled_probabilities = [ pair for pair in labeled_probabilities if pair[ 1 ] > 0 ]
labeled_probabilities.sort( key=lambda x: x[ 1 ] , reverse=True )
for label, probability in labeled_probabilities[ : 19 ]:
print( f"{label}: {probability}" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment