Colab helper methods
# Clear console output, e.g. large import/log spam
from IPython.display import clear_output
clear_output(wait=True)

# Play an audio file inline in the notebook
from IPython.display import Audio, display
display(Audio("path.wav"))

# Load an audio file with pydub (works with any format ffmpeg supports)
from pydub import AudioSegment
audio = AudioSegment.from_file(audio_path)
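# A pydub AudioSegment can also be played inline by exporting it to WAV bytes
# first; a minimal sketch (IPython's Audio accepts raw audio-file bytes):
display(Audio(audio.export(format="wav").read()))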
---------------------------------------------------
# Packages needed for audio processing
!pip install yt-dlp
!pip install pydub
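# yt-dlp isn't used elsewhere in this gist; a minimal sketch of downloading
# YouTube audio as WAV (the URL and output template are placeholders):
import yt_dlp

ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': '%(id)s.%(ext)s',
    'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}],
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=VIDEO_ID'])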
------------------------------------------------------------------------------------------------
# Create a directory if it doesn't exist
import os
import shutil

os.makedirs(base_path, exist_ok=True)

def create_clean_directory(directory_path):
    # Remove and recreate the directory so it starts out empty
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)
    os.makedirs(directory_path, exist_ok=True)

# List files in a directory
import glob
segment_audio_paths = glob.glob(f"{directory}/*")  # all files in the directory
print("segment_audio_paths", segment_audio_paths)

# Loop through all MP3 files in the folder
for filename in segment_audio_paths:
    if filename.endswith(".mp3"):
        # file_path = os.path.join(folder_path, filename)
        ...
--------------------------------------------------------------------------------------------
# Load a JSON file
import json

with open("file.txt", 'r', encoding='utf-8') as f:
    result = json.load(f)
result['segments'][:10]
--------------------------------------------------------------------------------------------
import time
import edge_tts

async def text_to_speech(text, index):
    voice_short_name = "hi-IN-MadhurNeural"
    rate_str = f"{0:+d}%"    # speaking-rate offset
    pitch_str = f"{5:+d}Hz"  # pitch offset
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    await communicate.save(f"{save_audio_path}/{index}_audio.mp3")
    return communicate

# Make the output directory if it doesn't exist
save_audio_path = f"{base_path}/audio_segments"
os.makedirs(save_audio_path, exist_ok=True)

for index, text in enumerate(updated_segmeent_df['text_hindi']):
    # print(f"Processing {index}", text)
    await text_to_speech(text, index + 1)
    time.sleep(1)
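# To find other voice short names, edge_tts exposes an async list_voices()
# helper; a small sketch (filtering to Hindi voices is just an example):
voices = await edge_tts.list_voices()
for v in voices:
    if v["ShortName"].startswith("hi-IN"):
        print(v["ShortName"], v["Gender"])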
----------------------------------------------------------------------------------------------------
# Extract mel-spectrogram from audio using librosa
import librosa
import numpy as np

def extract_mel_spectrogram(audio_path, n_mels=128, fmin=0, fmax=8000):
    # Load the audio file at its native sample rate
    y, sr = librosa.load(audio_path, sr=None)
    # Compute the mel-spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax)
    # Convert to log scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    return log_mel_spectrogram
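# A quick way to eyeball the result; matplotlib ships with Colab
# (a sketch, the path is a placeholder):
import matplotlib.pyplot as plt
import librosa.display

log_mel = extract_mel_spectrogram("path.wav")
librosa.display.specshow(log_mel, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Log-mel spectrogram')
plt.show()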
---------------------------------------------------------
# Loop through a directory: convert all downloaded audio to 16 kHz mono WAV files
# for file in os.listdir(playlist_path):
#     if file.endswith('.wav'):
#         file_path = os.path.join(playlist_path, file)
#         audio = AudioSegment.from_wav(file_path)
#         audio = audio.set_frame_rate(16000).set_channels(1)
#         audio.export(file_path, format='wav')
------------------------------------------------------
!pip install git+https://github.com/openai/whisper.git
!apt-get install ffmpeg
!pip install pydub

import whisper
from pydub import AudioSegment

# Load the Whisper model (choose 'tiny', 'base', 'small', 'medium', or 'large')
model = whisper.load_model("large")
result = model.transcribe(raw_audio_path, task="translate", language="zh")
result

# Save the response to a JSON file
import json
filter_data = {}
filter_data['text'] = result['text']
segments = []
for segment in result['segments']:
    segments.append({'start': segment['start'], 'end': segment['end'], 'text': segment['text']})
filter_data['segments'] = segments
with open("filename.txt", 'w', encoding='utf-8') as f:
    json.dump(filter_data, f, ensure_ascii=False, indent=4)
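# One way to use those timestamps: slice the source audio into per-segment
# clips with pydub (the clips/ output directory here is an assumption):
audio = AudioSegment.from_file(raw_audio_path)
os.makedirs("clips", exist_ok=True)
for i, seg in enumerate(filter_data['segments']):
    clip = audio[int(seg['start'] * 1000):int(seg['end'] * 1000)]  # pydub slices in ms
    clip.export(f"clips/segment_{i}.wav", format="wav")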
-------------------------------------------------------
# Google Speech Recognition
!pip install SpeechRecognition
# Experiment: chunk audio into 5-second chunks and try to match the spectrograms
import speech_recognition as sr

def transcribe_audio_with_timestamps(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        result = recognizer.recognize_google(audio, language="hi-IN", show_all=True)
        return result.get('alternative')[0]['transcript']
    except sr.UnknownValueError:
        print("Speech Recognition could not understand audio.")
    except sr.RequestError:
        print("Could not request results.")

audio_path = "/content/drive/MyDrive/YoutubeDownloads/ProcessedAudioData/FYQWb2OBxY8/chunk_2.wav"
response = transcribe_audio_with_timestamps(audio_path)
print(response)
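# The 5-second chunking experiment mentioned above, sketched with pydub
# (chunk length and output file names are assumptions):
from pydub import AudioSegment
audio = AudioSegment.from_wav(audio_path)
chunk_ms = 5000
for i in range(0, len(audio), chunk_ms):  # len(audio) is in milliseconds
    chunk = audio[i:i + chunk_ms]
    chunk_path = f"chunk_{i // chunk_ms}.wav"
    chunk.export(chunk_path, format="wav")
    print(chunk_path, transcribe_audio_with_timestamps(chunk_path))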