# Colab helper methods
# Clear console output, e.g., large import-statement logs
from IPython.display import clear_output
clear_output(wait=True)
# Show an inline audio player
from IPython.display import Audio, display
from pydub import AudioSegment

audio = AudioSegment.from_file(audio_path)  # load with pydub if the audio needs editing first
display(Audio("path.wav"))
---------------------------------------------------
# Packages needed for audio processing
!pip install yt-dlp
!pip install pydub
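# Illustrative yt-dlp usage (not in the original gist): extract a video's
# audio track and convert it to WAV via ffmpeg; replace VIDEO_URL with a real URL.
!yt-dlp -x --audio-format wav -o "%(id)s.%(ext)s" VIDEO_URL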
------------------------------------------------------------------------------------------------
# Create directory if it doesn't exist
import os
import shutil

os.makedirs(base_path, exist_ok=True)

def create_clean_directory(directory_path):
    # Remove the directory if it exists, then recreate it empty
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)
    os.makedirs(directory_path, exist_ok=True)
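# Example usage (same path as the TTS snippet below):
create_clean_directory(f"{base_path}/audio_segments")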
# List files in a directory
import glob

segment_audio_paths = glob.glob(f"{directory}/*")  # list all files in the directory
print("segment_audio_paths", segment_audio_paths)

# Loop through all MP3 files in the folder (glob already returns full paths)
for file_path in segment_audio_paths:
    if file_path.endswith(".mp3"):
        pass  # process file_path here
--------------------------------------------------------------------------------------------
# Load a JSON file
import json

with open("file.txt", 'r', encoding='utf-8') as f:
    result = json.load(f)
result['segments'][:10]  # preview the first 10 segments
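# Each segment is a dict with 'start', 'end', and 'text' keys (matching the
# Whisper output saved below), so the fields can be read directly:
for segment in result['segments'][:3]:
    print(segment['start'], segment['end'], segment['text'])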
----------------------------------------------------------------------------------------------------
import time
import edge_tts

async def text_to_speech(text, index):
    voice_short_name = "hi-IN-MadhurNeural"
    rate_str = f"{0:+d}%"    # "+0%": default speaking rate
    pitch_str = f"{5:+d}Hz"  # "+5Hz": slightly raised pitch
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    await communicate.save(f"{save_audio_path}/{index}_audio.mp3")
    return communicate

# Make the directory if it doesn't exist
save_audio_path = f"{base_path}/audio_segments"
os.makedirs(save_audio_path, exist_ok=True)

for index, text in enumerate(updated_segment_df['text_hindi']):
    # print(f"Processing {index}", text)
    await text_to_speech(text, index + 1)
    time.sleep(1)
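# Top-level `await` only works in notebooks/Colab. A minimal sketch for a
# plain Python script (assumes the same updated_segment_df and
# text_to_speech defined above):
import asyncio

async def main():
    for index, text in enumerate(updated_segment_df['text_hindi']):
        await text_to_speech(text, index + 1)
        await asyncio.sleep(1)  # non-blocking pause between requests

asyncio.run(main())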
----------------------------------------------------------------------------------------------------
# Extract a mel spectrogram from audio using librosa
import librosa
import numpy as np

def extract_mel_spectrogram(audio_path, n_mels=128, fmin=0, fmax=8000):
    # Load the audio file at its native sample rate
    y, sr = librosa.load(audio_path, sr=None)
    # Compute the mel spectrogram (newer librosa requires keyword arguments)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax)
    # Convert to log scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    return log_mel_spectrogram
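# Quick visual check of the result (illustrative; assumes matplotlib, which
# Colab ships with, and an audio file at "path.wav"):
import matplotlib.pyplot as plt
import librosa.display

log_mel = extract_mel_spectrogram("path.wav")
librosa.display.specshow(log_mel, x_axis='time', y_axis='mel', fmax=8000)
plt.colorbar(format='%+2.0f dB')
plt.title('Log-mel spectrogram')
plt.show()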
---------------------------------------------------------
# Loop through a directory:
# convert all downloaded audio to 16 kHz mono WAV files
from pydub import AudioSegment

for file in os.listdir(playlist_path):
    if file.endswith('.wav'):
        file_path = os.path.join(playlist_path, file)
        audio = AudioSegment.from_wav(file_path)
        audio = audio.set_frame_rate(16000).set_channels(1)  # 16 kHz, mono
        audio.export(file_path, format='wav')
------------------------------------------------------
!pip install git+https://github.com/openai/whisper.git
!apt-get install -y ffmpeg
!pip install pydub

import whisper
from pydub import AudioSegment

# Load the Whisper model (choose 'tiny', 'base', 'small', 'medium', or 'large')
model = whisper.load_model("large")
result = model.transcribe(raw_audio_path, task="translate", language="zh")
result  # display the full transcription result

# Save the response to a raw file
import json

filter_data = {}
filter_data['text'] = result['text']
segments = []
for segment in result['segments']:
    segments.append({'start': segment['start'], 'end': segment['end'], 'text': segment['text']})
filter_data['segments'] = segments

with open("filename.txt", 'w', encoding='utf-8') as f:
    json.dump(filter_data, f, ensure_ascii=False, indent=4)
-------------------------------------------------------
# Google Speech
!pip install SpeechRecognition

# Experiment: chunk audio into 5-second chunks and try to match the
# spectrograms (see the chunking sketch after this snippet)
import speech_recognition as sr

def transcribe_audio_with_timestamps(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        # show_all=True returns the full response dict, or an empty list if
        # nothing was recognized
        result = recognizer.recognize_google(audio, language="hi-IN", show_all=True)
        if isinstance(result, dict):
            return result['alternative'][0]['transcript']
        print("No transcription returned.")
    except sr.UnknownValueError:
        print("Speech Recognition could not understand audio.")
    except sr.RequestError:
        print("Could not request results.")

audio_path = "/content/drive/MyDrive/YoutubeDownloads/ProcessedAudioData/FYQWb2OBxY8/chunk_2.wav"
response = transcribe_audio_with_timestamps(audio_path)
print(response)
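# A minimal sketch of the 5-second chunking mentioned above (assumptions:
# pydub is installed and audio_path points to a WAV file; chunk filenames
# are illustrative):
from pydub import AudioSegment

audio = AudioSegment.from_wav(audio_path)
chunk_ms = 5 * 1000  # pydub indexes audio in milliseconds
for i, start in enumerate(range(0, len(audio), chunk_ms)):
    audio[start:start + chunk_ms].export(f"chunk_{i}.wav", format="wav")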