# Colab helper methods
# Clear console output, e.g. long install/import logs
from IPython.display import clear_output
# Show audio: Audio and display are imported from IPython.display (presumably for inline playback).
from IPython.display import Audio, display
# Load the file at audio_path with AudioSegment.from_file.
# NOTE(review): AudioSegment and audio_path are only defined further down in this file — confirm the intended cell order.
audio = AudioSegment.from_file(audio_path)
# Packages needed for audio processing
!pip install yt-dlp
!pip install pydub
# Create directory if it doesn't exist; exist_ok=True makes the call a no-op
# when the directory is already present.
# NOTE(review): base_path is not defined in the visible part of this file — set it before running.
import os
os.makedirs(base_path, exist_ok=True)
def create_clean_directory(directory_path):
    """Remove ``directory_path`` if it exists, then recreate it empty.

    Args:
        directory_path: Path of the directory to reset. Missing parent
            directories are created as needed.
    """
    import shutil  # local import: shutil is not imported at the top of the file

    # Remove and recreate the directory
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)
    os.makedirs(directory_path, exist_ok=True)
# List files in directory
import glob

# Every entry directly inside `directory`; glob returns each path with the
# directory prefix already attached.
segment_audio_paths = glob.glob(f"{directory}/*")

# Loop through all MP3 files in the folder
for filename in segment_audio_paths:
    if filename.endswith(".mp3"):
        # No os.path.join with the folder is needed: the glob result is
        # already a usable path.
        file_path = filename
# Load JSON file: read the whole text, then parse it
import json

with open("file.txt", 'r', encoding='utf-8') as json_file:
    result = json.loads(json_file.read())
import time
import edge_tts
async def text_to_speech(text, index, voice="hi-IN-MadhurNeural", rate=0, pitch=5):
    """Build an edge-tts ``Communicate`` request for ``text``.

    Args:
        text: Text to synthesize.
        index: Segment number supplied by the caller; accepted for
            compatibility but not used when building the request.
        voice: Voice short name, optionally followed by " - <description>";
            only the part before " - " is passed on.
        rate: Speaking-rate offset in percent (signed integer).
        pitch: Pitch offset in Hz (signed integer).

    Returns:
        The configured ``edge_tts.Communicate`` object. Nothing is
        written to disk by this function.
    """
    voice_short_name = voice.split(" - ")[0]
    # The explicit sign is part of the format: 0 -> "+0%", 5 -> "+5Hz"
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    return edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
# Make Dir if not Exist
save_audio_path = f"{base_path}/audio_segments"
os.makedirs(save_audio_path, exist_ok=True)
for index, text in enumerate(updated_segmeent_df['text_hindi']):
# print(f"Processing {index}",text)
await text_to_speech(text,index+1)
# Extract mel-spectrogram from audio using librosa
import librosa
def extract_mel_spectrogram(audio_path, n_mels=128, fmin=0, fmax=8000):
    """Return the log-scaled mel-spectrogram of an audio file.

    Args:
        audio_path: Path of the audio file to load.
        n_mels: Number of mel bands.
        fmin: Lowest frequency (Hz) of the mel filter bank.
        fmax: Highest frequency (Hz) of the mel filter bank.

    Returns:
        The mel power spectrogram converted to decibels, using the
        spectrogram's maximum as the reference.
    """
    import numpy as np  # local import: np is not imported in the visible part of the file

    # sr=None asks librosa.load to keep the file's own sampling rate
    y, sr = librosa.load(audio_path, sr=None)
    # The audio must be passed as y=...: recent librosa releases accept
    # melspectrogram arguments by keyword only.
    mel_spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    # Convert to log scale (log-mel spectrogram)
    return librosa.power_to_db(mel_spectrogram, ref=np.max)
# Loop through directory
# # Convert all downloaded audio to 16kHz mono wav files
# for file in os.listdir(playlist_path):
# if file.endswith('.wav'):
# file_path = os.path.join(playlist_path, file)
# audio = AudioSegment.from_wav(file_path)
# audio = audio.set_frame_rate(16000).set_channels(1)
# audio.export(file_path, format='wav')
!pip install git+
!apt-get install ffmpeg
!pip install pydub
import whisper
from pydub import AudioSegment
# Load the Whisper model (you can choose 'tiny', 'base', 'small', 'medium', 'large')
model = whisper.load_model("large")
# Run the model on raw_audio_path with task="translate" and the language set to "zh".
# NOTE(review): presumably this produces English text from Chinese speech — confirm against the Whisper docs.
# NOTE(review): raw_audio_path is not defined in the visible part of this file.
result = model.transcribe(raw_audio_path, task="translate", language="zh")
# Save the response as JSON text, keeping only the full text and the
# start/end/text fields of each segment
import json

filter_data = {
    'text': result['text'],
    'segments': [
        {'start': seg['start'], 'end': seg['end'], 'text': seg['text']}
        for seg in result['segments']
    ],
}
segments = filter_data['segments']

# ensure_ascii=False keeps non-ASCII characters readable in the output file
with open("filename.txt", 'w', encoding='utf-8') as out_file:
    json.dump(filter_data, out_file, ensure_ascii=False, indent=4)
# Google Speech Recognition
!pip install SpeechRecognition
# Experiment: split audio into 5-second chunks and try to match the spectrograms
import speech_recognition as sr
def transcribe_audio_with_timestamps(audio_file):
    """Transcribe an audio file with Google speech recognition (``hi-IN``).

    Despite the name, only the transcript text is returned; no
    timestamps are extracted.

    Args:
        audio_file: Path of an audio file readable by ``sr.AudioFile``.

    Returns:
        The transcript of the first alternative, or ``None`` when the
        audio could not be understood or the request failed.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)

    try:
        result = recognizer.recognize_google(audio, language="hi-IN", show_all=True)
    except sr.UnknownValueError:
        print("Speech Recognition could not understand audio.")
        return None
    except sr.RequestError:
        print("Could not request results.")
        return None

    # With show_all=True the raw response is returned as-is; when nothing
    # was recognized it is an empty list rather than a dict, so guard
    # before calling .get().
    alternatives = result.get('alternative') if isinstance(result, dict) else None
    if not alternatives:
        print("Speech Recognition could not understand audio.")
        return None
    return alternatives[0]['transcript']
# Transcribe a single pre-cut chunk (path under /content/drive, presumably a mounted Google Drive)
audio_path ="/content/drive/MyDrive/YoutubeDownloads/ProcessedAudioData/FYQWb2OBxY8/chunk_2.wav"
# response holds whatever transcribe_audio_with_timestamps returns for that chunk
response= transcribe_audio_with_timestamps(audio_path)
