Tatiana Tikhonova tikhonova

## text_generation_with_aitextgen
ai.train('result.txt',
         line_by_line=False,
         from_cache=False,
         num_steps=100000,
         generate_every=5000,
         save_every=5000,
         save_gdrive=True,
         learning_rate=1e-3,
         fp16=False,
         batch_size=1,

## debugging_errors.py
''' ffmpeg not found when using pydub utils
___
If using Windows, download ffmpeg from the official website and add it to PATH, then restart Git Bash.
https://github.com/jiaaro/pydub/issues/348
'''

''' AssertionError: Distributed mode requires CUDA
___
A MUST-read to confirm that both the GPU and its drivers support the CUDA version you've installed (or are about to install):
https://stackoverflow.com/questions/60987997/why-torch-cuda-is-available-returns-false-even-after-installing-pytorch-with

## tacotron_hparams.py
# Attention parameters
attention_rnn_dim=1024,  # sets the number of units in the RNN
attention_dim=128,  # sets the number of units in the attention mechanism
#  These two values are relatively large and may require a significant amount of GPU memory during training and inference.

# Location Layer parameters
attention_location_n_filters=32,  # sets the number of filters in the CNN
attention_location_kernel_size=31,  # sets the size of the filters
# This means that the CNN has 32 filters and each filter has a kernel size of 31.

## 5_audio_transcript_cleanup_filelists_setup.py .py
''' Make metadata.csv and filelists via https://jaimeleal.github.io/how-to-speech-synthesis '''

import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Hard-coded, machine-specific directory that is scanned below
# (presumably holds the transcript files; confirm).
filepath = 'E:/AlanWatts/dataset/transcripts2/'
# Names of every entry found directly inside that directory.
files = os.listdir(filepath)
# Starts empty; presumably collects one record per transcript for metadata.csv (TODO confirm).
rows = []

## 4_audio_Google_transcribe_with_multiprocessing.py
# snippet of instantiating a client
client = speech.SpeechClient()
# Recognition settings shared by every request: LINEAR16 encoding, 22050 Hz,
# a single audio channel, US English.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=22050,
    audio_channel_count=1,
    model='phone_call',  # recognizes low quality audio better than default
    use_enhanced=True,  # boolean flag (was the int 1); use the enhanced model if available
    language_code="en-US")


## 3_audio_split_into_segments.py
# using AudioSegment

    def get_duration(self):
        """Return the ``duration_seconds`` value of ``self.audio``."""
        audio = self.audio
        return audio.duration_seconds

    def single_split(self, from_min, to_min, split_filename):
        """Slice ``self.audio`` between two offsets and resample the slice to 22050 Hz.

        NOTE(review): ``split_filename`` is not used in the lines visible here;
        presumably the resampled clip is exported under that name further down
        in the full file. Confirm.
        """
        # NOTE(review): the parameter names say minutes, but the factor is
        # 7 * 1000, i.e. each unit is 7 seconds (a minute would be 60 * 1000).
        # Confirm that 7-second segments are intended.
        t1 = from_min * 7 * 1000  # convert to milliseconds
        t2 = to_min * 7 * 1000
        split_audio = self.audio[t1:t2]
        resampled = split_audio.set_frame_rate(22050)  # change the sampling rate to 22050 Hz

## 2_remove_silence.py
def remove_sil(file_path: str, file: str, dest_path: str, format="wav"):
    sound = AudioSegment.from_file(os.path.join(file_path, file), format=format)
    non_sil_times = detect_nonsilent(sound, min_silence_len=50, silence_thresh=sound.dBFS * 1.5)
    if len(non_sil_times) == 0:
        return None
    elif len(non_sil_times) > 0:
        non_sil_times_concat = [non_sil_times[0]]
        if len(non_sil_times) > 1:
            for t in non_sil_times[1:]:
                if t[0] - non_sil_times_concat[-1][-1] < 200:

## 1_reduce_noise.py
def reduce_noise(file_path: str, file: str, dest_path: str) -> None:
    """Denoise one WAV file and write the result, under the same name, to ``dest_path``.

    Args:
        file_path: Directory that contains the input file.
        file: Name of the WAV file to process.
        dest_path: Directory that receives the denoised file.

    A failure while writing the output is logged and skipped rather than
    raised, so one bad file does not abort a batch run.
    """
    import logging

    # load data
    rate, data = wavfile.read(os.path.join(file_path, file))

    # perform noise reduction
    reduced_noise = nr.reduce_noise(y=data, sr=rate)

    target = os.path.join(dest_path, file)
    try:
        wavfile.write(target, rate, reduced_noise)
    except Exception as exc:
        # The original `except Exception():` built an exception *instance*,
        # which is not a valid thing to catch: any write error turned into a
        # TypeError. Catch the class and report the skipped file instead.
        logging.getLogger(__name__).warning("Could not write %s: %s", target, exc)

## 0_audio_convert_to_wav.py
def convert_audio_to_wav(filename: str, filepath: str, dest_path: str) -> None:
    """Convert one audio file to WAV and write it to ``dest_path``.

    Args:
        filename: Name of the source file inside ``filepath``.
        filepath: Directory that contains the source file.
        dest_path: Directory that receives ``<stem>.wav``.
    """
    source_path = os.path.join(filepath, filename)
    # splitext removes whatever extension is present; slicing off the last four
    # characters was only correct for three-letter extensions such as ".mp3".
    stem = os.path.splitext(filename)[0]
    dest_filepath = os.path.join(dest_path, f"{stem}.wav")
    given_audio = AudioSegment.from_file(source_path, format="mp3")  # replace with mp4 or avi
    given_audio.export(dest_filepath, format="wav")

# create a list of input arguments for the function, skipping files that were
# already converted. Outputs are written as "<stem>.wav", so compare stems
# (a source "a.mp3" never equals an output "a.wav"), and list the destination
# directory once instead of once per candidate file.
already_converted = {os.path.splitext(name)[0] for name in os.listdir(dest_path)}
inputs = [
    (filename, filepath, dest_path)
    for filename in os.listdir(filepath)
    if os.path.splitext(filename)[0] not in already_converted
]

# create a Process pool with 16 worker processes

## us_state_abbrev.py
# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.

us_state_abbrev = {
	ai.train('result.txt',
	line_by_line=False,
	from_cache=False,
	num_steps=100000,
	generate_every=5000,
	save_every=5000,
	save_gdrive=True,
	learning_rate=1e-3,
	fp16=False,
	batch_size=1,
	''' ffmpeg not found when using pydub utils
	___
	If using Win, need to download from the official website and add to path, then reload git bash.
	https://github.com/jiaaro/pydub/issues/348
	'''

	''' AssertionError: Distributed mode requires CUDA
	___
	a MUST-read to confirm that both GPU and drivers support the CUDA version you've installed (or about to install):
	https://stackoverflow.com/questions/60987997/why-torch-cuda-is-available-returns-false-even-after-installing-pytorch-with
	# Attention parameters
	attention_rnn_dim=1024, # sets the number of units in the RNN
	attention_dim=128, # sets the number of units in the attention mechanism
	# These two values are relatively large and may require a significant amount of GPU memory during training and inference.

	# Location Layer parameters
	attention_location_n_filters=32, # sets the number of filters in the CNN
	attention_location_kernel_size=31, # sets the size of the filters
	# This means that the CNN has 32 filters and each filter has a kernel size of 31.
	''' Make metadata.csv and filelists via https://jaimeleal.github.io/how-to-speech-synthesis '''

	import os
	import pandas as pd
	from sklearn.model_selection import train_test_split
	import numpy as np

	filepath = 'E:/AlanWatts/dataset/transcripts2/'
	files = os.listdir(filepath)
	rows = []
	# snippet of instantiating a client
	client = speech.SpeechClient()
	config = speech.RecognitionConfig(
	encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
	sample_rate_hertz=22050,
	audio_channel_count=1,
	model='phone_call', # recognizes low quality audio better than default
	use_enhanced=1, # if available
	language_code="en-US")
	# using AudioSegment

	def get_duration(self):
	return self.audio.duration_seconds

	def single_split(self, from_min, to_min, split_filename):
	t1 = from_min * 7 * 1000 # convert to milliseconds
	t2 = to_min * 7 * 1000
	split_audio = self.audio[t1:t2]
	resampled = split_audio.set_frame_rate(22050) # change the sampling rate to 22050 Hz
	def remove_sil(file_path: str, file: str, dest_path: str, format="wav"):
	sound = AudioSegment.from_file(os.path.join(file_path, file), format=format)
	non_sil_times = detect_nonsilent(sound, min_silence_len=50, silence_thresh=sound.dBFS * 1.5)
	if len(non_sil_times) == 0:
	return None
	elif len(non_sil_times) > 0:
	non_sil_times_concat = [non_sil_times[0]]
	if len(non_sil_times) > 1:
	for t in non_sil_times[1:]:
	if t[0] - non_sil_times_concat[-1][-1] < 200:
	def reduce_noise(file_path: str, file: str, dest_path: str) -> None:
	# load data
	rate, data = wavfile.read(os.path.join(file_path, file))

	reduced_noise = nr.reduce_noise(y=data, sr=rate)
	# perform noise reduction
	try:
	wavfile.write(os.path.join(dest_path, file), rate, reduced_noise)
	except Exception():
	pass
	def convert_audio_to_wav(filename: str, filepath: str, dest_path: str) -> None:
	filepath = os.path.join(filepath, filename)
	dest_filepath = os.path.join(dest_path, f"{filename[:-4]}.wav")
	given_audio = AudioSegment.from_file(filepath, format="mp3") # replace with mp4 or avi
	given_audio.export(dest_filepath, format="wav")

	# create a list of input arguments for the function
	inputs = [(filename, filepath, dest_path) for filename in os.listdir(filepath) if filename not in os.listdir(dest_path)]

	# create a Process pool with 16 worker processes
	# United States of America Python Dictionary to translate States,
	# Districts & Territories to Two-Letter codes and vice versa.
	#
	# https://gist.github.com/rogerallen/1583593
	#
	# Dedicated to the public domain. To the extent possible under law,
	# Roger Allen has waived all copyright and related or neighboring
	# rights to this code.

	us_state_abbrev = {