bigsnarfdude/clean_transcripts.py

## clean_transcripts.py
import os
import sys

def clean_duplicates(file_name):
    """Return the lines of a text file with repeated lines removed.

    The file is read line by line with line endings kept. Only the first
    occurrence of each distinct line survives, in original order, and the
    surviving lines are joined with a single space (so each kept newline is
    followed by a space in the result).

    Args:
        file_name: Path of the text file to read.

    Returns:
        One string made of the unique lines joined by ``' '``; an empty
        string for an empty file.
    """
    with open(file_name, 'r') as f:
        # dict keys keep insertion order, so this drops later repeats in one
        # pass with O(1) lookups instead of scanning a list for every line.
        unique_lines = dict.fromkeys(f)
    return ' '.join(unique_lines)

def process_directory(directory):
    """Write a de-duplicated copy of every ``.txt`` entry in a directory.

    Each immediate entry of ``directory`` whose name ends with ``.txt`` is
    passed to ``clean_duplicates`` and the returned text is written next to
    it under the same name with ``.txt`` replaced by ``_cleaned.txt``.

    Args:
        directory: Path of the directory to scan (not recursive).
    """
    for file_name in os.listdir(directory):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory, file_name)
        cleaned_text = clean_duplicates(file_path)
        # Swap only the trailing extension. str.replace would also rewrite
        # any other '.txt' earlier in the path (e.g. a directory 'raw.txt.d'
        # or a file 'notes.txt.bak.txt').
        stem, _ = os.path.splitext(file_path)
        new_file_name = stem + '_cleaned.txt'
        with open(new_file_name, 'w') as f:
            f.write(cleaned_text)

if __name__ == "__main__":
    # The directory to clean is the first command-line argument; without it,
    # print a usage hint and stop with a non-zero exit status.
    if not sys.argv[1:]:
        print("Usage: python script_name.py <directory_path>")
        sys.exit(1)

    directory_path = sys.argv[1]
    process_directory(directory_path)

## process_transcripts.py
import sys
from llama_cpp import Llama
from prompt import PromptText
from utils import read_file_to_string
from utils import get_filename
from utils import slugify
from utils import split_file
import time

# The transcript to summarise is the first command-line argument.
if len(sys.argv) < 2:
    sys.exit("No input file found")
input_file_path = sys.argv[1]

# The summary is written to the current directory as
# <slugified input stem>_summary.txt.
# NOTE: the chunk files returned by split_file() are not removed by this script.
output_file_path = slugify(get_filename(input_file_path)) + '_summary.txt'

mixtral = "/home/vincent/development/llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf"
mistral = "/home/vincent/development/llama.cpp/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

prompt = PromptText().summary_text
file_chunks = split_file(input_file_path)

# Generation settings are identical for every chunk, so define them once.
content_word_max = 4096   # passed to Llama as n_ctx
gpu_layers = 14           # passed to Llama as n_gpu_layers
generated_response = 200  # max_tokens for each chunk's summary

# Build the model a single time and reuse it for every chunk; constructing
# Llama inside the loop reloaded the model file on each iteration.
llm = Llama(model_path=mixtral, n_ctx=content_word_max, n_gpu_layers=gpu_layers)

holder = []
for item in file_chunks:
    content = read_file_to_string(item)
    # Prompt template + chunk text + closing instruction tag.
    summary_submission = prompt + content + '[/INST]'
    output = llm(summary_submission, max_tokens=generated_response, stop=["</s>"])
    holder.append(output['choices'][0]['text'])

# Drop the reference to the model before writing the result.
llm = None

# Per-chunk summaries are concatenated with no separator.
with open(output_file_path, 'w') as output_file:
    output_file.write(''.join(holder))

## utils.py
import os
import re


def read_file_to_string(file_path, encoding='utf-8'):
    """Return the full text of a file, or a short message if reading fails.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file's content. Errors are never raised to the caller: a missing
        file yields the string ``"File not found."`` and any other failure
        yields ``"An error occurred: <exception text>"``.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        # Deliberate best-effort contract: report the failure as text.
        return f"An error occurred: {e}"


def slugify(value):
    """
    Turns a string into a lowercase, ASCII-only, underscore-separated slug.

    Characters other than word characters, whitespace and dashes are dropped,
    surrounding whitespace is trimmed, each run of dashes/whitespace becomes
    one underscore, and any remaining non-ASCII characters are removed.

    """
    kept = re.sub(r"[^\w\s-]", "", value)
    lowered = kept.strip().lower()
    underscored = re.sub(r"[-\s]+", "_", lowered)
    ascii_only = re.sub(r"[^\x00-\x7f]", "", underscored)
    return ascii_only


def get_filename(file_path):
    """
    Gives back the last path component with its final extension stripped.

    """
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem


def split_file(file_path, chunk_size=4096, output_dir='/tmp'):
    """Split a UTF-8 text file into fixed-size character chunks on disk.

    The file is read whole, cut into consecutive slices of ``chunk_size``
    characters (the last one may be shorter), and each slice is written as
    UTF-8 to ``<output_dir>/<base name of file_path>_chunk_<n>.txt`` with
    ``n`` starting at 1. The chunk files are not deleted by this function.

    Args:
        file_path: Path of the text file to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        output_dir: Existing directory that receives the chunk files.

    Returns:
        The chunk file paths in order; an empty list for an empty file.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Name chunks after the base name only. Embedding the full input path
    # pointed into sub-directories of the output directory that do not exist
    # (or outside it, for paths containing '..').
    base_name = os.path.basename(file_path)
    holder = []
    for i, start in enumerate(range(0, len(content), chunk_size), start=1):
        output_file_name = os.path.join(output_dir, f'{base_name}_chunk_{i}.txt')
        with open(output_file_name, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(content[start:start + chunk_size])
        holder.append(output_file_name)
    return holder
	import os
	import sys

	def clean_duplicates(file_name):
	    """Return the lines of a text file with repeated lines removed.

	    The file is read line by line with line endings kept. Only the first
	    occurrence of each distinct line survives, in original order, and the
	    surviving lines are joined with a single space.

	    Args:
	        file_name: Path of the text file to read.

	    Returns:
	        One string made of the unique lines joined by ``' '``; an empty
	        string for an empty file.
	    """
	    with open(file_name, 'r') as f:
	        # dict keys keep insertion order, so later repeats are dropped in
	        # one pass without scanning a list for every line.
	        unique_lines = dict.fromkeys(f)
	    return ' '.join(unique_lines)

	def process_directory(directory):
	    """Write a de-duplicated copy of every ``.txt`` entry in a directory.

	    Each immediate entry of ``directory`` whose name ends with ``.txt`` is
	    passed to ``clean_duplicates`` and the returned text is written next
	    to it under the same name with ``.txt`` replaced by ``_cleaned.txt``.

	    Args:
	        directory: Path of the directory to scan (not recursive).
	    """
	    for file_name in os.listdir(directory):
	        if not file_name.endswith('.txt'):
	            continue
	        file_path = os.path.join(directory, file_name)
	        cleaned_text = clean_duplicates(file_path)
	        # Swap only the trailing extension; str.replace would also rewrite
	        # any other '.txt' that appears earlier in the path.
	        stem, _ = os.path.splitext(file_path)
	        new_file_name = stem + '_cleaned.txt'
	        with open(new_file_name, 'w') as f:
	            f.write(cleaned_text)

	if __name__ == "__main__":
	    # The directory to clean is the first command-line argument; without
	    # it, print a usage hint and stop with a non-zero exit status.
	    if len(sys.argv) < 2:
	        print("Usage: python script_name.py <directory_path>")
	        sys.exit(1)

	    directory_path = sys.argv[1]
	    process_directory(directory_path)
	import sys
	from llama_cpp import Llama
	from prompt import PromptText
	from utils import read_file_to_string
	from utils import get_filename
	from utils import slugify
	from utils import split_file
	import time

	# The transcript to summarise is the first command-line argument.
	if len(sys.argv) < 2:
	    sys.exit("No input file found")
	input_file_path = sys.argv[1]

	# The summary is written to the current directory as
	# <slugified input stem>_summary.txt.
	# NOTE: the chunk files returned by split_file() are not removed here.
	output_file_path = slugify(get_filename(input_file_path)) + '_summary.txt'

	mixtral = "/home/vincent/development/llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf"
	mistral = "/home/vincent/development/llama.cpp/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

	prompt = PromptText().summary_text
	file_chunks = split_file(input_file_path)

	# Generation settings are identical for every chunk, so define them once.
	content_word_max = 4096   # passed to Llama as n_ctx
	gpu_layers = 14           # passed to Llama as n_gpu_layers
	generated_response = 200  # max_tokens for each chunk's summary

	# Build the model a single time and reuse it for every chunk instead of
	# reloading the model file on each iteration.
	llm = Llama(model_path=mixtral, n_ctx=content_word_max, n_gpu_layers=gpu_layers)

	holder = []
	for item in file_chunks:
	    content = read_file_to_string(item)
	    # Prompt template + chunk text + closing instruction tag.
	    summary_submission = prompt + content + '[/INST]'
	    output = llm(summary_submission, max_tokens=generated_response, stop=["</s>"])
	    holder.append(output['choices'][0]['text'])

	# Drop the reference to the model before writing the result.
	llm = None

	# Per-chunk summaries are concatenated with no separator.
	with open(output_file_path, 'w') as output_file:
	    output_file.write(''.join(holder))
	import os
	import re


	def read_file_to_string(file_path, encoding='utf-8'):
	    """Return the full text of a file, or a short message if reading fails.

	    Args:
	        file_path: Path of the file to read.
	        encoding: Text encoding used to decode the file. Defaults to UTF-8
	            so the result does not depend on the platform's locale.

	    Returns:
	        The file's content. Errors are never raised to the caller: a
	        missing file yields the string ``"File not found."`` and any other
	        failure yields ``"An error occurred: <exception text>"``.
	    """
	    try:
	        with open(file_path, 'r', encoding=encoding) as file:
	            return file.read()
	    except FileNotFoundError:
	        return "File not found."
	    except Exception as e:
	        # Deliberate best-effort contract: report the failure as text.
	        return f"An error occurred: {e}"


	def slugify(value):
	    """
	    Turns a string into a lowercase, ASCII-only, underscore-separated slug.

	    Characters other than word characters, whitespace and dashes are
	    dropped, surrounding whitespace is trimmed, each run of
	    dashes/whitespace becomes one underscore, and any remaining non-ASCII
	    characters are removed.

	    """
	    value = re.sub(r"[^\w\s-]", "", value).strip().lower()
	    value = re.sub(r"[-\s]+", "_", value)
	    return re.sub(r"[^\x00-\x7f]", "", value)


	def get_filename(file_path):
	    """
	    Returns the last path component without its final extension.

	    Only the last extension is removed, so ``archive.tar.gz`` gives
	    ``archive.tar``.

	    """
	    return os.path.splitext(os.path.basename(file_path))[0]


	def split_file(file_path, chunk_size=4096, output_dir='/tmp'):
	    """Split a UTF-8 text file into fixed-size character chunks on disk.

	    The file is read whole, cut into consecutive slices of ``chunk_size``
	    characters (the last one may be shorter), and each slice is written as
	    UTF-8 to ``<output_dir>/<base name of file_path>_chunk_<n>.txt`` with
	    ``n`` starting at 1. The chunk files are not deleted by this function.

	    Args:
	        file_path: Path of the text file to split.
	        chunk_size: Maximum number of characters per chunk; must be positive.
	        output_dir: Existing directory that receives the chunk files.

	    Returns:
	        The chunk file paths in order; an empty list for an empty file.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    with open(file_path, 'r', encoding='utf-8') as file:
	        content = file.read()
	    # Name chunks after the base name only, so an input path with directory
	    # components cannot point into a missing or unintended directory.
	    base_name = os.path.basename(file_path)
	    holder = []
	    for i, start in enumerate(range(0, len(content), chunk_size), start=1):
	        output_file_name = os.path.join(output_dir, f'{base_name}_chunk_{i}.txt')
	        with open(output_file_name, 'w', encoding='utf-8') as chunk_file:
	            chunk_file.write(content[start:start + chunk_size])
	        holder.append(output_file_name)
	    return holder