@bigsnarfdude
Last active February 23, 2024 23:20
process_transcripts.py
import os
import sys


def clean_duplicates(file_name):
    """Return the file's text with duplicate lines removed, order preserved."""
    seen = set()
    cleaned = []
    with open(file_name, 'r') as f:
        for line in f:
            if line not in seen:  # set lookup keeps this linear in file size
                seen.add(line)
                cleaned.append(line)
    # Lines keep their trailing '\n', so join with no separator.
    return ''.join(cleaned)
def process_directory(directory):
    """Write a deduplicated *_cleaned.txt copy of every .txt file in directory."""
    for file_name in os.listdir(directory):
        # Skip files this script already produced so reruns don't stack suffixes.
        if file_name.endswith('.txt') and not file_name.endswith('_cleaned.txt'):
            file_path = os.path.join(directory, file_name)
            cleaned_text = clean_duplicates(file_path)
            new_file_name = file_path.replace('.txt', '_cleaned.txt')
            with open(new_file_name, 'w') as f:
                f.write(cleaned_text)
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python process_transcripts.py <directory_path>")
        sys.exit(1)
    directory_path = sys.argv[1]
    process_directory(directory_path)
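
A quick way to sanity-check the dedup pass; the demo path below is hypothetical and just for illustration:

# Illustrative check of clean_duplicates (demo path is hypothetical).
from process_transcripts import clean_duplicates

with open('/tmp/demo.txt', 'w') as f:
    f.write('hello\nhello\nworld\n')
print(clean_duplicates('/tmp/demo.txt'))  # prints 'hello\nworld\n' -- duplicate dropped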
# Summarization driver: splits a transcript into chunks and summarizes each
# chunk with a local llama.cpp model, concatenating the results.
import sys

from llama_cpp import Llama

from prompt import PromptText
from utils import get_filename, read_file_to_string, slugify, split_file
if len(sys.argv) < 2:
    sys.exit("No input file found")
input_file_path = sys.argv[1]

# Output name: slugified input filename plus a _summary.txt suffix.
# The chunk files land in /tmp but are never deleted.
output_file_path = slugify(get_filename(input_file_path)) + '_summary.txt'

# Local GGUF model paths; mixtral is used below, mistral is a smaller alternative.
mixtral = "/home/vincent/development/llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf"
mistral = "/home/vincent/development/llama.cpp/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

prompt = PromptText().summary_text
file_chunks = split_file(input_file_path)
content_word_max = 4096   # context window (n_ctx) in tokens
gpu_layers = 14           # layers to offload to the GPU
generated_response = 200  # max tokens per chunk summary

# Load the model once and reuse it for every chunk; constructing a Llama
# inside the loop would reload the weights on each iteration.
llm = Llama(model_path=mixtral, n_ctx=content_word_max, n_gpu_layers=gpu_layers)

holder = []
for item in file_chunks:
    content = read_file_to_string(item)
    summary_submission = prompt + content + '[/INST]'
    output = llm(summary_submission, max_tokens=generated_response, stop=["</s>"])
    holder.append(output['choices'][0]['text'])
llm = None  # release the model before writing output

with open(output_file_path, 'w') as output_file:
    output_file.write(''.join(holder))
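
The prompt module isn't included in the gist. A minimal stand-in consistent with how it is used above might look like the sketch below; the exact wording of summary_text is an assumption, and only the [INST] ... [/INST] framing is implied by the driver appending '[/INST]' and stopping on '</s>' (the Mistral/Mixtral instruct format).

# prompt.py -- hypothetical stand-in; the real module is not part of this gist.
class PromptText:
    def __init__(self):
        # The driver appends the transcript text and the closing [/INST] tag,
        # so the prompt only needs to open the instruction block.
        self.summary_text = (
            "[INST] Summarize the following transcript in a few "
            "concise sentences:\n\n"
        )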
utils.py
import os
import re


def read_file_to_string(file_path):
    """Read a file and return its contents; returns an error message string on failure."""
    try:
        with open(file_path, 'r') as file:
            return file.read()
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return f"An error occurred: {e}"
def slugify(value):
    """
    Normalizes a string: lowercases it, strips characters that aren't
    alphanumerics, underscores, hyphens, or whitespace, converts runs of
    dashes and whitespace to single underscores, and drops non-ASCII bytes.
    """
    value = re.sub(r"[^\w\s-]", "", value).strip().lower()
    value = re.sub(r"[-\s]+", "_", value)
    return re.sub(r"[^\x00-\x7f]", "", value)
def get_filename(file_path):
    """Return the base filename without its extension."""
    return os.path.splitext(os.path.basename(file_path))[0]
def split_file(file_path, chunk_size=4096):
    """Split a file into chunk_size-character pieces under /tmp and return their paths."""
    holder = []
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    base_name = os.path.basename(file_path)  # keep directory parts out of the /tmp name
    for i, chunk in enumerate(chunks):
        output_file_name = f'/tmp/{base_name}_chunk_{i + 1}.txt'
        with open(output_file_name, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(chunk)
        holder.append(output_file_name)
    return holder
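
A couple of hypothetical calls showing how the helpers compose; the file names here are illustrative only and don't exist in the gist:

# Illustrative only; these paths are hypothetical.
from utils import get_filename, slugify, split_file

chunks = split_file('talk_transcript.txt')          # ['/tmp/talk_transcript.txt_chunk_1.txt', ...]
stem = slugify(get_filename('My Talk (2024).txt'))  # 'my_talk_2024'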