This Gist gathers contextual resources located in a directory and writes them as text chunks to an output file that can be fed into ChatGPT.
"""
Import Libraries: The script begins by importing several Python libraries required for processing different types of files, including os, fnmatch, docx, pptx, PyPDF2, and openpyxl.
Chunk Size Configuration: It defines a constant CHUNK_SIZE_IN_WORDS, which determines the maximum number of words in each text chunk. This value can be modified as needed to control the size of the output text chunks.
File Content Extraction Functions: The script defines functions to extract text content from different file types:
  - read_word_file: Extracts text from .docx files.
  - read_pptx_file: Extracts text from .pptx files, including text from slides and shapes.
  - read_pdf_file: Extracts text from .pdf files using PyPDF2.
  - read_excel_file: Extracts text from .xlsx files by iterating through cells in each worksheet.
Write Chunk Function: Defines a function write_chunk that writes a text chunk to the output file, preceded by the source file path; when the chunk is a continuation of the previous one, " chunk" is appended to the path.
Count Files Function: Defines a function count_files to count the total number of files in a given directory and its subdirectories. It allows for specifying a list of patterns to ignore certain files or directories.
Extract Text from Directory Function: The main function extract_text_from_directory takes three parameters:
  - directory_path: The path to the directory to start the extraction process.
  - ignore_list: A list of file patterns to ignore during extraction.
  - output_file_path: The path to the output text file where the extracted content will be saved in chunks.
Inside this function, it:
  - Counts the total files to be processed using count_files.
  - Iterates through the directory and its subdirectories using os.walk.
  - For each file, determines its type based on the file extension and extracts text content using the appropriate function.
  - Splits the extracted text into chunks of the specified size (CHUNK_SIZE_IN_WORDS) and writes them to the output file, with each chunk preceded by the file path.
  - Keeps track of processed files and displays progress.
Text File Check Function: Defines a function is_text to check if a file is a plain text file. It attempts to read the file and returns True if successful, indicating that the file contains text.
Load Ignore List Function: Loads an ignore list from a file (default filename is .ignoreFiles) and returns a list of patterns to be ignored during extraction.
Main Execution Block: The script executes when run directly (not imported as a module). It takes user input for the directory path to start the extraction process, calculates the output file name based on the directory name, loads the ignore list, counts the total files to be processed, and then calls extract_text_from_directory to perform the extraction. Finally, it displays a message indicating where the extracted content has been written.
In summary, this script is a text extraction tool that processes files of various formats within a directory, splits the extracted text into manageable chunks, and writes them to an output file with information about the source file. It is useful for creating a text corpus from diverse sources for various natural language processing (NLP) tasks.
"""
import os
import fnmatch
from docx import Document
from pptx import Presentation
import PyPDF2
import openpyxl

CHUNK_SIZE_IN_WORDS = 100  # Modify as needed

def read_word_file(file_path):
    doc = Document(file_path)
    return ' '.join([p.text for p in doc.paragraphs])

def read_pptx_file(file_path):
    prs = Presentation(file_path)
    # Slides may lack a title, so just gather text from every shape that has it.
    return ' '.join(shape.text for slide in prs.slides
                    for shape in slide.shapes if hasattr(shape, "text"))

def read_pdf_file(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        return ' '.join(page.extract_text() or '' for page in reader.pages)

def read_excel_file(file_path):
    wb = openpyxl.load_workbook(file_path)
    all_text = []
    for sheet in wb:
        for row in sheet.iter_rows():
            for cell in row:
                if cell.value is not None:  # skip empty cells instead of writing "None"
                    all_text.append(str(cell.value))
    return ' '.join(all_text)

def write_chunk(output_file, file_path, chunk_content, is_continuation):
    output_file.write('\n')
    if is_continuation:
        output_file.write(f"{file_path} chunk\n")
    else:
        output_file.write(f"{file_path}\n")
    output_file.write(chunk_content + '\n\n')
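# Illustrative output layout produced by write_chunk (the path and wording are
# hypothetical): each chunk is preceded by its source path, and continuation
# chunks get a " chunk" suffix, e.g.
#
#   /repo/docs/notes.docx
#   first hundred words of the document ...
#
#   /repo/docs/notes.docx chunk
#   the next hundred words ...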

def count_files(directory_path, ignore_list):
    count = 0
    for subdir, _, files in os.walk(directory_path):
        # If a subdir matches an ignore pattern, skip it
        if any(fnmatch.fnmatch(subdir, os.path.join(directory_path, pattern)) for pattern in ignore_list):
            continue
        count += len(files)
    return count

def extract_text_from_directory(directory_path, ignore_list, output_file_path):
    total_files = count_files(directory_path, ignore_list)
    processed_files = 0
    with open(output_file_path, 'w') as output_file:
        for subdir, _, files in os.walk(directory_path):
            if any(fnmatch.fnmatch(subdir, os.path.join(directory_path, pattern)) for pattern in ignore_list):
                continue
            for file in files:
                if any(fnmatch.fnmatch(file, pattern) for pattern in ignore_list):
                    continue
                file_path = os.path.join(subdir, file)
                contents = ""
                if file_path.endswith(".docx"):
                    contents = read_word_file(file_path)
                elif file_path.endswith(".pptx"):
                    contents = read_pptx_file(file_path)
                elif file_path.endswith(".pdf"):
                    contents = read_pdf_file(file_path)
                elif file_path.endswith(".xlsx"):
                    contents = read_excel_file(file_path)
                elif is_text(file_path):
                    with open(file_path, 'r', errors='replace') as input_file:
                        contents = input_file.read()
                if contents:
                    words = contents.split()
                    chunks = [words[i:i + CHUNK_SIZE_IN_WORDS] for i in range(0, len(words), CHUNK_SIZE_IN_WORDS)]
                    for idx, chunk in enumerate(chunks):
                        is_continuation = idx > 0
                        write_chunk(output_file, file_path, ' '.join(chunk), is_continuation)
                processed_files += 1
                progress_percent = (processed_files / total_files) * 100
                print(f"Processed {processed_files}/{total_files} files. Progress: {progress_percent:.2f}%")

def is_text(file_path):
    try:
        with open(file_path, 'r', errors='replace') as check_file:
            check_file.read()
        return True
    except Exception:
        return False

def load_ignore_list(ignore_file_path=".ignoreFiles"):
    # An absent ignore file simply means nothing is ignored.
    if not os.path.isfile(ignore_file_path):
        return []
    with open(ignore_file_path, 'r') as f:
        # Remove comments, skip blank lines
        return [line.strip() for line in f if line.strip() and not line.startswith('#')]
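# A minimal sketch of what a .ignoreFiles file might contain (fnmatch-style
# glob patterns; the entries below are examples, not part of the Gist):
#
#   # dependencies and build artifacts
#   .git
#   node_modules
#   __pycache__
#   *.log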

if __name__ == "__main__":
    directory_path = input("Enter the directory path of the code repository: ")
    repo_name = os.path.basename(directory_path)
    output_file_name = f"{repo_name}_ChatGptContextBytes.txt"
    ignore_list = load_ignore_list(os.path.join(directory_path, ".ignoreFiles"))
    print(f"Total files to process: {count_files(directory_path, ignore_list)}")
    extract_text_from_directory(directory_path, ignore_list, output_file_name)
    print(f"Contents written to {output_file_name} in chunks of {CHUNK_SIZE_IN_WORDS} words.")