This Gist gathers contextual resources located in a directory and writes them as text chunks to an output file that can be fed into ChatGPT.
"""
Import Libraries: The script begins by importing several Python libraries required for processing different types of files, including os, fnmatch, docx, pptx, PyPDF2, and openpyxl.
Chunk Size Configuration: It defines a constant CHUNK_SIZE_IN_WORDS, which determines the maximum number of words in each text chunk. This value can be modified as needed to control the size of the output text chunks.
File Content Extraction Functions: The script defines functions to extract text content from different file types:
  - read_word_file: Extracts text from .docx files.
  - read_pptx_file: Extracts text from .pptx files, including text from slides and shapes.
  - read_pdf_file: Extracts text from .pdf files using PyPDF2.
  - read_excel_file: Extracts text from .xlsx files by iterating through cells in each worksheet.
Write Chunk Function: Defines a function write_chunk that writes a text chunk to the output file, preceded by the source file path; when the chunk is a continuation of the previous one, " chunk" is appended to the path.
Count Files Function: Defines a function count_files to count the total number of files in a given directory and its subdirectories. It allows for specifying a list of patterns to ignore certain files or directories.
Extract Text from Directory Function: The main function extract_text_from_directory takes three parameters:
  - directory_path: The path to the directory to start the extraction process.
  - ignore_list: A list of file patterns to ignore during extraction.
  - output_file_path: The path to the output text file where the extracted content will be saved in chunks.
Inside this function, it:
  - Counts the total files to be processed using count_files.
  - Iterates through the directory and its subdirectories using os.walk.
  - For each file, determines its type based on the file extension and extracts text content using the appropriate function.
  - Splits the extracted text into chunks of the specified size (CHUNK_SIZE_IN_WORDS) and writes them to the output file, with each chunk preceded by the file path.
  - Keeps track of processed files and displays progress.
Text File Check Function: Defines a function is_text to check if a file is a plain text file. It attempts to read the file and returns True if successful, indicating that the file contains text.
Load Ignore List Function: Loads an ignore list from a file (default filename is .ignoreFiles) and returns a list of patterns to be ignored during extraction.
Main Execution Block: The script executes when run directly (not imported as a module). It takes user input for the directory path to start the extraction process, calculates the output file name based on the directory name, loads the ignore list, counts the total files to be processed, and then calls extract_text_from_directory to perform the extraction. Finally, it displays a message indicating where the extracted content has been written.
In summary, this script is a text extraction tool that processes files of various formats within a directory, splits the extracted text into manageable chunks, and writes them to an output file with information about the source file. It is useful for creating a text corpus from diverse sources for various natural language processing (NLP) tasks.
"""
import os
import fnmatch
from docx import Document
from pptx import Presentation
import PyPDF2
import openpyxl

CHUNK_SIZE_IN_WORDS = 100  # Modify as needed

def read_word_file(file_path):
    doc = Document(file_path)
    return ' '.join([p.text for p in doc.paragraphs])

def read_pptx_file(file_path):
    prs = Presentation(file_path)
    # Slides may lack a title, so just gather text from every shape that has it.
    return ' '.join(shape.text for slide in prs.slides
                    for shape in slide.shapes if hasattr(shape, "text"))

def read_pdf_file(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        return ' '.join(page.extract_text() or '' for page in reader.pages)

def read_excel_file(file_path):
    wb = openpyxl.load_workbook(file_path)
    all_text = []
    for sheet in wb:
        for row in sheet.iter_rows():
            for cell in row:
                if cell.value is not None:  # skip empty cells instead of writing "None"
                    all_text.append(str(cell.value))
    return ' '.join(all_text)

def write_chunk(output_file, file_path, chunk_content, is_continuation):
    output_file.write('\n')
    if is_continuation:
        output_file.write(f"{file_path} chunk\n")
    else:
        output_file.write(f"{file_path}\n")
    output_file.write(chunk_content + '\n\n')
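# Illustrative output layout produced by write_chunk (the path and wording are
# hypothetical): each chunk is preceded by its source path, and continuation
# chunks get a " chunk" suffix, e.g.
#
#   /repo/docs/notes.docx
#   first hundred words of the document ...
#
#   /repo/docs/notes.docx chunk
#   the next hundred words ...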

def count_files(directory_path, ignore_list):
    count = 0
    for subdir, _, files in os.walk(directory_path):
        # If a subdir matches an ignore pattern, skip it
        if any(fnmatch.fnmatch(subdir, os.path.join(directory_path, pattern)) for pattern in ignore_list):
            continue
        count += len(files)
    return count

def extract_text_from_directory(directory_path, ignore_list, output_file_path):
    total_files = count_files(directory_path, ignore_list)
    processed_files = 0
    with open(output_file_path, 'w') as output_file:
        for subdir, _, files in os.walk(directory_path):
            if any(fnmatch.fnmatch(subdir, os.path.join(directory_path, pattern)) for pattern in ignore_list):
                continue
            for file in files:
                if any(fnmatch.fnmatch(file, pattern) for pattern in ignore_list):
                    continue
                file_path = os.path.join(subdir, file)
                contents = ""
                if file_path.endswith(".docx"):
                    contents = read_word_file(file_path)
                elif file_path.endswith(".pptx"):
                    contents = read_pptx_file(file_path)
                elif file_path.endswith(".pdf"):
                    contents = read_pdf_file(file_path)
                elif file_path.endswith(".xlsx"):
                    contents = read_excel_file(file_path)
                elif is_text(file_path):
                    with open(file_path, 'r', errors='replace') as input_file:
                        contents = input_file.read()
                if contents:
                    words = contents.split()
                    chunks = [words[i:i + CHUNK_SIZE_IN_WORDS] for i in range(0, len(words), CHUNK_SIZE_IN_WORDS)]
                    for idx, chunk in enumerate(chunks):
                        is_continuation = idx > 0
                        write_chunk(output_file, file_path, ' '.join(chunk), is_continuation)
                processed_files += 1
                progress_percent = (processed_files / total_files) * 100
                print(f"Processed {processed_files}/{total_files} files. Progress: {progress_percent:.2f}%")

def is_text(file_path):
    try:
        with open(file_path, 'r', errors='replace') as check_file:
            check_file.read()
        return True
    except Exception:
        return False

def load_ignore_list(ignore_file_path=".ignoreFiles"):
    # An absent ignore file simply means nothing is ignored.
    if not os.path.isfile(ignore_file_path):
        return []
    with open(ignore_file_path, 'r') as f:
        # Remove comments, skip blank lines
        return [line.strip() for line in f if line.strip() and not line.startswith('#')]
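# A minimal sketch of what a .ignoreFiles file might contain (fnmatch-style
# glob patterns; the entries below are examples, not part of the Gist):
#
#   # dependencies and build artifacts
#   .git
#   node_modules
#   __pycache__
#   *.log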

if __name__ == "__main__":
    directory_path = input("Enter the directory path of the code repository: ")
    repo_name = os.path.basename(directory_path)
    output_file_name = f"{repo_name}_ChatGptContextBytes.txt"
    ignore_list = load_ignore_list(os.path.join(directory_path, ".ignoreFiles"))
    print(f"Total files to process: {count_files(directory_path, ignore_list)}")
    extract_text_from_directory(directory_path, ignore_list, output_file_name)
    print(f"Contents written to {output_file_name} in chunks of {CHUNK_SIZE_IN_WORDS} words.")