@yshalsager
Created April 4, 2024 03:53
Steps to create an Arabic dictionary for the AOSP keyboard from shamela.ws
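
For orientation: every script below works on a plain-text combined word list in which each entry looks like ` word=كتاب, f=123`. Judging by which files the scripts read and write, the intended order is roughly: extract words from the shamela.ws EPUBs, count word frequencies into word_list.txt, drop entries containing non-Arabic characters, and finally rescale the frequencies into the 50-254 weight range.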
import sys
from pathlib import Path
import re

# Check if a file name is provided
if len(sys.argv) < 2:
    print("Usage: python calc_weight.py <filename>")
    sys.exit(1)

filename = Path(sys.argv[1])

# Ensure the file exists
if not filename.exists():
    print(f"Error: File {filename} does not exist.")
    sys.exit(1)

# Count the number of lines
with filename.open("r", encoding="utf-8") as file:
    lines = file.readlines()
    count = len(lines)

# Calculate the divider to ensure results between 50 and 254
divider = (count // 205) + 1

# Process the lines and update the weight
new_lines = []
for line in lines:
    count -= 1
    # Replace the weight if it's a word line, otherwise keep the line unchanged
    if "f=" in line:
        weighed = (count // divider) + 50
        name = re.search("=(.*),", line)
        if name and len(name.group(1)) > 1 and not name.group(1).isdigit():
            line = re.sub(r"(\d*[.])?\d+", str(weighed), line)
    new_lines.append(line)

# Write the updated lines back to the file
with filename.open("w", encoding="utf-8") as file:
    file.writelines(new_lines)
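
As a quick sanity check on the scaling above, here is a minimal sketch (not part of the original gist) showing that the first line of the file gets a weight near 254 and the last line gets 50, assuming a hypothetical list of 100,000 entries:

# --- Illustrative sketch, not part of the original gist ---
total = 100_000                  # hypothetical number of lines in the word list
divider = (total // 205) + 1     # 488 for 100,000 lines

def weight(remaining):
    # "remaining" is the decremented count used in the loop above
    return (remaining // divider) + 50

print(weight(total - 1))  # first line -> 254 (highest weight)
print(weight(0))          # last line  -> 50 (lowest weight)

Since the weight depends only on line position, the word list presumably has to be sorted by frequency (highest first) before calc_weight.py runs, so that the most frequent words end up with the higher weights.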
# pip install regex
from pathlib import Path

import regex

# Compile the pattern to match lines with non-Arabic characters in the word field
pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")

# Read the input file
input_file = Path(".").resolve() / "word_list.txt"
lines = input_file.read_text(encoding="utf-8").splitlines()

# Filter out the lines that match the pattern
arabic_lines = (line for line in lines if not pattern.search(line))

# Write the result back to the input file
input_file.write_text("\n".join(arabic_lines), encoding="utf-8")
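
To illustrate what the \P{Arabic} pattern keeps and drops, a small sketch with made-up lines (not part of the original gist):

# --- Illustrative sketch, not part of the original gist ---
import regex

pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")
samples = [
    " word=كتاب, f=120",   # pure Arabic word: no match, the line is kept
    " word=book, f=45",    # Latin letters: matches, the line is dropped
    " word=كتاب2, f=7",    # Arabic plus a digit: matches, the line is dropped
]
for line in samples:
    print(line, "-> dropped" if pattern.search(line) else "-> kept")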
# pip install ebooklib parsel pyarabic
from pathlib import Path

import ebooklib
from ebooklib import epub
from parsel import Selector
from pyarabic import araby as pyarabic


# Function to extract text from an EPUB item using parsel
def extract_text_from_item(item):
    selector = Selector(text=item.get_content().decode())
    text = " ".join(selector.css("*::text").getall())
    return text.strip()


# Function to process an EPUB file
def process_epub(file_path):
    # Check if the output file already exists
    output_file_path = file_path.with_suffix(".txt")
    # if output_file_path.exists():
    #     print(f"Output file already exists for {file_path}. Skipping.")
    #     return  # Exit early
    # Load the EPUB file
    book = epub.read_epub(str(file_path))
    # Extract text from each item in the EPUB using parsel
    texts = []
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        text = extract_text_from_item(item)
        texts.append(text)
    # Combine all texts into a single string
    full_text = " ".join(texts)
    # Remove Arabic tashkeel
    full_text = pyarabic.strip_tashkeel(full_text)
    # Tokenize the text into words using pyarabic
    words = pyarabic.tokenize(full_text)
    # Write each word to the output file
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        for word in words:
            if word.isalpha():
                output_file.write(word + "\n")


# Iterate over subfolders in the current folder looking for books
for root in Path(".").rglob("*.epub"):
    print(root)
    process_epub(root)
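
For reference, a tiny sketch (not from the gist) of what the two pyarabic calls above do; the outputs in the comments are what I would expect, assuming current pyarabic behaviour:

# --- Illustrative sketch, not part of the original gist ---
from pyarabic import araby

text = "كِتَابٌ جَدِيدٌ"
plain = araby.strip_tashkeel(text)  # diacritics removed, e.g. "كتاب جديد"
words = araby.tokenize(plain)       # e.g. ['كتاب', 'جديد']
print(plain, words)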
# pip install tqdm
import argparse
from collections import Counter

from tqdm import tqdm


def count_lines(filename):
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        return sum(1 for _ in file)


def word_frequency(filename, total_lines):
    frequency = Counter()
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        for line in tqdm(file, total=total_lines, desc="Processing file", unit="lines"):
            words = line.lower().split()
            frequency.update(words)
    return frequency


def write_frequency_to_file(frequency, output_filename):
    with open(output_filename, 'w', encoding='utf-8') as file:
        for word, count in frequency.items():
            file.write(f" word={word}, f={count}\n")


def main():
    parser = argparse.ArgumentParser(description='Calculate word frequency and write to a file.')
    parser.add_argument('input_file', type=str, help='The input file to read words from.')
    parser.add_argument('output_file', type=str, help='The output file to write word frequencies to.')
    args = parser.parse_args()
    total_lines = count_lines(args.input_file)
    frequency = word_frequency(args.input_file, total_lines)
    write_frequency_to_file(frequency, args.output_file)


if __name__ == "__main__":
    main()
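
Run over the word files produced by the EPUB step, this script emits the ` word=…, f=…` list consumed by the filtering and weighting scripts above. A hedged usage sketch, calling the functions directly (the file name words.txt is hypothetical; word_list.txt matches the filter script above):

# --- Illustrative usage sketch, not part of the original gist ---
from pathlib import Path

Path("words.txt").write_text("كتاب\nقلم\nكتاب\n", encoding="utf-8")
total = count_lines("words.txt")
freq = word_frequency("words.txt", total)
write_frequency_to_file(freq, "word_list.txt")
print(Path("word_list.txt").read_text(encoding="utf-8"))
# Expected output (insertion order of the Counter):
#  word=كتاب, f=2
#  word=قلم, f=1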