@yshalsager
Created April 4, 2024 03:53
Steps to create an Arabic dictionary for the AOSP keyboard from shamela.ws
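
For orientation: every script below works on a plain-text combined word list in which each entry looks like ` word=كتاب, f=123`. Judging by which files the scripts read and write, the intended order is roughly: extract words from the shamela.ws EPUBs, count word frequencies into word_list.txt, drop entries containing non-Arabic characters, and finally rescale the frequencies into the 50-254 weight range.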
import sys
from pathlib import Path
import re

# Check if a file name is provided
if len(sys.argv) < 2:
    print("Usage: python calc_weight.py <filename>")
    sys.exit(1)

filename = Path(sys.argv[1])

# Ensure the file exists
if not filename.exists():
    print(f"Error: File {filename} does not exist.")
    sys.exit(1)

# Count the number of lines
with filename.open("r", encoding="utf-8") as file:
    lines = file.readlines()
    count = len(lines)

# Calculate the divider to ensure results between 50 and 254
divider = (count // 205) + 1

# Process the lines and update the weight
new_lines = []
for line in lines:
    count -= 1
    # Replace the weight if it's a word line, otherwise keep the line unchanged
    if "f=" in line:
        weighed = (count // divider) + 50
        name = re.search("=(.*),", line)
        if name and len(name.group(1)) > 1 and not name.group(1).isdigit():
            line = re.sub(r"(\d*[.])?\d+", str(weighed), line)
    new_lines.append(line)

# Write the updated lines back to the file
with filename.open("w", encoding="utf-8") as file:
    file.writelines(new_lines)
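
As a quick sanity check on the scaling above, here is a minimal sketch (not part of the original gist) showing that the first line of the file gets a weight near 254 and the last line gets 50, assuming a hypothetical list of 100,000 entries:

# --- Illustrative sketch, not part of the original gist ---
total = 100_000                  # hypothetical number of lines in the word list
divider = (total // 205) + 1     # 488 for 100,000 lines

def weight(remaining):
    # "remaining" is the decremented count used in the loop above
    return (remaining // divider) + 50

print(weight(total - 1))  # first line -> 254 (highest weight)
print(weight(0))          # last line  -> 50 (lowest weight)

Since the weight depends only on line position, the word list presumably has to be sorted by frequency (highest first) before calc_weight.py runs, so that the most frequent words end up with the higher weights.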
# pip install regex
from pathlib import Path

import regex

# Compile the pattern to match lines with non-Arabic characters in the word field
pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")

# Read the input file
input_file = Path(".").resolve() / "word_list.txt"
lines = input_file.read_text(encoding="utf-8").splitlines()

# Filter out the lines that match the pattern
arabic_lines = (line for line in lines if not pattern.search(line))

# Write the result back to the input file
input_file.write_text("\n".join(arabic_lines), encoding="utf-8")
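
To illustrate what the \P{Arabic} pattern keeps and drops, a small sketch with made-up lines (not part of the original gist):

# --- Illustrative sketch, not part of the original gist ---
import regex

pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")
samples = [
    " word=كتاب, f=120",   # pure Arabic word: no match, the line is kept
    " word=book, f=45",    # Latin letters: matches, the line is dropped
    " word=كتاب2, f=7",    # Arabic plus a digit: matches, the line is dropped
]
for line in samples:
    print(line, "-> dropped" if pattern.search(line) else "-> kept")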
# pip install ebooklib parsel pyarabic
from pathlib import Path

import ebooklib
from ebooklib import epub
from parsel import Selector
from pyarabic import araby as pyarabic


# Function to extract text from an EPUB item using parsel
def extract_text_from_item(item):
    selector = Selector(text=item.get_content().decode())
    text = " ".join(selector.css("*::text").getall())
    return text.strip()


# Function to process an EPUB file
def process_epub(file_path):
    # Check if the output file already exists
    output_file_path = file_path.with_suffix(".txt")
    # if output_file_path.exists():
    #     print(f"Output file already exists for {file_path}. Skipping.")
    #     return  # Exit early
    # Load the EPUB file
    book = epub.read_epub(str(file_path))
    # Extract text from each item in the EPUB using parsel
    texts = []
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        text = extract_text_from_item(item)
        texts.append(text)
    # Combine all texts into a single string
    full_text = " ".join(texts)
    # Remove Arabic tashkeel
    full_text = pyarabic.strip_tashkeel(full_text)
    # Tokenize the text into words using pyarabic
    words = pyarabic.tokenize(full_text)
    # Write each word to the output file
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        for word in words:
            if word.isalpha():
                output_file.write(word + "\n")


# Iterate over subfolders in the current folder looking for books
for root in Path(".").rglob("*.epub"):
    print(root)
    process_epub(root)
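
For reference, a tiny sketch (not from the gist) of what the two pyarabic calls above do; the outputs in the comments are what I would expect, assuming current pyarabic behaviour:

# --- Illustrative sketch, not part of the original gist ---
from pyarabic import araby

text = "كِتَابٌ جَدِيدٌ"
plain = araby.strip_tashkeel(text)  # diacritics removed, e.g. "كتاب جديد"
words = araby.tokenize(plain)       # e.g. ['كتاب', 'جديد']
print(plain, words)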
# pip install tqdm
import argparse
from collections import Counter

from tqdm import tqdm


def count_lines(filename):
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        return sum(1 for _ in file)


def word_frequency(filename, total_lines):
    frequency = Counter()
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        for line in tqdm(file, total=total_lines, desc="Processing file", unit="lines"):
            words = line.lower().split()
            frequency.update(words)
    return frequency


def write_frequency_to_file(frequency, output_filename):
    with open(output_filename, 'w', encoding='utf-8') as file:
        for word, count in frequency.items():
            file.write(f" word={word}, f={count}\n")


def main():
    parser = argparse.ArgumentParser(description='Calculate word frequency and write to a file.')
    parser.add_argument('input_file', type=str, help='The input file to read words from.')
    parser.add_argument('output_file', type=str, help='The output file to write word frequencies to.')
    args = parser.parse_args()
    total_lines = count_lines(args.input_file)
    frequency = word_frequency(args.input_file, total_lines)
    write_frequency_to_file(frequency, args.output_file)


if __name__ == "__main__":
    main()
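
Run over the word files produced by the EPUB step, this script emits the ` word=…, f=…` list consumed by the filtering and weighting scripts above. A hedged usage sketch, calling the functions directly (the file name words.txt is hypothetical; word_list.txt matches the filter script above):

# --- Illustrative usage sketch, not part of the original gist ---
from pathlib import Path

Path("words.txt").write_text("كتاب\nقلم\nكتاب\n", encoding="utf-8")
total = count_lines("words.txt")
freq = word_frequency("words.txt", total)
write_frequency_to_file(freq, "word_list.txt")
print(Path("word_list.txt").read_text(encoding="utf-8"))
# Expected output (insertion order of the Counter):
#  word=كتاب, f=2
#  word=قلم, f=1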