Created
February 22, 2024 11:28
-
-
Save ehzawad/f8ac64d65eb96fbd93c47275d78c5e88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re
from collections import Counter
def is_bangla_word(word: str) -> bool:
    """Return True if *word* contains at least one Bangla character.

    The check covers the Unicode Bengali block, U+0980 through U+09FF, so
    Bangla digits and signs in that block count as well as letters. One
    matching character anywhere in *word* is enough; an empty string
    yields False.
    """
    # Raw string: the \uXXXX escapes are resolved by the regex engine.
    return re.search(r'[\u0980-\u09FF]', word) is not None
def extract_bangla_words(line: str) -> list:
    """Return the Bangla words found in *line*, in order of appearance.

    A word is a maximal run of characters from the Unicode Bengali block
    (U+0980 through U+09FF). Everything else (Latin text, ASCII digits,
    whitespace, punctuation outside the block) acts as a separator and is
    dropped. Returns an empty list when *line* has no Bangla characters.
    """
    # ZWNJ (U+200C) and ZWJ (U+200D) are used inside Bangla words to control
    # conjunct shaping. Keep them when they sit between Bangla characters so
    # such a word is not split in two; joiners at the edge of a run are not
    # part of the word.
    pattern = r'[\u0980-\u09FF]+(?:[\u200C\u200D]+[\u0980-\u09FF]+)*'
    return re.findall(pattern, line)
def count_word_frequencies(file_path):
    """Count how often each Bangla word occurs in a UTF-8 text file.

    The file is read line by line, so it is never loaded into memory as a
    whole. Words are obtained from ``extract_bangla_words``.

    Args:
        file_path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        the order the words were first seen.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    counts = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Every token returned by extract_bangla_words already consists
            # of Bangla characters, so no per-word re-check is needed.
            counts.update(extract_bangla_words(line))
    # Hand back a plain dict so a missing key still raises KeyError.
    return dict(counts)
def write_frequencies_to_csv(word_counts, output_file_path):
    """Write word frequencies to a UTF-8 CSV file.

    The file starts with a ``Word,Frequency`` header row, followed by one
    row per entry of *word_counts* in the mapping's iteration order. An
    existing file at *output_file_path* is overwritten.

    Args:
        word_counts: Mapping of word to occurrence count.
        output_file_path: Destination path of the CSV file.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Word', 'Frequency'])  # Writing the header
        writer.writerows(word_counts.items())
# Example usage
file_path = 'your_text_file.txt'  # Replace with the path to your text file
output_file_path = 'word_frequencies.csv'


def main():
    """Count the Bangla words in `file_path` and save them to `output_file_path`."""
    word_counts = count_word_frequencies(file_path)
    write_frequencies_to_csv(word_counts, output_file_path)
    print(f"Word frequencies have been written to {output_file_path}.")


# Run only when executed as a script, so importing this module does not try
# to open the placeholder input file.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment