Skip to content

Instantly share code, notes, and snippets.

@Rajan-sust
Last active July 11, 2023 04:37
Show Gist options
  • Save Rajan-sust/8015c3aa73f72efd9f3b57eab029eb0f to your computer and use it in GitHub Desktop.
Save Rajan-sust/8015c3aa73f72efd9f3b57eab029eb0f to your computer and use it in GitHub Desktop.
from collections import defaultdict
def get_frequency(char_count, start, end):
    """Print the count of each character with a code point in [start, end).

    One line is written to stdout per character, in ascending code-point
    order: the character, a tab, then its count.

    Args:
        char_count: Mapping from single-character strings to counts. It is
            indexed with ``char_count[letter]``, so a ``defaultdict`` gains
            an entry for every letter it had not seen yet, whereas a plain
            ``dict`` raises ``KeyError`` and that letter is skipped.
        start: First code point to report (inclusive).
        end: Code point at which to stop (exclusive).
    """
    for value in range(start, end):
        letter = chr(value)
        try:
            print(f'{letter}\t{char_count[letter]}')
        except (KeyError, UnicodeEncodeError):
            # KeyError: the mapping has no entry for this letter.
            # UnicodeEncodeError: stdout cannot encode this letter.
            # Skip the letter and keep reporting the rest; any other error
            # (including KeyboardInterrupt) is allowed to propagate.
            continue
if __name__ == '__main__':
    # Count how often each character of the Bengali Unicode block occurs in
    # the dump file, then print the counts three ways: independent vowels,
    # consonants, and every counted character sorted by descending frequency.

    # The file is opened in text mode, so read() takes a number of decoded
    # characters (not bytes) per chunk.
    chunk_size = 4096 * 4096

    # Bengali Unicode block: U+0980..U+09FF (decimal 2432..2559).
    bengali_first, bengali_last = 0x0980, 0x09FF
    # Independent vowels: U+0985..U+0994.
    vowel_first, vowel_last = 0x0985, 0x0994
    # Consonants: U+0995..U+09B9. The last one, U+09B9 (HA), must be included.
    consonant_first, consonant_last = 0x0995, 0x09B9

    char_count = defaultdict(int)
    with open('bnwiki-latest-pages-articles.xml', mode='r', encoding='utf-8') as file:
        # Read in chunks so the whole dump never has to fit in memory.
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break
            for ch in chunk:
                if bengali_first <= ord(ch) <= bengali_last:
                    char_count[ch] += 1

    # get_frequency takes a half-open range, hence the "+ 1" on each end.
    # Both ranges contain a few reserved code points; because char_count is
    # a defaultdict they are reported with a count of 0.
    get_frequency(char_count, vowel_first, vowel_last + 1)
    get_frequency(char_count, consonant_first, consonant_last + 1)

    # Full table, most frequent character first.
    sorted_dict = sorted(char_count.items(), key=lambda x: x[1], reverse=True)
    print('----------------------------')
    for key, value in sorted_dict:
        print(f'{key}\t{value}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment