from tqdm import tqdm
from collections import OrderedDict
import csv
def read_names(file_path):
    """Load names from a UTF-8 text file, one name per line.

    Every line is stripped of surrounding whitespace, and lines that are
    empty after stripping are skipped. Progress is reported via ``tqdm``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with the stripped, non-empty lines in file order.
    """
    with open(file_path, encoding="utf-8") as source:
        stripped_lines = (raw_line.strip() for raw_line in tqdm(source))
        # Materialize inside the ``with`` block: the generator reads lazily.
        return [entry for entry in stripped_lines if entry]
def write_names(names, file_path):
    """Write names to a UTF-8 text file, one name per line.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    Progress is reported via ``tqdm``.

    Args:
        names: Iterable of strings to write.
        file_path: Path of the output file.
    """
    with open(file_path, "w", encoding="utf-8") as sink:
        sink.writelines(entry + "\n" for entry in tqdm(names))
def get_characters(names):
    """Return every distinct character used in ``names``, sorted.

    Args:
        names: Iterable of strings.

    Returns:
        A list of single-character strings in ascending code point order,
        each appearing once.
    """
    # A set gives O(1) membership; a list would be rescanned for every
    # character of every name.
    seen = set()
    for name in tqdm(names):
        seen.update(name)
    return sorted(seen)
def get_names_contain_characters(names, characters):
    """Select the names that contain at least one of ``characters``.

    Matching uses the ``in`` operator, so each entry of ``characters`` is
    treated as a substring of the name.

    Args:
        names: Iterable of strings to search.
        characters: Iterable of strings to look for. It is iterated once per
            name, so it must be re-iterable (e.g. a list or a string).

    Returns:
        A list of the matching names in input order. A name is included at
        most once per occurrence in ``names``, no matter how many of the
        requested characters it contains.
    """
    result = []
    for name in tqdm(names):
        # ``any`` stops at the first hit, so a name matching several
        # characters is not appended more than once.
        if any(character in name for character in characters):
            result.append(name)
    return result
def replace_non_printing_characters(names):
    """Replace or remove a fixed set of non-printing characters in each name.

    TAB and U+200C become a single space; U+00A0, U+00AD, U+200D, U+200E and
    U+200F are deleted.

    Args:
        names: Iterable of strings.

    Returns:
        A new list with the cleaned names, in input order.
    """
    # None of the replacement values is itself a key, so a single
    # translation pass yields the same text as replacing one key at a time.
    translation = str.maketrans(
        {
            "\t": " ",
            "\xa0": "",
            "\xad": "",
            "\u200c": " ",
            "\u200d": "",
            "\u200e": "",
            "\u200f": "",
        }
    )
    return [entry.translate(translation) for entry in tqdm(names)]
def filter_invalid_names(names):
    """Split names into those made only of whitelisted characters and the rest.

    A name is valid when every one of its characters is in the whitelist
    below (space, Arabic/Persian letters, short-vowel marks and tatweel).
    An empty string has no offending character and therefore counts as valid.

    Args:
        names: Iterable of strings.

    Returns:
        A ``(valid_names, invalid_names)`` tuple of lists, each keeping the
        input order.
    """
    # A frozenset gives O(1) membership tests instead of scanning a list for
    # every character of every name.
    valid_characters = frozenset(
        [
            " ",  # SPACE
            "ء",  # ARABIC LETTER HAMZA
            "آ",  # ARABIC LETTER ALEF WITH MADDA ABOVE
            "ا",  # ARABIC LETTER ALEF
            "أ",  # ARABIC LETTER ALEF WITH HAMZA ABOVE
            "إ",  # ARABIC LETTER ALEF WITH HAMZA BELOW
            "ب",  # ARABIC LETTER BEH
            "پ",  # ARABIC LETTER PEH
            "ت",  # ARABIC LETTER TEH
            "ث",  # ARABIC LETTER THEH
            "ج",  # ARABIC LETTER JEEM
            "چ",  # ARABIC LETTER TCHEH
            "ح",  # ARABIC LETTER HAH
            "خ",  # ARABIC LETTER KHAH
            "د",  # ARABIC LETTER DAL
            "ذ",  # ARABIC LETTER THAL
            "ر",  # ARABIC LETTER REH
            "ز",  # ARABIC LETTER ZAIN
            "ژ",  # ARABIC LETTER JEH
            "س",  # ARABIC LETTER SEEN
            "ش",  # ARABIC LETTER SHEEN
            "ص",  # ARABIC LETTER SAD
            "ض",  # ARABIC LETTER DAD
            "ط",  # ARABIC LETTER TAH
            "ظ",  # ARABIC LETTER ZAH
            "ع",  # ARABIC LETTER AIN
            "غ",  # ARABIC LETTER GHAIN
            "ف",  # ARABIC LETTER FEH
            "ق",  # ARABIC LETTER QAF
            "ک",  # ARABIC LETTER KEHEH
            "ك",  # ARABIC LETTER KAF
            "گ",  # ARABIC LETTER GAF
            "ل",  # ARABIC LETTER LAM
            "م",  # ARABIC LETTER MEEM
            "ن",  # ARABIC LETTER NOON
            "و",  # ARABIC LETTER WAW
            "ؤ",  # ARABIC LETTER WAW WITH HAMZA ABOVE
            "ه",  # ARABIC LETTER HEH
            "ة",  # ARABIC LETTER TEH MARBUTA
            "ی",  # ARABIC LETTER FARSI YEH
            "ي",  # ARABIC LETTER YEH
            "ئ",  # ARABIC LETTER YEH WITH HAMZA ABOVE
            "ى",  # ARABIC LETTER ALEF MAKSURA
            "َ",  # ARABIC FATHA
            "ِ",  # ARABIC KASRA
            "ُ",  # ARABIC DAMMA
            "ً",  # ARABIC FATHATAN
            "ٍ",  # ARABIC KASRATAN
            "ٌ",  # ARABIC DAMMATAN
            "ّ",  # ARABIC SHADDA
            "ْ",  # ARABIC SUKUN
            "ـ",  # ARABIC TATWEEL
        ]
    )
    valid_names = []
    invalid_names = []
    for name in tqdm(names):
        # ``all`` short-circuits on the first character outside the whitelist.
        if all(character in valid_characters for character in name):
            valid_names.append(name)
        else:
            invalid_names.append(name)
    return valid_names, invalid_names
def replace_non_persian_characters(names):
    """Map Arabic letter variants to Persian forms and strip diacritics.

    The substitutions are applied one after another, in the order listed in
    the table below. Letter variants are replaced by the letter named in the
    trailing comment; short-vowel marks, shadda, sukun and tatweel are
    deleted.

    Args:
        names: Iterable of strings.

    Returns:
        A new list with the normalized names, in input order.
    """
    substitutions = {
        "أ": "ا",  # ARABIC LETTER ALEF WITH HAMZA ABOVE -> ARABIC LETTER ALEF
        "إ": "ا",  # ARABIC LETTER ALEF WITH HAMZA BELOW -> ARABIC LETTER ALEF
        "ك": "ک",  # ARABIC LETTER KAF -> ARABIC LETTER KEHEH
        "ؤ": "و",  # ARABIC LETTER WAW WITH HAMZA ABOVE -> ARABIC LETTER WAW
        "ة": "ه",  # ARABIC LETTER TEH MARBUTA -> ARABIC LETTER HEH
        "ي": "ی",  # ARABIC LETTER YEH -> ARABIC LETTER FARSI YEH
        "ى": "ی",  # ARABIC LETTER ALEF MAKSURA -> ARABIC LETTER FARSI YEH
        "َ": "",  # ARABIC FATHA
        "ِ": "",  # ARABIC KASRA
        "ُ": "",  # ARABIC DAMMA
        "ً": "",  # ARABIC FATHATAN
        "ٍ": "",  # ARABIC KASRATAN
        "ٌ": "",  # ARABIC DAMMATAN
        "ّ": "",  # ARABIC SHADDA
        "ْ": "",  # ARABIC SUKUN
        "ـ": "",  # ARABIC TATWEEL
    }
    normalized = []
    for entry in tqdm(names):
        # ``str.replace`` returns the text unchanged when ``source`` is
        # absent, so no membership pre-check is needed.
        for source, target in substitutions.items():
            entry = entry.replace(source, target)
        normalized.append(entry)
    return normalized
def remove_extra_spaces(names):
    """Normalize spacing inside each name and drop names that end up empty.

    For every name, in order:

    1. Any run of spaces directly following one of the letters in
       ``characters`` is removed (presumably because these letters never
       join to the following letter, so the space is not visually
       significant -- confirm with the data owner).
    2. Remaining runs of spaces are collapsed to a single space.
    3. Leading and trailing whitespace is stripped.

    Args:
        names: Iterable of strings.

    Returns:
        A list of the normalized, non-empty names in input order.
    """
    characters = ["ء", "آ", "ا", "د", "ذ", "ر", "ز", "ژ", "و"]
    result = []
    for name in tqdm(names):
        for character in characters:
            letter_then_space = character + " "
            # Repeat until stable: one pass over "X   " only removes part of
            # a longer run of spaces.
            while letter_then_space in name:
                name = name.replace(letter_then_space, character)
        # Collapse runs of spaces by splitting on the space and dropping the
        # empty pieces. Unlike a replace-until-absent loop, this always
        # terminates and does not depend on a whitespace-sensitive literal.
        name = " ".join(piece for piece in name.split(" ") if piece)
        name = name.strip()
        if name:
            result.append(name)
    return result
def calculate_frequencies(names):
    """Count how often each name occurs, most frequent first.

    Args:
        names: Iterable of hashable values (strings in this script).

    Returns:
        A ``dict`` mapping each distinct name to its count, ordered by
        descending count. Names with equal counts appear in reverse order of
        their first occurrence, because a stable ascending sort is reversed.
    """
    tally = OrderedDict()
    for entry in tqdm(names):
        tally[entry] = tally.get(entry, 0) + 1
    by_count = sorted(tally.items(), key=lambda pair: pair[1])
    by_count.reverse()
    return dict(by_count)
def write_frequencies(name_frequencies, file_path):
    """Write name frequencies to a UTF-8 CSV file with a header row.

    The file starts with the header ``name,frequency`` followed by one row
    per key of ``name_frequencies``, in its iteration order. Every field is
    quoted (``csv.QUOTE_ALL``). Existing content is overwritten.

    Args:
        name_frequencies: Mapping from name to its frequency.
        file_path: Path of the CSV file to write.
    """
    # newline="" lets the csv module control line endings itself; without it
    # rows end in "\r\r\n" on platforms that translate "\n" on write.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        csv_writer.writerow(["name", "frequency"])
        for name in tqdm(name_frequencies):
            csv_writer.writerow([name, name_frequencies[name]])
def main():
    """Run the name-cleaning pipeline on ``names.txt``.

    Steps, in order: read the names; dump the names containing each tracked
    non-printing character to its own file; replace non-printing characters;
    split off invalid names into ``invalid-names.txt``; replace non-Persian
    characters; remove extra spaces; count frequencies and write them to
    ``name-frequencies.csv``. The set of characters in use is printed after
    each of the first four transformations.
    """

    def report_characters(current_names):
        # Print the distinct characters currently present in the data set.
        print("Get characters...")
        print(get_characters(current_names))

    print("Read names...")
    names = read_names("names.txt")
    report_characters(names)

    print("Write names with non printing characters...")
    dump_targets = (
        ("\xa0", "non-printing-xa0.txt"),
        ("\xad", "non-printing-xad.txt"),
        ("\u200c", "non-printing-u200c.txt"),
        ("\u200d", "non-printing-u200d.txt"),
        ("\u200e", "non-printing-u200e.txt"),
        ("\u200f", "non-printing-u200f.txt"),
    )
    for marker, output_path in dump_targets:
        matching_names = get_names_contain_characters(names, [marker])
        write_names(sorted(matching_names), output_path)

    print("Replace non printing characters...")
    names = replace_non_printing_characters(names)
    report_characters(names)

    print("Filter invalid names...")
    names, rejected_names = filter_invalid_names(names)
    write_names(sorted(rejected_names), "invalid-names.txt")
    report_characters(names)

    print("Replace non persian characters...")
    names = replace_non_persian_characters(names)
    report_characters(names)

    print("Remove extra spaces...")
    names = remove_extra_spaces(names)

    print("calculate frequencies...")
    name_frequencies = calculate_frequencies(names)

    print("write frequencies...")
    write_frequencies(name_frequencies, "name-frequencies.csv")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Persian Names Cleaner
# Source: GitHub gist amir-saniyan/25e14d749bb0b505350eb392207eb635
# (last active July 8, 2024 18:21).