Skip to content

Instantly share code, notes, and snippets.

@amir-saniyan
Last active July 8, 2024 18:21
Show Gist options
  • Save amir-saniyan/25e14d749bb0b505350eb392207eb635 to your computer and use it in GitHub Desktop.
Save amir-saniyan/25e14d749bb0b505350eb392207eb635 to your computer and use it in GitHub Desktop.
Persian Names Cleaner

Persian Names Cleaner

from tqdm import tqdm
from collections import OrderedDict
import csv


def read_names(file_path):
    """Load the non-blank lines of a UTF-8 text file.

    Args:
        file_path: Path of the file to read; one name is expected per line.

    Returns:
        A list with each line stripped of surrounding whitespace, in file
        order. Lines that are empty after stripping are left out.
    """
    with open(file_path, encoding="utf-8") as handle:
        # tqdm wraps the file iterator so progress is reported per line read.
        stripped_lines = (raw_line.strip() for raw_line in tqdm(handle))
        return [entry for entry in stripped_lines if entry]


def write_names(names, file_path):
    """Write each name on its own line to a UTF-8 text file.

    Args:
        names: Iterable of strings to write.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        # The generator is consumed lazily, so tqdm still ticks once per name.
        out.writelines(entry + "\n" for entry in tqdm(names))


def get_characters(names):
    """Return the distinct characters used across all names.

    Args:
        names: Iterable of strings.

    Returns:
        A sorted list of one-character strings with no duplicates.
    """
    # A set gives constant-time de-duplication; a list would be scanned
    # linearly for every character of every name.
    characters = set()

    for name in tqdm(names):
        # Iterating a string yields its characters, so update() adds them all.
        characters.update(name)

    return sorted(characters)


def get_names_contain_characters(names, characters):
    """Select the names that contain at least one of the given characters.

    Args:
        names: Iterable of strings to search.
        characters: Iterable of strings to look for; each is matched with the
            ``in`` operator, i.e. as a substring of the name.

    Returns:
        A list of the matching names in input order. Each input entry appears
        at most once, however many of ``characters`` it contains.
    """
    result = []

    for name in tqdm(names):
        # any() stops at the first hit, so a name matching several characters
        # is appended once instead of once per matching character.
        if any(character in name for character in characters):
            result.append(name)

    return result


def replace_non_printing_characters(names):
    """Replace or delete invisible and formatting characters in each name.

    TAB and U+200C (zero width non-joiner) become a regular space; U+00A0,
    U+00AD, U+200D, U+200E and U+200F are removed.

    Args:
        names: Iterable of strings.

    Returns:
        A new list with the cleaned names, in input order.
    """
    # Every key is one character and no replacement is itself a key, so a
    # single translate() pass gives the same result as chained replace() calls.
    translation = str.maketrans(
        {
            "\t": " ",
            "\xa0": "",
            "\xad": "",
            "\u200c": " ",
            "\u200d": "",
            "\u200e": "",
            "\u200f": "",
        }
    )

    return [original.translate(translation) for original in tqdm(names)]


def filter_invalid_names(names):
    """Split names into those made only of whitelisted characters and the rest.

    The whitelist is the space character plus the Arabic-script letters,
    diacritics and tatweel listed below.

    Args:
        names: Iterable of strings.

    Returns:
        A ``(valid_names, invalid_names)`` tuple of lists, each preserving
        input order. A name is invalid as soon as it contains one character
        outside the whitelist; an empty name counts as valid.
    """
    # frozenset gives O(1) membership; a list would be scanned linearly for
    # every character of every name.
    valid_characters = frozenset(
        {
            " ",  # SPACE
            "ء",  # ARABIC LETTER HAMZA
            "آ",  # ARABIC LETTER ALEF WITH MADDA ABOVE
            "ا",  # ARABIC LETTER ALEF
            "أ",  # ARABIC LETTER ALEF WITH HAMZA ABOVE
            "إ",  # ARABIC LETTER ALEF WITH HAMZA BELOW
            "ب",  # ARABIC LETTER BEH
            "پ",  # ARABIC LETTER PEH
            "ت",  # ARABIC LETTER TEH
            "ث",  # ARABIC LETTER THEH
            "ج",  # ARABIC LETTER JEEM
            "چ",  # ARABIC LETTER TCHEH
            "ح",  # ARABIC LETTER HAH
            "خ",  # ARABIC LETTER KHAH
            "د",  # ARABIC LETTER DAL
            "ذ",  # ARABIC LETTER THAL
            "ر",  # ARABIC LETTER REH
            "ز",  # ARABIC LETTER ZAIN
            "ژ",  # ARABIC LETTER JEH
            "س",  # ARABIC LETTER SEEN
            "ش",  # ARABIC LETTER SHEEN
            "ص",  # ARABIC LETTER SAD
            "ض",  # ARABIC LETTER DAD
            "ط",  # ARABIC LETTER TAH
            "ظ",  # ARABIC LETTER ZAH
            "ع",  # ARABIC LETTER AIN
            "غ",  # ARABIC LETTER GHAIN
            "ف",  # ARABIC LETTER FEH
            "ق",  # ARABIC LETTER QAF
            "ک",  # ARABIC LETTER KEHEH
            "ك",  # ARABIC LETTER KAF
            "گ",  # ARABIC LETTER GAF
            "ل",  # ARABIC LETTER LAM
            "م",  # ARABIC LETTER MEEM
            "ن",  # ARABIC LETTER NOON
            "و",  # ARABIC LETTER WAW
            "ؤ",  # ARABIC LETTER WAW WITH HAMZA ABOVE
            "ه",  # ARABIC LETTER HEH
            "ة",  # ARABIC LETTER TEH MARBUTA
            "ی",  # ARABIC LETTER FARSI YEH
            "ي",  # ARABIC LETTER YEH
            "ئ",  # ARABIC LETTER YEH WITH HAMZA ABOVE
            "ى",  # ARABIC LETTER ALEF MAKSURA
            "َ",  # ARABIC FATHA
            "ِ",  # ARABIC KASRA
            "ُ",  # ARABIC DAMMA
            "ً",  # ARABIC FATHATAN
            "ٍ",  # ARABIC KASRATAN
            "ٌ",  # ARABIC DAMMATAN
            "ّ",  # ARABIC SHADDA
            "ْ",  # ARABIC SUKUN
            "ـ",  # ARABIC TATWEEL
        }
    )

    valid_names = []
    invalid_names = []

    for name in tqdm(names):
        # issuperset() walks the characters of the name and stops at the first
        # one that is not whitelisted.
        if valid_characters.issuperset(name):
            valid_names.append(name)
        else:
            invalid_names.append(name)

    return valid_names, invalid_names


def replace_non_persian_characters(names):
    """Map Arabic letter variants to Persian forms and drop diacritics.

    Args:
        names: Iterable of strings.

    Returns:
        A new list, in input order, where every character listed in the
        substitution table has been replaced by its target; diacritics and
        tatweel map to the empty string and are therefore removed.
    """
    substitutions = {
        "أ": "ا",  # ARABIC LETTER ALEF WITH HAMZA ABOVE -> ARABIC LETTER ALEF
        "إ": "ا",  # ARABIC LETTER ALEF WITH HAMZA BELOW -> ARABIC LETTER ALEF
        "ك": "ک",  # ARABIC LETTER KAF -> ARABIC LETTER KEHEH
        "ؤ": "و",  # ARABIC LETTER WAW WITH HAMZA ABOVE -> ARABIC LETTER WAW
        "ة": "ه",  # ARABIC LETTER TEH MARBUTA -> ARABIC LETTER HEH
        "ي": "ی",  # ARABIC LETTER YEH -> ARABIC LETTER FARSI YEH
        "ى": "ی",  # ARABIC LETTER ALEF MAKSURA -> ARABIC LETTER FARSI YEH
        "َ": "",  # ARABIC FATHA
        "ِ": "",  # ARABIC KASRA
        "ُ": "",  # ARABIC DAMMA
        "ً": "",  # ARABIC FATHATAN
        "ٍ": "",  # ARABIC KASRATAN
        "ٌ": "",  # ARABIC DAMMATAN
        "ّ": "",  # ARABIC SHADDA
        "ْ": "",  # ARABIC SUKUN
        "ـ": "",  # ARABIC TATWEEL
    }

    def normalize(text):
        # Apply the substitutions in table order; replace() leaves the string
        # unchanged when the source character is absent.
        for source, target in substitutions.items():
            text = text.replace(source, target)
        return text

    return [normalize(original) for original in tqdm(names)]


def remove_extra_spaces(names):
    """Normalise spacing inside each name and drop names that end up empty.

    For every name: spaces directly following one of the listed letters are
    removed, runs of spaces are collapsed to a single space, and surrounding
    whitespace is stripped.

    Args:
        names: Iterable of strings.

    Returns:
        A list of the cleaned, non-empty names in input order.
    """
    # NOTE(review): presumably chosen because these letters never join to the
    # following letter, so a space after them is not visible -- confirm.
    trailing_space_letters = ("ء", "آ", "ا", "د", "ذ", "ر", "ز", "ژ", "و")

    cleaned = []

    for original in tqdm(names):
        text = original

        for letter in trailing_space_letters:
            needle = letter + " "
            # Repeat until stable: removing one space can expose another
            # directly after the same letter.
            while needle in text:
                text = text.replace(needle, letter)

        while "  " in text:
            text = text.replace("  ", " ")

        text = text.strip()

        if text:
            cleaned.append(text)

    return cleaned


def calculate_frequencies(names):
    """Count how often each name occurs, most frequent first.

    Args:
        names: Iterable of hashable values (strings in this script).

    Returns:
        A dict mapping each distinct name to its count, ordered by
        descending count. Names with equal counts appear in the reverse of
        the order in which they were first seen.
    """
    counts = OrderedDict()

    for name in tqdm(names):
        counts[name] = counts.get(name, 0) + 1

    # Stable ascending sort followed by a full reversal; this is what yields
    # the reversed first-seen order among ties.
    ranked = sorted(counts.items(), key=lambda pair: pair[1])
    ranked.reverse()

    return dict(ranked)


def write_frequencies(name_frequencies, file_path):
    """Write name/frequency pairs to a fully quoted UTF-8 CSV file.

    Args:
        name_frequencies: Mapping of name to count; rows are written in the
            mapping's iteration order.
        file_path: Destination path; an existing file is overwritten.
    """
    # newline="" is required by the csv module: the writer emits its own
    # "\r\n" terminators, and without it platforms that translate "\n"
    # (e.g. Windows) would write "\r\r\n" and show blank rows.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        csv_writer.writerow(["name", "frequency"])

        for name in tqdm(name_frequencies):
            csv_writer.writerow([name, name_frequencies[name]])


def main():
    """Run the full cleaning pipeline from ``names.txt`` to the output files.

    Steps, in order: read the names, dump the names containing each
    non-printing character to its own file, replace non-printing characters,
    split off invalid names into ``invalid-names.txt``, replace non-Persian
    characters, remove extra spaces, and write the frequency table to
    ``name-frequencies.csv``. The set of characters in use is printed before
    and after several of the steps.
    """

    def report_characters(current_names):
        # Print the characters currently present so each step can be checked.
        print("Get characters...")
        print(get_characters(current_names))

    print("Read names...")
    names = read_names("names.txt")

    report_characters(names)

    print("Write names with non printing characters...")
    non_printing_dumps = (
        ("\xa0", "non-printing-xa0.txt"),
        ("\xad", "non-printing-xad.txt"),
        ("\u200c", "non-printing-u200c.txt"),
        ("\u200d", "non-printing-u200d.txt"),
        ("\u200e", "non-printing-u200e.txt"),
        ("\u200f", "non-printing-u200f.txt"),
    )
    for character, dump_path in non_printing_dumps:
        special_names = get_names_contain_characters(names, [character])
        write_names(sorted(special_names), dump_path)

    print("Replace non printing characters...")
    names = replace_non_printing_characters(names)

    report_characters(names)

    print("Filter invalid names...")
    names, invalid_names = filter_invalid_names(names)
    write_names(sorted(invalid_names), "invalid-names.txt")

    report_characters(names)

    print("Replace non persian characters...")
    names = replace_non_persian_characters(names)

    report_characters(names)

    print("Remove extra spaces...")
    names = remove_extra_spaces(names)

    print("calculate frequencies...")
    name_frequencies = calculate_frequencies(names)

    print("write frequencies...")
    write_frequencies(name_frequencies, "name-frequencies.csv")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment