# grischard/convertcase.py

## convertcase.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Converts the UPPERCASE names of AKK to proper Albanian/Serbian/Turkish Titlecase.

Does some corrections on the fly.

TODO: add code to delete from roads_akk where name in ('Berbatovci', 'Jakup Mirena');
We currently do this in sql after the import, which means the gpkg output still has them.

"""

from importlib.machinery import SourceFileLoader
import os, re
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame

# Directory that is scanned for the source .gpkg files.
DATA_DIR_PATH = "/root/AKKfiles/"
# Directory the converted .gpkg files are written to.
NEW_DATA_DIR_PATH = "/usr/share/nginx/osm/converted-akk-data/"

# Load the titlecase package directly from its source file and bind it to the
# name ``titlecase``. The path is relative, so it is resolved against the
# current working directory of the process.
# NOTE(review): ``load_module()`` is deprecated in the importlib docs in favour
# of ``exec_module()`` / ``importlib.util.spec_from_file_location()`` -- confirm
# against the Python version in use before migrating.
titlecase = SourceFileLoader(
    "titlecase", "scripts/python-titlecase-albanian/titlecase/__init__.py"
).load_module()


def substitutions(word, **kwargs):
    """Return the special-case replacement for a single word, if there is one.

    Passed as the ``callback`` argument to ``titlecase.titlecase``.

    Args:
        word: One word of the name being converted. All comparisons are
            against upper-case spellings.
        **kwargs: Accepted so the callback can be called with extra keyword
            arguments; never read.

    Returns:
        The replacement string, or ``None`` when the word needs no special
        handling (presumably the titlecase module then applies its default
        casing -- confirm against the titlecase fork).
    """
    # Words with one fixed spelling: expanded titles and suffixed acronyms.
    fixed_spellings = {
        "DR": "Doktor",
        "DR.": "Doktor",
        "PROF": "Professor",
        "PROF.": "Professor",
        "KFORIT": "KFOR-it",
        "UÇK-SË": "UÇK-së",
    }

    # Acronyms stay fully upper-case.
    if word in ("UÇK", "NATO", "TMK", "USA", "UÇPMB", "ABC"):
        return word.upper()

    if word in fixed_spellings:
        return fixed_spellings[word]

    # A lone letter is treated as an initial and gets a full stop, except
    # for the one-letter words I, E and Ë.
    is_single_letter = len(word) == 1
    if is_single_letter and word not in ("I", "E", "Ë"):
        return word.upper() + "."

    # Two-letter digraphs standing alone are initials as well: "SH" -> "Sh."
    if word in ("SH", "DH", "GJ", "XJ", "LL", "TH", "ZH", "RR"):
        return titlecase.titlecase(word) + "."

    return None


# Whole-name overrides: key is the converted name, value is its replacement.
# process_data() looks up the complete name (exact match, not a substring)
# after titlecasing and the UÇK / character post-processing, and before the
# full stops after digits are removed.
corrections = {
    # Father's name
    "Ajet i Bala": "Ajet I. Bala",
    "Baftjar i Kryeziu": "Baftjar I. Kryeziu",
    "Bekë i Jusaj": "Bekë I. Jusaj",
    "Shaban i Hoxha": "Shaban I. Hoxha",
    # Possible typos in AKK
    "Atë Fran Koliqi": "At Fran Koliqi",
    "Betej e 25 Majit": "Beteja e 25 Majit",
    "Dimitrije Tucovč": "Dimitrije Tucoviq",
    "Shëlbyesi": "Shërbyesi",
    "Kosova B.": "Kosova B",
    # Undo overcorrection
    "A.": "A",
    # "to somewhere -> Rruga to somewhere"
    "Llapit": "Rruga e Llapit",
    "Bregajve": "Rruga e Bregajve"
}


def _convert_road_name(rna):
    """Convert one raw road name to its corrected titlecase form.

    Args:
        rna: The raw name as read from the source file. A missing value
            (``None`` / NaN) is treated as an empty string.

    Returns:
        The converted name as a string.
    """
    if pd.isna(rna):
        rna = ""

    ## Preprocess

    # Fix spaces missing after a full stop.
    # NOTE(review): only ASCII letters are matched, so a full stop directly
    # followed by e.g. "Ç" or "Ë" gets no space -- confirm this is intended.
    rna = re.sub(r"\.([a-zA-Z])", r". \1", rna)

    # Convert multiple spaces to single spaces
    rna = re.sub(" +", " ", rna)

    # Prefix Rruga so that "E FOO" becomes "RRUGA E FOO"
    if rna.startswith("E "):
        rna = "RRUGA " + rna

    ## Titlecase
    rna = titlecase.titlecase(rna.upper(), callback=substitutions)

    ## Post-process

    # Postfix -së for UÇK possessive case
    if rna.endswith(("e UÇK", "i UÇK")):
        rna += "-së"

    # Serbian, Icelandic, what's the difference? *shrug*
    rna = rna.replace("ð", "đ")
    # Lowercase ž as a separate letter
    rna = rna.replace("Ǆ", "Dž")

    # Apply whole-name corrections (exact match on the complete name)
    rna = corrections.get(rna, rna)

    # Remove full stops after numbers
    return re.sub(r"(\d)\.", r"\1", rna)


def process_data():
    """Convert the road names of every ``.gpkg`` file in DATA_DIR_PATH.

    For each ``*.gpkg`` file the road-name column is chosen from the file
    type (the part of the file name before the first underscore), every name
    is converted, and the result is stored in a new ``RoadNameConverted``
    column. The file is then written under the same name to
    NEW_DATA_DIR_PATH, which is created if it does not exist.

    If DATA_DIR_PATH is missing, a message is printed and nothing is done.
    Files that are not ``.gpkg``, or whose type has no known name column,
    are reported and skipped.

    Raises:
        OSError: If NEW_DATA_DIR_PATH cannot be created.
    """
    # File type -> name column. The capitalisation really differs
    # ("RoadNameAL" vs "RoadNameAl") between the two file types.
    name_columns = {
        "Entrances": "RoadNameAL",
        "RoadSegmentView": "RoadNameAl",
    }

    # Check paths
    if not os.path.isdir(DATA_DIR_PATH):
        print("Data directory " + DATA_DIR_PATH + " is missing")
        return

    # Let the real OSError propagate; the previous bare ``except`` referenced
    # an undefined name and therefore always failed with NameError instead.
    os.makedirs(NEW_DATA_DIR_PATH, exist_ok=True)

    for filename in os.listdir(DATA_DIR_PATH):
        if not filename.endswith(".gpkg"):
            print("The file " + filename + " is not GPKG format!")
            continue

        fn_type = filename.split(".")[0].split("_")[0]
        name_column = name_columns.get(fn_type)
        if name_column is None:
            # Previously this case crashed on (or silently reused) an
            # unassigned variable; skip the file explicitly instead.
            print("The file " + filename + " has an unknown type, skipping.")
            continue

        print(filename + " is being processed.")

        gdf = gpd.read_file(os.path.join(DATA_DIR_PATH, filename))

        # Add the column to the GeoDataFrame itself so the geometry column
        # and its metadata are written back untouched.
        gdf["RoadNameConverted"] = [
            _convert_road_name(raw_name) for raw_name in gdf[name_column]
        ]

        # Save new files
        gdf.to_file(os.path.join(NEW_DATA_DIR_PATH, filename), driver="GPKG")

        print(filename + " successfully processed and saved.")


# Runs the whole conversion at module level, i.e. on import as well as when
# the file is executed as a script (there is no ``__main__`` guard).
process_data()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""
	Converts the UPPERCASE names of AKK to proper Albanian/Serbian/Turkish Titlecase.

	Does some corrections on the fly.

	TODO: add code to delete from roads_akk where name in ('Berbatovci', 'Jakup Mirena');
	We currently do this in sql after the import, which means the gpkg output still has them.

	"""

	from importlib.machinery import SourceFileLoader
	import os, re
	import pandas as pd
	import geopandas as gpd
	from geopandas import GeoDataFrame

	# Directory that is scanned for the source .gpkg files.
	DATA_DIR_PATH = "/root/AKKfiles/"
	# Directory the converted .gpkg files are written to.
	NEW_DATA_DIR_PATH = "/usr/share/nginx/osm/converted-akk-data/"

	# Load the titlecase package directly from its source file and bind it to the
	# name ``titlecase``. The path is relative, so it is resolved against the
	# current working directory of the process.
	# NOTE(review): ``load_module()`` is deprecated in the importlib docs in favour
	# of ``exec_module()`` / ``importlib.util.spec_from_file_location()`` -- confirm
	# against the Python version in use before migrating.
	titlecase = SourceFileLoader(
	"titlecase", "scripts/python-titlecase-albanian/titlecase/__init__.py"
	).load_module()


	def substitutions(word, **kwargs):
	    """Return the special-case replacement for a single word, if there is one.

	    Passed as the ``callback`` argument to ``titlecase.titlecase``.

	    Args:
	        word: One word of the name being converted. All comparisons are
	            against upper-case spellings.
	        **kwargs: Accepted so the callback can be called with extra keyword
	            arguments; never read.

	    Returns:
	        The replacement string, or ``None`` when the word needs no special
	        handling (presumably the titlecase module then applies its default
	        casing -- confirm against the titlecase fork).
	    """
	    # Words with one fixed spelling: expanded titles and suffixed acronyms.
	    fixed_spellings = {
	        "DR": "Doktor",
	        "DR.": "Doktor",
	        "PROF": "Professor",
	        "PROF.": "Professor",
	        "KFORIT": "KFOR-it",
	        "UÇK-SË": "UÇK-së",
	    }

	    # Acronyms stay fully upper-case.
	    if word in ("UÇK", "NATO", "TMK", "USA", "UÇPMB", "ABC"):
	        return word.upper()

	    if word in fixed_spellings:
	        return fixed_spellings[word]

	    # A lone letter is treated as an initial and gets a full stop, except
	    # for the one-letter words I, E and Ë.
	    is_single_letter = len(word) == 1
	    if is_single_letter and word not in ("I", "E", "Ë"):
	        return word.upper() + "."

	    # Two-letter digraphs standing alone are initials as well: "SH" -> "Sh."
	    if word in ("SH", "DH", "GJ", "XJ", "LL", "TH", "ZH", "RR"):
	        return titlecase.titlecase(word) + "."

	    return None


	# Whole-name overrides: key is the converted name, value is its replacement.
	# process_data() looks up the complete name (exact match, not a substring)
	# after titlecasing and the UÇK / character post-processing, and before the
	# full stops after digits are removed.
	corrections = {
	# Father's name
	"Ajet i Bala": "Ajet I. Bala",
	"Baftjar i Kryeziu": "Baftjar I. Kryeziu",
	"Bekë i Jusaj": "Bekë I. Jusaj",
	"Shaban i Hoxha": "Shaban I. Hoxha",
	# Possible typos in AKK
	"Atë Fran Koliqi": "At Fran Koliqi",
	"Betej e 25 Majit": "Beteja e 25 Majit",
	"Dimitrije Tucovč": "Dimitrije Tucoviq",
	"Shëlbyesi": "Shërbyesi",
	"Kosova B.": "Kosova B",
	# Undo overcorrection
	"A.": "A",
	# "to somewhere -> Rruga to somewhere"
	"Llapit": "Rruga e Llapit",
	"Bregajve": "Rruga e Bregajve"
	}


	def _convert_road_name(rna):
	    """Convert one raw road name to its corrected titlecase form.

	    Args:
	        rna: The raw name as read from the source file. A missing value
	            (``None`` / NaN) is treated as an empty string.

	    Returns:
	        The converted name as a string.
	    """
	    if pd.isna(rna):
	        rna = ""

	    ## Preprocess

	    # Fix spaces missing after a full stop.
	    # NOTE(review): only ASCII letters are matched, so a full stop directly
	    # followed by e.g. "Ç" or "Ë" gets no space -- confirm this is intended.
	    rna = re.sub(r"\.([a-zA-Z])", r". \1", rna)

	    # Convert multiple spaces to single spaces
	    rna = re.sub(" +", " ", rna)

	    # Prefix Rruga so that "E FOO" becomes "RRUGA E FOO"
	    if rna.startswith("E "):
	        rna = "RRUGA " + rna

	    ## Titlecase
	    rna = titlecase.titlecase(rna.upper(), callback=substitutions)

	    ## Post-process

	    # Postfix -së for UÇK possessive case
	    if rna.endswith(("e UÇK", "i UÇK")):
	        rna += "-së"

	    # Serbian, Icelandic, what's the difference? *shrug*
	    rna = rna.replace("ð", "đ")
	    # Lowercase ž as a separate letter
	    rna = rna.replace("Ǆ", "Dž")

	    # Apply whole-name corrections (exact match on the complete name)
	    rna = corrections.get(rna, rna)

	    # Remove full stops after numbers
	    return re.sub(r"(\d)\.", r"\1", rna)


	def process_data():
	    """Convert the road names of every ``.gpkg`` file in DATA_DIR_PATH.

	    For each ``*.gpkg`` file the road-name column is chosen from the file
	    type (the part of the file name before the first underscore), every name
	    is converted, and the result is stored in a new ``RoadNameConverted``
	    column. The file is then written under the same name to
	    NEW_DATA_DIR_PATH, which is created if it does not exist.

	    If DATA_DIR_PATH is missing, a message is printed and nothing is done.
	    Files that are not ``.gpkg``, or whose type has no known name column,
	    are reported and skipped.

	    Raises:
	        OSError: If NEW_DATA_DIR_PATH cannot be created.
	    """
	    # File type -> name column. The capitalisation really differs
	    # ("RoadNameAL" vs "RoadNameAl") between the two file types.
	    name_columns = {
	        "Entrances": "RoadNameAL",
	        "RoadSegmentView": "RoadNameAl",
	    }

	    # Check paths
	    if not os.path.isdir(DATA_DIR_PATH):
	        print("Data directory " + DATA_DIR_PATH + " is missing")
	        return

	    # Let the real OSError propagate; the previous bare ``except`` referenced
	    # an undefined name and therefore always failed with NameError instead.
	    os.makedirs(NEW_DATA_DIR_PATH, exist_ok=True)

	    for filename in os.listdir(DATA_DIR_PATH):
	        if not filename.endswith(".gpkg"):
	            print("The file " + filename + " is not GPKG format!")
	            continue

	        fn_type = filename.split(".")[0].split("_")[0]
	        name_column = name_columns.get(fn_type)
	        if name_column is None:
	            # Previously this case crashed on (or silently reused) an
	            # unassigned variable; skip the file explicitly instead.
	            print("The file " + filename + " has an unknown type, skipping.")
	            continue

	        print(filename + " is being processed.")

	        gdf = gpd.read_file(os.path.join(DATA_DIR_PATH, filename))

	        # Add the column to the GeoDataFrame itself so the geometry column
	        # and its metadata are written back untouched.
	        gdf["RoadNameConverted"] = [
	            _convert_road_name(raw_name) for raw_name in gdf[name_column]
	        ]

	        # Save new files
	        gdf.to_file(os.path.join(NEW_DATA_DIR_PATH, filename), driver="GPKG")

	        print(filename + " successfully processed and saved.")


	# Runs the whole conversion at module level, i.e. on import as well as when
	# the file is executed as a script (there is no ``__main__`` guard).
	process_data()