Skip to content

Instantly share code, notes, and snippets.

@grischard
Created March 29, 2019 10:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save grischard/4fe7c3954c6acdda2fd78d8b53c6e772 to your computer and use it in GitHub Desktop.
Save grischard/4fe7c3954c6acdda2fd78d8b53c6e772 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Converts the UPPERCASE names of AKK to proper Albanian/Serbian/Turkish Titlecase.
Also applies some corrections on the fly.
TODO: add code to drop the rows named 'Berbatovci' and 'Jakup Mirena', i.e. the equivalent of
`delete from roads_akk where name in ('Berbatovci', 'Jakup Mirena');`.
This is currently done in SQL after the import, which means the gpkg output still contains them.
"""
from importlib.machinery import SourceFileLoader
import os, re
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
# Directory that process_data() scans for input .gpkg files.
DATA_DIR_PATH = "/root/AKKfiles/"
# Directory the converted .gpkg files are written to; process_data() creates
# it when it does not exist yet.
NEW_DATA_DIR_PATH = "/usr/share/nginx/osm/converted-akk-data/"
# Load the titlecase module directly from its source file instead of through a
# regular import. The path is relative, so it is resolved against the current
# working directory of the process.
# NOTE(review): judging by the directory name this is presumably an
# Albanian-aware fork of python-titlecase -- confirm.
# NOTE(review): Loader.load_module() is documented as deprecated in importlib;
# consider importlib.util.spec_from_file_location() + exec_module().
titlecase = SourceFileLoader(
"titlecase", "scripts/python-titlecase-albanian/titlecase/__init__.py"
).load_module()
# Acronyms that must stay fully upper-case after title-casing.
_ACRONYMS = frozenset(("UÇK", "NATO", "TMK", "USA", "UÇPMB", "ABC"))

# Whole-word replacements: expanded titles and inflected acronyms.
_FIXED_SUBSTITUTIONS = {
    "DR": "Doktor",
    "DR.": "Doktor",
    "PROF": "Professor",
    "PROF.": "Professor",
    "KFORIT": "KFOR-it",
    "UÇK-SË": "UÇK-së",
}

# Single letters that are NOT treated as initials.
# NOTE(review): presumably because they are Albanian particles -- confirm.
_SINGLE_LETTER_WORDS = frozenset(("I", "E", "Ë"))

# Two-letter tokens that are treated as abbreviated initials.
# NOTE(review): these look like Albanian digraph letters; "XJ" may have been
# meant to be "XH" -- confirm against the data before changing it.
_TWO_LETTER_INITIALS = frozenset(("SH", "DH", "GJ", "XJ", "LL", "TH", "ZH", "RR"))


def substitutions(word, **kwargs):
    """Return a special-case replacement for *word*, or None if there is none.

    Used as the ``callback`` of ``titlecase.titlecase``; the caller upper-cases
    the whole name first, so all comparisons are against upper-case tokens.

    Args:
        word: A single token of the road name.
        **kwargs: Accepted for compatibility with the callback signature and
            ignored.

    Returns:
        The replacement string, or None when no special case applies
        (presumably titlecase then applies its default handling -- see the
        titlecase callback documentation).
    """
    if word in _ACRONYMS:
        return word

    fixed = _FIXED_SUBSTITUTIONS.get(word)
    if fixed is not None:
        return fixed

    # A lone letter is an initial and gets a full stop. Restricting this to
    # letters keeps punctuation tokens such as "-" or "&" from turning into
    # "-." / "&."; digits are left alone as well.
    if len(word) == 1 and word.isalpha() and word not in _SINGLE_LETTER_WORDS:
        return word.upper() + "."

    if word in _TWO_LETTER_INITIALS:
        return titlecase.titlecase(word) + "."

    return None
# Whole-name corrections. process_data() looks a name up here after
# title-casing and post-processing; only an exact match of the complete name
# is replaced (key -> value).
corrections = {
# Father's name: the lone "i" is an initial and needs to be "I.".
# (substitutions() deliberately does not treat a single "I" as an initial.)
"Ajet i Bala": "Ajet I. Bala",
"Baftjar i Kryeziu": "Baftjar I. Kryeziu",
"Bekë i Jusaj": "Bekë I. Jusaj",
"Shaban i Hoxha": "Shaban I. Hoxha",
# Possible typos in the AKK source data
"Atë Fran Koliqi": "At Fran Koliqi",
"Betej e 25 Majit": "Beteja e 25 Majit",
"Dimitrije Tucovč": "Dimitrije Tucoviq",
"Shëlbyesi": "Shërbyesi",
"Kosova B.": "Kosova B",
# Undo overcorrection: a name that consists only of "A" gets a full stop
# from the single-letter rule in substitutions(); remove it again.
"A.": "A",
# Bare genitive place names: "Llapit" -> "Rruga e Llapit"
"Llapit": "Rruga e Llapit",
"Bregajve": "Rruga e Bregajve"
}
# Column holding the Albanian road name, keyed by the file-name prefix (the
# part before the first "_"). Note the differing capitalisation of "AL"/"Al".
_NAME_COLUMNS = {
    "Entrances": "RoadNameAL",
    "RoadSegmentView": "RoadNameAl",
}

# Full stop directly followed by a letter. Uses a Unicode-aware letter class so
# that names starting with e.g. "Ç", "Ë" or "Š" are handled too.
_MISSING_SPACE_AFTER_STOP = re.compile(r"\.([^\W\d_])")
_MULTIPLE_SPACES = re.compile(" +")
_STOP_AFTER_DIGIT = re.compile(r"(\d)\.")


def _convert_road_name(raw):
    """Return the corrected, title-cased form of one raw AKK road name.

    Anything that is not a string (None, NaN, ...) is treated as an empty name.
    """
    if not isinstance(raw, str):
        raw = ""

    ## Preprocess
    # fix spaces missing after full stop
    name = _MISSING_SPACE_AFTER_STOP.sub(r". \1", raw)
    # convert multiple spaces to single spaces
    name = _MULTIPLE_SPACES.sub(" ", name)
    # Prefix Rruga so that "E FOO" becomes "RRUGA E FOO"
    if name.startswith("E "):
        name = "RRUGA " + name

    ## Titlecase
    name = titlecase.titlecase(name.upper(), callback=substitutions)

    ## Post-process
    # Postfix -së for UÇK possessive case
    if name.endswith(("e UÇK", "i UÇK")):
        name += "-së"
    # The source data uses "ð" where "đ" is meant
    name = name.replace("ð", "đ")
    # Lowercase ž as a separate letter
    name = name.replace("DŽ", "Dž")
    # Apply whole-name corrections
    name = corrections.get(name, name)
    # Remove full stops after numbers
    return _STOP_AFTER_DIGIT.sub(r"\1", name)


def process_data():
    """Convert the road names of every .gpkg file in DATA_DIR_PATH.

    Each file is read with geopandas, gets a new ``RoadNameConverted`` column
    with the converted name of every row, and is written under the same file
    name to NEW_DATA_DIR_PATH (created if necessary) using the GPKG driver.

    Files that do not end in ``.gpkg`` and files whose name prefix is not a
    key of ``_NAME_COLUMNS`` are reported on stdout and skipped. If the input
    directory is missing, a message is printed and nothing is done.

    Raises:
        OSError: If the output directory cannot be created.
    """
    # Check paths
    if not os.path.isdir(DATA_DIR_PATH):
        print("Data directory " + DATA_DIR_PATH + " is missing")
        return
    # exist_ok avoids a separate existence check; a real failure surfaces as
    # the original OSError instead of being masked.
    os.makedirs(NEW_DATA_DIR_PATH, exist_ok=True)

    for filename in os.listdir(DATA_DIR_PATH):
        if not filename.endswith(".gpkg"):
            print("The file " + filename + " is not GPKG format!")
            continue

        fn_type = filename.split(".")[0].split("_")[0]
        name_column = _NAME_COLUMNS.get(fn_type)
        if name_column is None:
            # Without a known name column there is nothing to convert.
            print("The file " + filename + " has an unknown type, skipping.")
            continue

        print(filename + " is being processed.")
        gdf = gpd.read_file(os.path.join(DATA_DIR_PATH, filename))
        # Add the column to the GeoDataFrame itself rather than round-tripping
        # through a plain DataFrame, so the geo metadata stays attached.
        gdf["RoadNameConverted"] = [
            _convert_road_name(raw) for raw in gdf[name_column]
        ]
        # Save new files
        gdf.to_file(os.path.join(NEW_DATA_DIR_PATH, filename), driver="GPKG")
        print(filename + " successfully processed and saved.")
# Run the conversion as soon as the module is executed. There is no
# `if __name__ == "__main__":` guard, so importing this file runs it as well.
process_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment