Created
March 29, 2019 10:44
-
-
Save grischard/4fe7c3954c6acdda2fd78d8b53c6e772 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Converts the UPPERCASE road names of AKK to proper Albanian/Serbian/Turkish title case.
Applies some corrections on the fly.

TODO: add code to delete from roads_akk where name in ('Berbatovci', 'Jakup Mirena').
We currently do this in SQL after the import, which means the GPKG output still contains them.
"""
from importlib.machinery import SourceFileLoader | |
import os, re | |
import pandas as pd | |
import geopandas as gpd | |
from geopandas import GeoDataFrame | |
# Directory that is scanned for the source .gpkg files.
DATA_DIR_PATH = "/root/AKKfiles/"
# Directory the converted .gpkg files are written to.
NEW_DATA_DIR_PATH = "/usr/share/nginx/osm/converted-akk-data/"

# Load the titlecase module directly from its source file. The path is
# relative, so it is resolved against the current working directory.
# NOTE(review): load_module() is deprecated in importlib; confirm before
# upgrading the interpreter.
_titlecase_loader = SourceFileLoader(
    "titlecase",
    "scripts/python-titlecase-albanian/titlecase/__init__.py",
)
titlecase = _titlecase_loader.load_module()
def substitutions(word, **kwargs):
    """Per-word override used as the ``callback`` of ``titlecase.titlecase``.

    ``word`` is expected in upper case. Returns the replacement string for
    words that need special treatment, or ``None`` when there is no override
    (presumably letting titlecase apply its default casing -- confirm against
    the titlecase library).
    """
    # Acronyms that must stay fully capitalised.
    keep_upper = ("UÇK", "NATO", "TMK", "USA", "UÇPMB", "ABC")
    if word in keep_upper:
        return word.upper()

    # Abbreviations and inflected acronyms with one fixed spelling.
    fixed_spellings = {
        "DR": "Doktor",
        "DR.": "Doktor",
        "PROF": "Professor",
        "PROF.": "Professor",
        "KFORIT": "KFOR-it",
        "UÇK-SË": "UÇK-së",
    }
    if word in fixed_spellings:
        return fixed_spellings[word]

    # A lone letter is treated as an initial and gets a full stop, except
    # for the one-letter words listed here.
    # NOTE(review): "E" appears twice; one may have been meant as a
    # different (look-alike) character -- confirm against the original file.
    one_letter_words = ("I", "E", "Ë", "E")
    if len(word) == 1 and word not in one_letter_words:
        return word.upper() + "."

    # Two-letter digraphs standing alone are treated as initials as well.
    digraphs = ("SH", "DH", "GJ", "XJ", "LL", "TH", "ZH", "RR")
    if word in digraphs:
        return titlecase.titlecase(word) + "."

    return None
# Whole-name replacements, looked up with the title-cased name as the key.
corrections = {
    # The middle "i" is the initial of the father's name, not the article
    "Ajet i Bala": "Ajet I. Bala",
    "Baftjar i Kryeziu": "Baftjar I. Kryeziu",
    "Bekë i Jusaj": "Bekë I. Jusaj",
    "Shaban i Hoxha": "Shaban I. Hoxha",

    # Probable typos in the AKK data
    "Atë Fran Koliqi": "At Fran Koliqi",
    "Betej e 25 Majit": "Beteja e 25 Majit",
    "Dimitrije Tucovč": "Dimitrije Tucoviq",
    "Shëlbyesi": "Shërbyesi",
    "Kosova B.": "Kosova B",

    # Undo the full stop added to a lone letter
    "A.": "A",

    # Bare "to somewhere" names get a "Rruga e" prefix
    "Llapit": "Rruga e Llapit",
    "Bregajve": "Rruga e Bregajve",
}
# Column holding the Albanian road name, keyed by the file-name prefix (the
# part before the first "_"). The two layers capitalise the column differently.
_ROAD_NAME_COLUMNS = {
    "Entrances": "RoadNameAL",
    "RoadSegmentView": "RoadNameAl",
}


def _convert_road_name(rna):
    """Return the title-cased and corrected form of one raw road name string."""
    ## Preprocess
    # fix spaces missing after full stop (ASCII letters only)
    rna = re.sub(r"\.([a-zA-Z])", ". \\1", rna)
    # convert multiple spaces to single spaces
    rna = re.sub(" +", " ", rna)
    # Prefix Rruga so that "E FOO" becomes "RRUGA E FOO"
    if rna.startswith("E "):
        rna = "RRUGA " + rna

    ## Titlecase
    rna = titlecase.titlecase(rna.upper(), callback=substitutions)

    ## Post-process
    # Postfix -së for UÇK possessive case
    if rna.endswith(("e UÇK", "i UÇK")):
        rna += "-së"
    # The source uses Icelandic eth where Serbian d-with-stroke is meant
    rna = rna.replace("ð", "đ")
    # Lowercase ž as a separate letter
    rna = rna.replace("DŽ", "Dž")
    # Apply whole-name corrections
    rna = corrections.get(rna, rna)
    # Remove full stops after numbers (done last, so after the corrections)
    return re.sub(r"(\d)\.", r"\1", rna)


def process_data():
    """Convert the road names of every .gpkg file in DATA_DIR_PATH.

    For each GeoPackage the road-name column is converted and stored in a new
    ``RoadNameConverted`` column, and the result is written under the same
    file name to NEW_DATA_DIR_PATH. Files that are not ``.gpkg``, or whose
    name prefix is not a known layer type, are reported and skipped.

    Returns None. Prints a message and returns early when DATA_DIR_PATH does
    not exist.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # Check paths
    if not os.path.isdir(DATA_DIR_PATH):
        print("Data directory " + DATA_DIR_PATH + " is missing")
        return
    # Let a genuine OSError propagate; the previous bare except re-raised
    # with an undefined name and turned every failure into a NameError.
    os.makedirs(NEW_DATA_DIR_PATH, exist_ok=True)

    for filename in os.listdir(DATA_DIR_PATH):
        if not filename.endswith(".gpkg"):
            print("The file " + filename + " is not GPKG format!")
            continue

        # The layer type is the file name up to the first "." and "_".
        fn_type = filename.split(".")[0].split("_")[0]
        name_column = _ROAD_NAME_COLUMNS.get(fn_type)
        if name_column is None:
            # Previously this left the row value unbound (or stale from the
            # previous file); skip explicitly instead.
            print("The file " + filename + " has an unknown layer type, skipping.")
            continue

        print(filename + " is being processed.")
        gdf = gpd.read_file(os.path.join(DATA_DIR_PATH, filename))
        df = pd.DataFrame(gdf)

        # Missing names (None or NaN) are treated as empty strings.
        df["RoadNameConverted"] = [
            _convert_road_name("" if pd.isna(raw) else raw)
            for raw in df[name_column]
        ]

        # Convert data to original format
        new_gdf = GeoDataFrame(df)
        # Save new files
        new_gdf.to_file(os.path.join(NEW_DATA_DIR_PATH, filename), driver="GPKG")
        print(filename + " successfully processed and saved.")
# Run the conversion whenever this file is executed (or imported).
process_data()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment