@grischard
Created February 13, 2020 09:27
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Return GeoJSON centroids with population for each locality in Luxembourg.
BD-Adresses doesn't include communes, so we use the OpenStreetMap community's
csventrifuge output, which has been enriched. The output of this script is
© OpenStreetMap Contributors, see https://openstreetmap.org/copyright.
Some localities are spread over several communes (yeah), e.g. Dirbach, which is in three.
- Downloads the latest addresses produced by csventrifuge (enriched by commune)
- Averages the positions of all addresses in each locality
- Conflates the population from the RNPP
- Spits out GeoJSON
Run like:
python3 place-centroid.py > place-centroids.geojson
"""
import pandas as pd
import geojson
def data2geojson(df):
    # Converts our dataframe to geojson. Column names hardcoded.
    features = []
    insert_features = lambda X: features.append(
        geojson.Feature(
            geometry=geojson.Point(
                (round(X["lon_wgs84"], 4), round(X["lat_wgs84"], 4))
            ),
            properties=dict(
                localite=X["localite"], commune=X["commune"], population=X["population"]
            ),
        )
    )
    df.apply(insert_features, axis=1)
    return geojson.dumps(
        geojson.FeatureCollection(features), sort_keys=True, ensure_ascii=False
    )
# The endpoint that redirects to the most recent version of the
# addresses in CSV.
ADDRESSES_CSV = "https://openstreetmap.lu/luxembourg-addresses.csv"
df = pd.read_csv(ADDRESSES_CSV)
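# The enriched CSV is expected to carry at least localite, commune, lat_wgs84
# and lon_wgs84, alongside the CACLR/LUREF columns we drop just below.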
# Drop useless labels
df = df.drop(
    labels=[
        "rue",
        "numero",
        "code_postal",
        "id_caclr_rue",
        "id_caclr_bat",
        "coord_est_luref",
        "coord_nord_luref",
        "id_geoportail",
        "code_commune",
    ],
    axis=1,
)
# Remove commune in parentheses, information is redundant and makes conflation
# more difficult
df = df.replace(to_replace=r" \(.*\)", value="", regex=True)
# People live there, but there's no address in BD-Adresses. Hardcode a point.
missingaddresses = [
["Helperknapp", "Kuelbecherhaff", 49.72131, 6.06763],
["Larochette", "Meysembourg", 49.76963, 6.18718],
["Waldbillig", "Grundhof", 49.81823, 6.32157],
["Wormeldange", "Dreiborn", 49.61974, 6.39383],
]
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            missingaddresses, columns=["commune", "localite", "lat_wgs84", "lon_wgs84"]
        ),
    ]
)
# Group by the mean lat/lon to get centre
df = df.groupby(["commune", "localite"]).mean()
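# Population per locality, conflated from the RNPP data published on data.public.lu.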
POPULATION_CSV = (
    "https://data.public.lu/fr/datasets/r/cd165f0a-feb7-401f-a440-d42695e63738"
)
# cp437 is the encoding of the original IBM PC. Why they use it is anyone's guess.
popdf = pd.read_csv(POPULATION_CSV, encoding="cp437")
# Drop useless labels
popdf = popdf.drop(labels="COMMUNE_CODE", axis=1)
# Remove commune in parentheses, information is redundant and makes conflation
# more difficult
popdf = popdf.replace(to_replace=r" \(.*\)", value="", regex=True)
# Rename columns to match the csventrifuge output
popdf = popdf.rename(
    columns={"COMMUNE": "commune", "LOCALITE": "localite", "POPULATION": "population"}
)
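# A few place names are spelled differently in the two sources; align them here
# so the merge below actually matches (e.g. "Redange/Attert" vs "Redange-sur-Attert").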
popdf["commune"].replace({"Redange/Attert": "Redange-sur-Attert"}, inplace=True)
popdf["localite"].replace(
{
"Redange/Attert": "Redange",
"Roodt/Eisch": "Roodt-sur-Eisch",
"Goebelsmuehle": "Goebelsmühle",
},
inplace=True,
)
# Join both together
richdf = pd.merge(df, popdf, on=["commune", "localite"], how="outer")
# Places with no population get a 0
richdf["population"].fillna(0, inplace=True)
print(data2geojson(richdf))