#!/usr/bin/env python3
# -*- coding: utf-8 -*-
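"""
Prepare the 'data_app' folder used by the Regioviz application:

- convert each input shapefile to GeoJSON (keeping the EPSG:2154 projection),
- for each feature of the target layer, compute the nearest feature in every
  other territorial mesh/level (using one R-tree index per level),
- export the dataset and the metadata sheet as csv,
- zip the resulting files together.
"""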
import pandas as pd
import geopandas as gpd
import ujson as json
import rtree
import os
import shutil
import zipfile

def zipdir(path, ziph):
    """Recursively add every file under `path` to the opened ZipFile `ziph`."""
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))
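
# NB: ziph.write() receives paths relative to the current directory and no
# arcname, so the archive entries keep their 'data_app/' prefix.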

def idx_generator_func(bounds_ix):
    """Yield (id, bounds, obj) tuples from a list of (id, bounds) tuples."""
    for i, bound in bounds_ix:
        yield (i, bound, i)


def make_index(bounds):
    """Build an R-tree index from a list of (id, bounds) tuples."""
    return rtree.index.Index([z for z in idx_generator_func(bounds)],
                             interleaved=True)
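
# Each entry streamed to the R-tree is an (id, bounds, obj) tuple: the
# DataFrame index label is stored both as the entry id and as the raw object,
# so that intersection(..., objects='raw') returns labels directly usable
# with gdf.loc (see get_dists_ix_nearest below).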

def get_dists_ix_nearest(destination_level, origin_feature):
    """Return the feature of `destination_level` nearest to `origin_feature`.

    Relies on the module-level `gdf` GeoDataFrame and the per-level `indexes`
    R-trees built in the main section. A feature of the destination level
    bearing the same name as the origin feature is returned directly.
    """
    subset = gdf[gdf.level == destination_level]
    match = subset[subset.name == origin_feature.name]
    if not match.empty:
        return match.iloc[0]
    # Query the R-tree with a growing buffer around the origin feature until
    # at least one candidate is found, then return the candidate whose
    # centroid is closest:
    dist = 3000
    while True:
        ix_rtree = list(indexes[destination_level].intersection(
            origin_feature.geometry.buffer(dist).bounds,
            objects='raw'))
        close_features = gdf.loc[ix_rtree]
        dists = sorted(
            [(origin_feature.geometry.centroid.distance(ft.geometry.centroid),
              ft.Index)
             for ft in close_features.itertuples()],
            key=lambda x: x[0])
        if dists:
            return gdf.loc[dists[0][1]]
        dist += 7000
        if dist > 200000:  # This shouldn't happen on our dataset:
            raise ValueError(
                'No nearest feature for: {}'.format(origin_feature.id))
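
# NB: the search radii used above (3 km initial buffer, 7 km increments,
# 200 km cap) assume EPSG:2154 (Lambert-93) coordinates, expressed in metres,
# and are presumably tuned to the size of French territorial units.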

if __name__ == '__main__':
    # Let's go to the appropriate folder containing our files for this project:
    os.chdir('/home/mz/rgvz_fra')
    # Remove the data_app folder if any:
    try:
        shutil.rmtree('data_app')
    except FileNotFoundError:
        pass
    # Create the folder to be used for receiving the modified files:
    os.mkdir('data_app')
    # Read the csv dataset:
    data = pd.read_csv('REGIOVIZ_DATA.csv', sep='\t', encoding='latin-1')
    data.set_index('id', inplace=True)
    # Read each input geom file and convert it to GeoJSON (still using the
    # EPSG:2154 projection); layers whose name matches a data level (or 'AUT')
    # are skipped:
    list_files = ['geom/' + i for i in os.listdir('geom') if 'shp' in i
                  and i.replace('.shp', '') not in list(set(data.level)) + ['AUT']]
    gdf_geo, name_geo = None, None
    for fpath in list_files:
        gdf = gpd.read_file(fpath)
        name = fpath.replace('.shp', '2154.geojson').replace('geom/', 'data_app/')
        # On the target layer, for each feature we compute the nearest feature
        # in each of the other territorial meshes/levels:
        if 'territoires_france' in fpath:
            # For later use:
            gdf_geo = gdf.copy()
            name_geo = name.replace('2154', '')
            gdf.set_index('id', inplace=True)
            # Join the dataset on our target layer:
            gdf = gdf.join(data, lsuffix="_")
            gdf.reset_index(drop=False, inplace=True)
            # List the currently available levels:
            levels = set(gdf.level)
            # Drop some useless columns:
            to_drop = set(gdf.columns).difference(
                set(['id', 'geometry', 'level', 'name'] + list(levels)))
            gdf.drop(labels=to_drop, axis=1, inplace=True)
            # Index the features separately for each level:
            indexes = {}
            for level in levels:
                indexes[level] = make_index(
                    [(a.Index, a.geometry.bounds)
                     for a in gdf[gdf.level == level].itertuples()])
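            # (Passing a stream of (id, bounds, obj) entries to
            # rtree.index.Index, as make_index does, lets rtree bulk-load
            # the tree instead of inserting the features one by one.)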
            # For each feature, compute the nearest feature of every other
            # level and store its id in a dedicated 'nearest_*' column:
            i = 1
            tot = len(gdf)
            for origin_level in levels:
                print('Current origin level: {} / {}% done'
                      .format(origin_level, round(i * 100 / tot)))
                other_levels = levels.difference({origin_level})
                subset = gdf[gdf.level == origin_level]
                for origin_ft in subset.itertuples():
                    i += 1
                    for dest_level in other_levels:
                        col_name = 'nearest_{}'.format(dest_level)
                        closest = get_dists_ix_nearest(dest_level, origin_ft)
                        gdf.loc[origin_ft.Index, col_name] = closest['id']
            new_columns = ['nearest_{}'.format(lvl) for lvl in levels]
            # Drop useless columns again:
            to_drop = set(gdf.columns).difference(
                set(['id', 'geometry'] + new_columns))
            gdf.drop(labels=to_drop, axis=1, inplace=True)
        # Write the GeoJSON file:
        d = json.loads(gdf.to_json())
        d['crs'] = {
            "type": "name",
            "properties": {"name": "urn:ogc:def:crs:EPSG::2154"}
        }
        with open(name, 'w') as f:
            f.write(json.dumps(d))
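        # NB: GeoDataFrame.to_json() does not emit a 'crs' member (RFC 7946
        # dropped it from the GeoJSON spec), hence the manual injection of
        # the legacy named-CRS object above.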
    data.reset_index(drop=False, inplace=True)
    # ... and export the dataset (csv format, comma separator, fields quoted
    # only when needed, empty values flagged with the 'NA' string):
    data.to_csv('data_app/REGIOVIZ_DATA.csv',
                sep=',', na_rep='NA', index=False, encoding='UTF-8')
    # Convert the metadata sheet to csv, normalizing curly apostrophes
    # to straight ones on the way:
    a = pd.read_excel('indicateurs_meta.xls', encoding='ISO-8859-1')
    a = a.replace('’', "'", regex=True)
    a['Methodology'] = a['Methodology.1']
    a.to_csv('data_app/indicateurs_meta.csv',
             sep=',', index=False, encoding='UTF-8')
    # Copy the style file:
    shutil.copy2('styles.json', 'data_app/styles.json')
    # We are supposed to have 10 files in the data_app folder:
    assert len(os.listdir('data_app')) == 10
    # Let's zip them together:
    zipf = zipfile.ZipFile('data.zip', 'w', zipfile.ZIP_DEFLATED)
    zipdir('data_app/', zipf)
    zipf.close()
    # ... and move the archive into the data_app folder:
    shutil.move('data.zip', 'data_app/data.zip')
    # Also export the target layer with the EPSG:4326 projection,
    # as it will be available for download in the application:
    with open(name_geo, 'w') as f:
        f.write(gdf_geo.to_crs(epsg=4326).to_json())
    # Also copy the various metadata sheets:
    for name_doc in os.listdir('a_integrer'):
        path_doc = os.path.join('a_integrer', name_doc)
        with open(path_doc, 'rb') as f_in:
            if 'Doc_metadata_rgvz' in path_doc:
                path_doc = path_doc.replace(
                    'a_integrer', 'data_app').replace(
                    'Doc_metadata_rgvz', 'Metadonnees_Regioviz')
                with open(path_doc, 'wb') as f_out:
                    f_out.write(f_in.read())
            else:
                with open(path_doc.replace('a_integrer', 'data_app'), 'wb') as f_out:
                    f_out.write(f_in.read())