#!/usr/bin/env python3
# -*- coding: utf-8 -*-
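"""
Prepare the 'data_app' folder used by the Regioviz application:

- convert each input shapefile to GeoJSON (keeping the EPSG:2154 projection),
- for each feature of the target layer, compute the nearest feature in every
  other territorial mesh/level (using one R-tree index per level),
- export the dataset and the metadata sheet as csv,
- zip the resulting files together.
"""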
import pandas as pd
import geopandas as gpd
import ujson as json
import rtree
import os
import shutil
import zipfile

def zipdir(path, ziph):
    """Recursively add every file under `path` to the opened ZipFile `ziph`."""
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))
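
# NB: ziph.write() receives paths relative to the current directory and no
# arcname, so the archive entries keep their 'data_app/' prefix.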

def idx_generator_func(bounds_ix):
    """Yield (id, bounds, obj) tuples from a list of (id, bounds) tuples."""
    for i, bound in bounds_ix:
        yield (i, bound, i)


def make_index(bounds):
    """Build an R-tree index from a list of (id, bounds) tuples."""
    return rtree.index.Index([z for z in idx_generator_func(bounds)],
                             interleaved=True)
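
# Each entry streamed to the R-tree is an (id, bounds, obj) tuple: the
# DataFrame index label is stored both as the entry id and as the raw object,
# so that intersection(..., objects='raw') returns labels directly usable
# with gdf.loc (see get_dists_ix_nearest below).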

def get_dists_ix_nearest(destination_level, origin_feature):
    """Return the feature of `destination_level` nearest to `origin_feature`.

    Relies on the module-level `gdf` GeoDataFrame and the per-level `indexes`
    R-trees built in the main section. A feature of the destination level
    bearing the same name as the origin feature is returned directly.
    """
    subset = gdf[gdf.level == destination_level]
    match = subset[subset.name == origin_feature.name]
    if not match.empty:
        return match.iloc[0]
    # Query the R-tree with a growing buffer around the origin feature until
    # at least one candidate is found, then return the candidate whose
    # centroid is closest:
    dist = 3000
    while True:
        ix_rtree = list(indexes[destination_level].intersection(
            origin_feature.geometry.buffer(dist).bounds,
            objects='raw'))
        close_features = gdf.loc[ix_rtree]
        dists = sorted(
            [(origin_feature.geometry.centroid.distance(ft.geometry.centroid),
              ft.Index)
             for ft in close_features.itertuples()],
            key=lambda x: x[0])
        if dists:
            return gdf.loc[dists[0][1]]
        dist += 7000
        if dist > 200000:  # This shouldn't happen on our dataset:
            raise ValueError(
                'No nearest feature for: {}'.format(origin_feature.id))
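
# NB: the search radii used above (3 km initial buffer, 7 km increments,
# 200 km cap) assume EPSG:2154 (Lambert-93) coordinates, expressed in metres,
# and are presumably tuned to the size of French territorial units.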

if __name__ == '__main__':
    # Let's go to the appropriate folder containing our files for this project:
    os.chdir('/home/mz/rgvz_fra')
    # Remove the data_app folder if any:
    try:
        shutil.rmtree('data_app')
    except FileNotFoundError:
        pass
    # Create the folder to be used for receiving the modified files:
    os.mkdir('data_app')
    # Read the csv dataset:
    data = pd.read_csv('REGIOVIZ_DATA.csv', sep='\t', encoding='latin-1')
    data.set_index('id', inplace=True)
    # Read each input geom file and convert it to GeoJSON (still using the
    # EPSG:2154 projection); layers whose name matches a data level (or 'AUT')
    # are skipped:
    list_files = ['geom/' + i for i in os.listdir('geom') if 'shp' in i
                  and i.replace('.shp', '') not in list(set(data.level)) + ['AUT']]
    gdf_geo, name_geo = None, None
    for fpath in list_files:
        gdf = gpd.read_file(fpath)
        name = fpath.replace('.shp', '2154.geojson').replace('geom/', 'data_app/')
        # On the target layer, for each feature we compute the nearest feature
        # in each of the other territorial meshes/levels:
        if 'territoires_france' in fpath:
            # For later use:
            gdf_geo = gdf.copy()
            name_geo = name.replace('2154', '')
            gdf.set_index('id', inplace=True)
            # Join the dataset on our target layer:
            gdf = gdf.join(data, lsuffix="_")
            gdf.reset_index(drop=False, inplace=True)
            # List the currently available levels:
            levels = set(gdf.level)
            # Drop some useless columns:
            to_drop = set(gdf.columns).difference(
                set(['id', 'geometry', 'level', 'name'] + list(levels)))
            gdf.drop(labels=to_drop, axis=1, inplace=True)
            # Index the features separately for each level:
            indexes = {}
            for level in levels:
                indexes[level] = make_index(
                    [(a.Index, a.geometry.bounds)
                     for a in gdf[gdf.level == level].itertuples()])
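            # (Passing a stream of (id, bounds, obj) entries to
            # rtree.index.Index, as make_index does, lets rtree bulk-load
            # the tree instead of inserting the features one by one.)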
            # For each feature, compute the nearest feature of every other
            # level and store its id in a dedicated 'nearest_*' column:
            i = 1
            tot = len(gdf)
            for origin_level in levels:
                print('Current origin level: {} / {}% done'
                      .format(origin_level, round(i * 100 / tot)))
                other_levels = levels.difference({origin_level})
                subset = gdf[gdf.level == origin_level]
                for origin_ft in subset.itertuples():
                    i += 1
                    for dest_level in other_levels:
                        col_name = 'nearest_{}'.format(dest_level)
                        closest = get_dists_ix_nearest(dest_level, origin_ft)
                        gdf.loc[origin_ft.Index, col_name] = closest['id']
            new_columns = ['nearest_{}'.format(lvl) for lvl in levels]
            # Drop useless columns again:
            to_drop = set(gdf.columns).difference(
                set(['id', 'geometry'] + new_columns))
            gdf.drop(labels=to_drop, axis=1, inplace=True)
        # Write the GeoJSON file:
        d = json.loads(gdf.to_json())
        d['crs'] = {
            "type": "name",
            "properties": {"name": "urn:ogc:def:crs:EPSG::2154"}
        }
        with open(name, 'w') as f:
            f.write(json.dumps(d))
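        # NB: GeoDataFrame.to_json() does not emit a 'crs' member (RFC 7946
        # dropped it from the GeoJSON spec), hence the manual injection of
        # the legacy named-CRS object above.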
    data.reset_index(drop=False, inplace=True)
    # ... and export the dataset (csv format, comma separator, fields quoted
    # only when needed, empty values flagged with the 'NA' string):
    data.to_csv('data_app/REGIOVIZ_DATA.csv',
                sep=',', na_rep='NA', index=False, encoding='UTF-8')
    # Convert the metadata sheet to csv, normalizing curly apostrophes
    # to straight ones on the way:
    a = pd.read_excel('indicateurs_meta.xls', encoding='ISO-8859-1')
    a = a.replace('’', "'", regex=True)
    a['Methodology'] = a['Methodology.1']
    a.to_csv('data_app/indicateurs_meta.csv',
             sep=',', index=False, encoding='UTF-8')
    # Copy the style file:
    shutil.copy2('styles.json', 'data_app/styles.json')
    # We are supposed to have 10 files in the data_app folder:
    assert len(os.listdir('data_app')) == 10
    # Let's zip them together:
    zipf = zipfile.ZipFile('data.zip', 'w', zipfile.ZIP_DEFLATED)
    zipdir('data_app/', zipf)
    zipf.close()
    # ... and move the archive into the data_app folder:
    shutil.move('data.zip', 'data_app/data.zip')
    # Also export the target layer with the EPSG:4326 projection,
    # as it will be available for download in the application:
    with open(name_geo, 'w') as f:
        f.write(gdf_geo.to_crs(epsg=4326).to_json())
    # Also copy the various metadata sheets:
    for name_doc in os.listdir('a_integrer'):
        path_doc = os.path.join('a_integrer', name_doc)
        with open(path_doc, 'rb') as f_in:
            if 'Doc_metadata_rgvz' in path_doc:
                path_doc = path_doc.replace(
                    'a_integrer', 'data_app').replace(
                    'Doc_metadata_rgvz', 'Metadonnees_Regioviz')
                with open(path_doc, 'wb') as f_out:
                    f_out.write(f_in.read())
            else:
                with open(path_doc.replace('a_integrer', 'data_app'), 'wb') as f_out:
                    f_out.write(f_in.read())