Last active
November 17, 2018 17:22
-
-
Save jorisvandenbossche/c8590a3617698befad527e66eefb7f5b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Writing of GeoPackage files: performance" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Triggered by https://gis.stackexchange.com/questions/302811/how-to-get-fast-writing-with-geopandas-fiona\n", | |
"\n", | |
"Issues: https://github.com/Toblerity/Fiona/issues/685 and https://github.com/geopandas/geopandas/issues/863" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"import string\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import geopandas\n", | |
"import fiona\n", | |
"import shapely.geometry" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'0.4.0+26.g9e584cc'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"geopandas.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'1.8.1'" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"fiona.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'2.3.2'" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"fiona.get_gdal_release_name()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Set-up\n", | |
"\n", | |
"Creating a test DataFrame of 100k rows, with 5 attribute columns (float, int, string):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"N = 100000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Seed both RNGs so the synthetic benchmark data is the same on every run.\n", | |
"random.seed(0)\n", | |
"np.random.seed(0)\n", | |
"\n", | |
"# Characters used to build the random 5-character strings of column 'e'.\n", | |
"alphabet = string.ascii_uppercase + string.digits\n", | |
"\n", | |
"df = geopandas.GeoDataFrame({\n", | |
"    'a': np.random.randn(N),\n", | |
"    'b': np.random.randn(N),\n", | |
"    'c': np.random.randn(N),\n", | |
"    'd': np.random.randint(100, size=N),\n", | |
"    'e': [''.join(random.choice(alphabet) for _ in range(5)) for _ in range(N)],\n", | |
"    'geometry': [shapely.geometry.Point(random.random(), random.random()) for _ in range(N)],\n", | |
"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" <th>d</th>\n", | |
" <th>e</th>\n", | |
" <th>geometry</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>-0.737142</td>\n", | |
" <td>0.794280</td>\n", | |
" <td>-0.931802</td>\n", | |
" <td>87</td>\n", | |
" <td>QM1UV</td>\n", | |
" <td>POINT (0.44808983247236 0.3743659897967149)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>-1.837916</td>\n", | |
" <td>0.374512</td>\n", | |
" <td>-1.044283</td>\n", | |
" <td>98</td>\n", | |
" <td>JXK5G</td>\n", | |
" <td>POINT (0.6382756350593217 0.56737446424385)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0.632339</td>\n", | |
" <td>0.552887</td>\n", | |
" <td>-0.527719</td>\n", | |
" <td>32</td>\n", | |
" <td>ERFFB</td>\n", | |
" <td>POINT (0.794332646272509 0.1374427257679276)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>-1.044528</td>\n", | |
" <td>0.338132</td>\n", | |
" <td>1.524393</td>\n", | |
" <td>60</td>\n", | |
" <td>LIZEK</td>\n", | |
" <td>POINT (0.165077467352082 0.04358660000682968)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>-0.385769</td>\n", | |
" <td>-1.533187</td>\n", | |
" <td>-1.613047</td>\n", | |
" <td>86</td>\n", | |
" <td>BPRAI</td>\n", | |
" <td>POINT (0.6914986729831709 0.2205968451911791)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" a b c d e \\\n", | |
"0 -0.737142 0.794280 -0.931802 87 QM1UV \n", | |
"1 -1.837916 0.374512 -1.044283 98 JXK5G \n", | |
"2 0.632339 0.552887 -0.527719 32 ERFFB \n", | |
"3 -1.044528 0.338132 1.524393 60 LIZEK \n", | |
"4 -0.385769 -1.533187 -1.613047 86 BPRAI \n", | |
"\n", | |
" geometry \n", | |
"0 POINT (0.44808983247236 0.3743659897967149) \n", | |
"1 POINT (0.6382756350593217 0.56737446424385) \n", | |
"2 POINT (0.794332646272509 0.1374427257679276) \n", | |
"3 POINT (0.165077467352082 0.04358660000682968) \n", | |
"4 POINT (0.6914986729831709 0.2205968451911791) " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'geopandas.geodataframe.GeoDataFrame'>\n", | |
"RangeIndex: 100000 entries, 0 to 99999\n", | |
"Data columns (total 6 columns):\n", | |
"a 100000 non-null float64\n", | |
"b 100000 non-null float64\n", | |
"c 100000 non-null float64\n", | |
"d 100000 non-null int64\n", | |
"e 100000 non-null object\n", | |
"geometry 100000 non-null object\n", | |
"dtypes: float64(3), int64(1), object(2)\n", | |
"memory usage: 4.6+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Timings and profiling\n", | |
"\n", | |
"Writing to GeoPackage:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 51.8 s, sys: 488 ms, total: 52.3 s\n", | |
"Wall time: 52.6 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.to_file(\"test_geopackage.gpkg\", driver='GPKG')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Mimicking the `geopandas.GeoDataFrame.to_file` function to make it easier to profile:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"\n", | |
"def to_file(df, filename, driver=\"ESRI Shapefile\", schema=None, **kwargs):\n", | |
"    \"\"\"Write ``df`` to ``filename`` with Fiona, mimicking ``GeoDataFrame.to_file``.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df : GeoDataFrame\n", | |
"        Frame to write; its ``crs`` and ``iterfeatures()`` are used.\n", | |
"    filename : str\n", | |
"        Output path; ``~`` is expanded and the path is made absolute.\n", | |
"    driver : str, default \"ESRI Shapefile\"\n", | |
"        Driver name passed to ``fiona.open``.\n", | |
"    schema : dict, optional\n", | |
"        Schema passed to ``fiona.open``. If None, it is inferred with\n", | |
"        ``geopandas.io.file.infer_schema(df)``.\n", | |
"    **kwargs\n", | |
"        Forwarded unchanged to ``fiona.open``.\n", | |
"    \"\"\"\n", | |
"    if schema is None:\n", | |
"        schema = geopandas.io.file.infer_schema(df)\n", | |
"    filename = os.path.abspath(os.path.expanduser(filename))\n", | |
"    with fiona.Env():\n", | |
"        with fiona.open(filename, 'w', driver=driver, crs=df.crs,\n", | |
"                        schema=schema, **kwargs) as colxn:\n", | |
"            # small adaptation to original code to split\n", | |
"            # materializing of the iterfeatures generator and fiona's writerecords,\n", | |
"            # so the line profiler reports the two steps separately\n", | |
"            records = list(df.iterfeatures())\n", | |
"            colxn.writerecords(records)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Ensuring this takes more or less the same time:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 46.4 s, sys: 536 ms, total: 46.9 s\n", | |
"Wall time: 47.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"to_file(df, \"test_geopackage2.gpkg\", driver='GPKG')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Line by line profile**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext line_profiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Timer unit: 1e-06 s\n", | |
"\n", | |
"Total time: 85.1127 s\n", | |
"File: <ipython-input-10-2252d5f38157>\n", | |
"Function: to_file at line 3\n", | |
"\n", | |
"Line # Hits Time Per Hit % Time Line Contents\n", | |
"==============================================================\n", | |
" 3 def to_file(df, filename, driver=\"ESRI Shapefile\", schema=None, **kwargs):\n", | |
" 4 1 5.0 5.0 0.0 if schema is None:\n", | |
" 5 1 921712.0 921712.0 1.1 schema = geopandas.io.file.infer_schema(df)\n", | |
" 6 1 57.0 57.0 0.0 filename = os.path.abspath(os.path.expanduser(filename))\n", | |
" 7 1 571.0 571.0 0.0 with fiona.Env():\n", | |
" 8 1 3.0 3.0 0.0 with fiona.open(filename, 'w', driver=driver, crs=df.crs,\n", | |
" 9 1 160246.0 160246.0 0.2 schema=schema, **kwargs) as colxn:\n", | |
" 10 # small adaptation to original code to split\n", | |
" 11 # materializing of the iterfeatures generator and fiona's writerecords\n", | |
" 12 1 67786902.0 67786902.0 79.6 records = list(df.iterfeatures())\n", | |
" 13 1 16243219.0 16243219.0 19.1 colxn.writerecords(records)" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"prof = %lprun -r -f to_file to_file(df, \"test_geopackage_profile1.gpkg\", driver='GPKG')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Timer unit: 1e-06 s\n", | |
"\n", | |
"Total time: 85.1127 s\n", | |
"File: <ipython-input-10-2252d5f38157>\n", | |
"Function: to_file at line 3\n", | |
"\n", | |
"Line # Hits Time Per Hit % Time Line Contents\n", | |
"==============================================================\n", | |
" 3 def to_file(df, filename, driver=\"ESRI Shapefile\", schema=None, **kwargs):\n", | |
" 4 1 5.0 5.0 0.0 if schema is None:\n", | |
" 5 1 921712.0 921712.0 1.1 schema = geopandas.io.file.infer_schema(df)\n", | |
" 6 1 57.0 57.0 0.0 filename = os.path.abspath(os.path.expanduser(filename))\n", | |
" 7 1 571.0 571.0 0.0 with fiona.Env():\n", | |
" 8 1 3.0 3.0 0.0 with fiona.open(filename, 'w', driver=driver, crs=df.crs,\n", | |
" 9 1 160246.0 160246.0 0.2 schema=schema, **kwargs) as colxn:\n", | |
" 10 # small adaptation to original code to split\n", | |
" 11 # materializing of the iterfeatures generator and fiona's writerecords\n", | |
" 12 1 67786902.0 67786902.0 79.6 records = list(df.iterfeatures())\n", | |
" 13 1 16243219.0 16243219.0 19.1 colxn.writerecords(records)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"prof.print_stats()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The profiling also introduces some overhead and possible bias, so timing the costly steps separately (total time was 47 - 52 s):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"32.5 s ± 2.19 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"list(df.iterfeatures())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"records = list(df.iterfeatures())\n", | |
"schema = geopandas.io.file.infer_schema(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"11.4 s ± 284 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"# Time only Fiona's write step, using the precomputed `records` and `schema`.\n", | |
"with fiona.Env():\n", | |
"    with fiona.open(\"test_geopackage_profile2.gpkg\", 'w', driver=\"GPKG\", crs=df.crs, schema=schema) as colxn:\n", | |
"        colxn.writerecords(records)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Is this specific to GeoPackage?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"8.18 s ± 180 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"# Same write benchmark with the GeoJSON driver, to compare against GPKG.\n", | |
"with fiona.Env():\n", | |
"    with fiona.open(\"test_profile_geojson.geojson\", 'w', driver=\"GeoJSON\", crs=df.crs, schema=schema) as colxn:\n", | |
"        colxn.writerecords(records)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Observations:**\n", | |
"\n", | |
"- ca 70 - 80 % of the time is coming from converting the GeoDataFrame to a features dict (`iterfeatures`), 20 - 25% of the time is spent actually writing it with Fiona\n", | |
"- The performance of `GeoDataFrame.iterfeatures` should certainly be improved, and this should already lead to a big improvement overall.\n", | |
"- Writing the records with Fiona itself still takes ca 10s. So even if the time of `iterfeatures` is reduced drastically, writing this file will take > 10s. Are there ways to improve this?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Timings with older GeoPandas / Fiona versions" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"With GeoPandas 0.3 and Fiona 1.7:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2min 49s, sys: 50.8 s, total: 3min 40s\n", | |
"Wall time: 18min 31s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.to_file(\"test_geopackage_geopandas03_fiona17_gdal22.gpkg\", driver='GPKG')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"With GeoPandas 0.4 and Fiona 1.7:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 20s, sys: 11.7 s, total: 1min 32s\n", | |
"Wall time: 1min 32s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.to_file(\"test_geopackage_geopandas04_fiona17_gdal22.gpkg\", driver='GPKG')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"With GeoPandas 0.4 and Fiona 1.8:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/joris/scipy/geopandas/geopandas/io/file.py:108: FionaDeprecationWarning: Use fiona.Env() instead.\n", | |
" with fiona.drivers():\n", | |
"CPLE_NotSupported in driver GPKG does not support creation option ENCODING\n", | |
"CPLE_NotSupported in dataset /home/joris/scipy/test_geopackage_geopandas04_fiona18_gdal23.gpkg does not support layer creation option ENCODING\n", | |
"CPLE_NotSupported in dataset /home/joris/scipy/test_geopackage_geopandas04_fiona18_gdal23.gpkg does not support layer creation option ENCODING\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 54.1 s, sys: 260 ms, total: 54.4 s\n", | |
"Wall time: 55 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.to_file(\"test_geopackage_geopandas04_fiona18_gdal23.gpkg\", driver='GPKG')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Observations:**\n", | |
"\n", | |
"- With the previous versions of both packages (GeoPandas 0.3 and Fiona 1.7), it was *really* slow\n", | |
"- Most improvement came from writing multiple records at once with `fiona.Collection.writerecords` in GeoPandas (https://github.com/geopandas/geopandas/pull/605)\n", | |
"- Some additional improvement from increasing the default transaction size in Fiona (https://github.com/Toblerity/Fiona/issues/476, https://github.com/Toblerity/Fiona/pull/491)\n", | |
"\n", | |
"But, even with the latest version of both, it still takes ca 50 seconds, while QGIS only takes a couple of seconds." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "fiona18", | |
"language": "python", | |
"name": "fiona18" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment