martinfleis/tobler_matrix.ipynb

## tobler_matrix.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import geopandas\n",
    "import pandas\n",
    "import numpy\n",
    "\n",
    "from tobler.area_weighted import area_interpolate, _area_tables_binning\n",
    "from libpysal.examples import load_example\n",
    "\n",
    "from geopandas.testing import assert_geodataframe_equal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Example not available: Charleston1\n",
      "Example not downloaded: Chicago parcels\n",
      "Example not downloaded: Chile Migration\n",
      "Example not downloaded: Spirals\n"
     ]
    }
   ],
   "source": [
    "c1 = load_example('Charleston1')\n",
    "c2 = load_example('Charleston2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "crs = 6569\n",
    "tracts = geopandas.read_file(c1.get_path('sc_final_census2.shp')).to_crs(crs)\n",
    "zip_codes = geopandas.read_file(c2.get_path('CharlestonMSA2.shp')).to_crs(crs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create larger dfs for a better benchmark\n",
    "tracts = pandas.concat([tracts] * 20)\n",
    "zip_codes = pandas.concat([zip_codes] * 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tracts['pct_poverty'] = tracts.POV_POP/tracts.POV_TOT\n",
    "tracts['cat'] = numpy.random.randint(0, 10, len(tracts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create DOK matrix\n",
    "table = _area_tables_binning(tracts, zip_codes, \"auto\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<2340x840 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 136800 stored elements in Dictionary Of Keys format>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table"
   ]
  },
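  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DOK matrix\n",
    "\n",
    "Time `area_interpolate` with the table exactly as returned by `_area_tables_binning`, i.e. a sparse matrix in Dictionary Of Keys (DOK) format."
   ]
  },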
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "679 ms ± 55.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "results = area_interpolate(\n",
    "    source_df=tracts, \n",
    "    target_df=zip_codes, \n",
    "    intensive_variables=['pct_poverty'], \n",
    "    extensive_variables=['EMP_MALE'],\n",
    "    categorical_variables=['cat'],\n",
    "    table=table\n",
    ")"
   ]
  },
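  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CSR matrix\n",
    "\n",
    "Convert the DOK table to Compressed Sparse Row (CSR) format with `tocsr()` and repeat the same timing.\n",
    "\n",
    "Note: assignments made inside a `%%timeit` cell are not kept in the notebook namespace, so `results` and `results_csr` have to be computed in a separate, un-timed call before the `assert_geodataframe_equal` check below can run."
   ]
  },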
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "csr = table.tocsr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14.1 ms ± 182 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "results_csr = area_interpolate(\n",
    "    source_df=tracts, \n",
    "    target_df=zip_codes, \n",
    "    intensive_variables=['pct_poverty'], \n",
    "    extensive_variables=['EMP_MALE'],\n",
    "    categorical_variables=['cat'],\n",
    "    table=csr\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert_geodataframe_equal(results, results_csr)"
   ]
  },
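  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CSC matrix\n",
    "\n",
    "Convert the DOK table to Compressed Sparse Column (CSC) format with `tocsc()` and repeat the same timing."
   ]
  },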
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "csc = table.tocsc()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15.9 ms ± 74.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "results_csc = area_interpolate(\n",
    "    source_df=tracts, \n",
    "    target_df=zip_codes, \n",
    "    intensive_variables=['pct_poverty'], \n",
    "    extensive_variables=['EMP_MALE'],\n",
    "    categorical_variables=['cat'],\n",
    "    table=csc\n",
    ")"
   ]
  },
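  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LIL matrix\n",
    "\n",
    "Convert the DOK table to List of Lists (LIL) format with `tolil()` and repeat the same timing."
   ]
  },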
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "lil = table.tolil()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "35.3 ms ± 556 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "results_lil = area_interpolate(\n",
    "    source_df=tracts, \n",
    "    target_df=zip_codes, \n",
    "    intensive_variables=['pct_poverty'], \n",
    "    extensive_variables=['EMP_MALE'],\n",
    "    categorical_variables=['cat'],\n",
    "    table=lil\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "8d1b2c984ad473d756980598d6fae8279815dc9c89a9d51a262cfb04eba7ee8f"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('geo_dev')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [],
	"source": [
	"import geopandas\n",
	"import pandas\n",
	"import numpy\n",
	"\n",
	"from tobler.area_weighted import area_interpolate, _area_tables_binning\n",
	"from libpysal.examples import load_example\n",
	"\n",
	"from geopandas.testing import assert_geodataframe_equal"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Example not available: Charleston1\n",
	"Example not downloaded: Chicago parcels\n",
	"Example not downloaded: Chile Migration\n",
	"Example not downloaded: Spirals\n"
	]
	}
	],
	"source": [
	"c1 = load_example('Charleston1')\n",
	"c2 = load_example('Charleston2')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"crs = 6569\n",
	"tracts = geopandas.read_file(c1.get_path('sc_final_census2.shp')).to_crs(crs)\n",
	"zip_codes = geopandas.read_file(c2.get_path('CharlestonMSA2.shp')).to_crs(crs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# create larger dfs for a better benchmark\n",
	"tracts = pandas.concat([tracts] * 20)\n",
	"zip_codes = pandas.concat([zip_codes] * 20)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"tracts['pct_poverty'] = tracts.POV_POP/tracts.POV_TOT\n",
	"tracts['cat'] = numpy.random.randint(0, 10, len(tracts))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# create DOK matrix\n",
	"table = _area_tables_binning(tracts, zip_codes, \"auto\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<2340x840 sparse matrix of type '<class 'numpy.float32'>'\n",
	"\twith 136800 stored elements in Dictionary Of Keys format>"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"table"
	]
	},
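	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## DOK matrix\n",
	"\n",
	"Time `area_interpolate` with the table exactly as returned by `_area_tables_binning`, i.e. a sparse matrix in Dictionary Of Keys (DOK) format."
	]
	},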
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"679 ms ± 55.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"results = area_interpolate(\n",
	" source_df=tracts, \n",
	" target_df=zip_codes, \n",
	" intensive_variables=['pct_poverty'], \n",
	" extensive_variables=['EMP_MALE'],\n",
	" categorical_variables=['cat'],\n",
	" table=table\n",
	")"
	]
	},
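	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## CSR matrix\n",
	"\n",
	"Convert the DOK table to Compressed Sparse Row (CSR) format with `tocsr()` and repeat the same timing.\n",
	"\n",
	"Note: assignments made inside a `%%timeit` cell are not kept in the notebook namespace, so `results` and `results_csr` have to be computed in a separate, un-timed call before the `assert_geodataframe_equal` check below can run."
	]
	},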
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"csr = table.tocsr()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"14.1 ms ± 182 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"results_csr = area_interpolate(\n",
	" source_df=tracts, \n",
	" target_df=zip_codes, \n",
	" intensive_variables=['pct_poverty'], \n",
	" extensive_variables=['EMP_MALE'],\n",
	" categorical_variables=['cat'],\n",
	" table=csr\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"assert_geodataframe_equal(results, results_csr)"
	]
	},
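	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## CSC matrix\n",
	"\n",
	"Convert the DOK table to Compressed Sparse Column (CSC) format with `tocsc()` and repeat the same timing."
	]
	},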
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"csc = table.tocsc()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"15.9 ms ± 74.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"results_csc = area_interpolate(\n",
	" source_df=tracts, \n",
	" target_df=zip_codes, \n",
	" intensive_variables=['pct_poverty'], \n",
	" extensive_variables=['EMP_MALE'],\n",
	" categorical_variables=['cat'],\n",
	" table=csc\n",
	")"
	]
	},
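	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## LIL matrix\n",
	"\n",
	"Convert the DOK table to List of Lists (LIL) format with `tolil()` and repeat the same timing."
	]
	},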
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"lil = table.tolil()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"35.3 ms ± 556 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"results_lil = area_interpolate(\n",
	" source_df=tracts, \n",
	" target_df=zip_codes, \n",
	" intensive_variables=['pct_poverty'], \n",
	" extensive_variables=['EMP_MALE'],\n",
	" categorical_variables=['cat'],\n",
	" table=lil\n",
	")"
	]
	}
	],
	"metadata": {
	"interpreter": {
	"hash": "8d1b2c984ad473d756980598d6fae8279815dc9c89a9d51a262cfb04eba7ee8f"
	},
	"kernelspec": {
	"display_name": "Python 3.9.7 ('geo_dev')",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.7"
	},
	"orig_nbformat": 4
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}