Skip to content

Instantly share code, notes, and snippets.

@JamesGardiner
Created April 18, 2016 22:03
Show Gist options
  • Save JamesGardiner/0c2f5c884453664ec672cdd557b84fc3 to your computer and use it in GitHub Desktop.
Save JamesGardiner/0c2f5c884453664ec672cdd557b84fc3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Reading databases in Pandas"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Here is a simple representation of how to read into a dataframe a (large) PostgreSQL database."
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "import json\nimport pandas as pd\nimport psycopg2",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Read in configuration options\n# Assumes a config.json file in the same\n# directory with minimum of host, database,\n# user, passw, schema and table key: value pairs\nwith open('config.json') as f:\n conf = json.load(f)",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Connection string with required parameters\nconn_str = \"host={} dbname={} user={} password={}\".format(conf['host'],\n conf['database'],\n conf['user'],\n conf['passw']\n )",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Get a connection object\nconn = psycopg2.connect(conn_str)",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# This might fail if the DB is large and exceeds your machine's memory limitations\n# when being read in\ndf = pd.read_sql('select * from {}.{}'.format(conf['schema'],\n conf['table']),\n con=conn)",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# A better approach when dealing with large databases\n# or a machine with low memory is to create a generator\n# that iterates over a table x rows at a time\ndf = pd.DataFrame()\n\n# Iterating is far better for large databases\nfor chunk in pd.read_sql('select * from {}.{}'.format(conf['schema'],\n conf['table']),\n con=conn,\n chunksize=5000):\n df = df.append(chunk)",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# check it works\ndf.head()",
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>pkey</th>\n <th>created</th>\n <th>first_name</th>\n <th>href</th>\n <th>id</th>\n <th>links</th>\n <th>surname</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>47024</td>\n <td>2016-03-31 19:16:25</td>\n <td>Candice Coker</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/181FC...</td>\n <td>181FC03A-FB8E-4C3D-8952-1D0DA0902AB3</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Morey</td>\n </tr>\n <tr>\n <th>1</th>\n <td>47025</td>\n <td>2016-03-31 19:16:25</td>\n <td>Jaroslaw</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/1FAD8...</td>\n <td>1FAD8549-7B84-4B12-9D6C-F6F2B9A85481</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Nowak</td>\n </tr>\n <tr>\n <th>2</th>\n <td>47026</td>\n <td>2016-03-31 19:16:25</td>\n <td>Derrick</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/20FEB...</td>\n <td>20FEBA57-1959-40ED-927C-AC919542D3C9</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Bennett</td>\n </tr>\n <tr>\n <th>3</th>\n <td>47027</td>\n <td>2016-03-31 19:16:25</td>\n <td>Ross</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/14AAC...</td>\n <td>14AACF47-BEE8-41E9-91A0-ADBE5B528AAA</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Forgan</td>\n </tr>\n <tr>\n <th>4</th>\n <td>47028</td>\n <td>2016-03-31 19:16:25</td>\n <td>Roxanne</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/159AD...</td>\n <td>159ADF8A-1013-4C2E-BB96-93CDA266289C</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Guenette</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " pkey created first_name \\\n0 47024 2016-03-31 19:16:25 Candice Coker \n1 47025 2016-03-31 19:16:25 Jaroslaw \n2 47026 2016-03-31 19:16:25 Derrick \n3 47027 2016-03-31 19:16:25 Ross \n4 47028 2016-03-31 19:16:25 Roxanne \n\n href \\\n0 http://gtr.rcuk.ac.uk:80/gtr/api/persons/181FC... \n1 http://gtr.rcuk.ac.uk:80/gtr/api/persons/1FAD8... \n2 http://gtr.rcuk.ac.uk:80/gtr/api/persons/20FEB... \n3 http://gtr.rcuk.ac.uk:80/gtr/api/persons/14AAC... \n4 http://gtr.rcuk.ac.uk:80/gtr/api/persons/159AD... \n\n id \\\n0 181FC03A-FB8E-4C3D-8952-1D0DA0902AB3 \n1 1FAD8549-7B84-4B12-9D6C-F6F2B9A85481 \n2 20FEBA57-1959-40ED-927C-AC919542D3C9 \n3 14AACF47-BEE8-41E9-91A0-ADBE5B528AAA \n4 159ADF8A-1013-4C2E-BB96-93CDA266289C \n\n links surname \n0 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Morey \n1 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Nowak \n2 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Bennett \n3 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Forgan \n4 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Guenette "
},
"metadata": {},
"execution_count": 6
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "df.count()",
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "pkey 51618\ncreated 51618\nfirst_name 51618\nhref 51618\nid 51618\nlinks 51618\nsurname 51618\ndtype: int64"
},
"metadata": {},
"execution_count": 8
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"mimetype": "text/x-python",
"nbconvert_exporter": "python",
"name": "python",
"file_extension": ".py",
"version": "3.5.1",
"pygments_lexer": "ipython3",
"codemirror_mode": {
"version": 3,
"name": "ipython"
}
},
"gist": {
"id": "",
"data": {
"description": "read_sql_pandas.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment