Created
February 14, 2018 16:09
-
-
Save sfsheath/590cf73056d75fb4996ed95ea4e255fb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Introduction" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Getting started with joining tables in pandas." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Routine setup, of no particular interest beyond the fact that the data comes from multiple sources." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import gzip # can uncompress gzipped files, useful for accessing the pleiades data\n", | |
"import io # useful routines for input/output\n", | |
"import pandas as pd # for working with \"rows/columns\" oriented data\n", | |
"import urllib.request # for loading documents using http\n", | |
"\n", | |
"%matplotlib inline\n", | |
"\n", | |
"import matplotlib # plotting\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# load csv files\n", | |
"ramphs = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/roman-amphitheaters.csv\")\n", | |
"chronogrps = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/chronogrps.csv\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load the gzipped Pleiades dump. The key is \"io.BytesIO(gzip.decompress(response.read()))\", which\n", | |
"# delivers (so to speak) an uncompressed csv to 'pd.read_csv'.\n", | |
"response = urllib.request.urlopen(\"http://atlantides.org/downloads/pleiades/dumps/pleiades-places-latest.csv.gz\")\n", | |
"pleiades = pd.read_csv(io.BytesIO(gzip.decompress(response.read())))\n", | |
"\n", | |
"# One note: because it takes a long time to load the pleiades data,\n", | |
"# avoid running this cell again unless necessary\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# metmuseum = pd.read_csv(\"https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv?raw=true\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print(\"Number of pleiades geographic entities loaded: %s\" % len(pleiades))\n", | |
"print(\"Number of roman amphitheater records loaded: %s\" % len(ramphs))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Explore Pleiades" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# simple way to get column names\n", | |
"# list(pleiades)\n", | |
"\n", | |
"# pleiades.dtypes\n", | |
"\n", | |
"# pd.options.display.max_columns = 999" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# unique values\n", | |
"pleiades.featureTypes.unique()\n", | |
"\n", | |
"# sort_values()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# substring searching\n", | |
"pleiades[pleiades.featureTypes.str.contains('amphitheatre')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# renaming columns\n", | |
"# https://stackoverflow.com/questions/19758364/rename-a-single-column-header-in-a-pandas-dataframe\n", | |
"print(\"https://stackoverflow.com/questions/19758364/rename-a-single-column-header-in-a-pandas-dataframe\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Joins" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# pd.merge\n", | |
"\n", | |
"ramphs.merge(chronogrps, left_on = 'chronogroup', right_on = 'chronogroup')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# pd.set_index\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment