sfsheath/week04.ipynb

## week04.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Getting going with joining tables in Pandas."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Routine setup. The only point of interest is that the data comes from multiple sources."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import gzip            # can uncompress gzipped files, useful for accessing the pleiades data\n",
    "import io              # useful routines for input/output\n",
    "import pandas as pd    # for working with \"rows/columns\" oriented data\n",
    "import urllib.request  # for loading documents using http\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import matplotlib # plotting\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# load csv files\n",
    "ramphs = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/roman-amphitheaters.csv\")\n",
    "chronogrps = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/chronogrps.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the gzipped Pleiades dump. The key is\n",
    "# \"io.BytesIO(gzip.decompress(response.read()))\", which delivers (so to speak)\n",
    "# an uncompressed csv to 'pd.read_csv'. The 'with' block closes the HTTP\n",
    "# connection as soon as the bytes have been read.\n",
    "with urllib.request.urlopen(\"http://atlantides.org/downloads/pleiades/dumps/pleiades-places-latest.csv.gz\") as response:\n",
    "    pleiades = pd.read_csv(io.BytesIO(gzip.decompress(response.read())))\n",
    "\n",
    "# One note: because it takes a long time to load the pleiades data,\n",
    "# avoid running this cell again unless necessary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# metmuseum = pd.read_csv(\"https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv?raw=true\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# report how many rows each source delivered\n",
    "print(\"Number of pleiades geographic entities loaded: %s\" % len(pleiades))\n",
    "print(\"Number of roman amphitheater records loaded: %s\" % len(ramphs))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore Pleiades"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# simple way to get column names\n",
    "# list(pleiades)\n",
    "\n",
    "# pleiades.dtypes\n",
    "\n",
    "# pd.options.display.max_columns = 999"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# unique values\n",
    "pleiades.featureTypes.unique()\n",
    "\n",
    "# sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# substring searching; na=False counts any row with a missing featureTypes\n",
    "# value as a non-match, so the boolean mask never contains NaN\n",
    "pleiades[pleiades.featureTypes.str.contains('amphitheatre', na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# renaming columns\n",
    "# https://stackoverflow.com/questions/19758364/rename-a-single-column-header-in-a-pandas-dataframe\n",
    "print(\"https://stackoverflow.com/questions/19758364/rename-a-single-column-header-in-a-pandas-dataframe\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Joins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# pd.merge: join the amphitheaters to their chronological groups on the\n",
    "# 'chronogroup' column, which is the key on both sides\n",
    "pd.merge(ramphs, chronogrps, on='chronogroup')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# pd.set_index\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Introduction"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Getting going with joining tables in Pandas."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Load data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Routine setup. The only point of interest is that the data comes from multiple sources."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import gzip # can uncompress gzipped files, useful for accessing the pleiades data\n",
	"import io # useful routines for input/output\n",
	"import pandas as pd # for working with \"rows/columns\" oriented data\n",
	"import urllib.request # for loading documents using http\n",
	"\n",
	"%matplotlib inline\n",
	"\n",
	"import matplotlib # plotting\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# load csv files\n",
	"ramphs = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/roman-amphitheaters.csv\")\n",
	"chronogrps = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/chronogrps.csv\")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Load the gzipped Pleiades dump. The key is\n",
	"# \"io.BytesIO(gzip.decompress(response.read()))\", which delivers (so to speak)\n",
	"# an uncompressed csv to 'pd.read_csv'. The 'with' block closes the HTTP\n",
	"# connection as soon as the bytes have been read.\n",
	"with urllib.request.urlopen(\"http://atlantides.org/downloads/pleiades/dumps/pleiades-places-latest.csv.gz\") as response:\n",
	"    pleiades = pd.read_csv(io.BytesIO(gzip.decompress(response.read())))\n",
	"\n",
	"# One note: because it takes a long time to load the pleiades data,\n",
	"# avoid running this cell again unless necessary\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# metmuseum = pd.read_csv(\"https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv?raw=true\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# report how many rows each source delivered\n",
	"print(\"Number of pleiades geographic entities loaded: %s\" % len(pleiades))\n",
	"print(\"Number of roman amphitheater records loaded: %s\" % len(ramphs))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Explore Pleiades"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# simple way to get column names\n",
	"# list(pleiades)\n",
	"\n",
	"# pleiades.dtypes\n",
	"\n",
	"# pd.options.display.max_columns = 999"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# unique values\n",
	"pleiades.featureTypes.unique()\n",
	"\n",
	"# sort_values()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# substring searching; na=False counts any row with a missing featureTypes\n",
	"# value as a non-match, so the boolean mask never contains NaN\n",
	"pleiades[pleiades.featureTypes.str.contains('amphitheatre', na=False)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# renaming columns\n",
	"# https://stackoverflow.com/questions/19758364/rename-a-single-column-header-in-a-pandas-dataframe\n",
	"print(\"https://stackoverflow.com/questions/19758364/rename-a-single-column-header-in-a-pandas-dataframe\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Joins"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# pd.merge: join the amphitheaters to their chronological groups on the\n",
	"# 'chronogroup' column, which is the key on both sides\n",
	"pd.merge(ramphs, chronogrps, on='chronogroup')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# pd.set_index\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}