sfsheath/mdv2018-week05.ipynb

## mdv2018-week05.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# gist -u https://gist.github.com/e89e674dda2a065143ffc138c3774a3c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gzip            # can uncompress gzipped files, useful for accessing the pleiades data\n",
    "import io              # useful routines for input/output\n",
    "import pandas as pd    # for working with \"rows/columns\" oriented data\n",
    "import urllib.request  # for loading documents using http\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import matplotlib # plotting\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "pd.options.display.max_columns = 999"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the two csv files over HTTP into DataFrames.\n",
    "ramphs = pd.read_csv(\n",
    "    \"http://sebastianheath.com/roman-amphitheaters/roman-amphitheaters.csv\"\n",
    ")\n",
    "chronogrps = pd.read_csv(\n",
    "    \"http://sebastianheath.com/roman-amphitheaters/chronogrps.csv\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the gzipped PLEIADES dump. The key expression is\n",
    "# \"io.BytesIO(gzip.decompress(response.read()))\", which delivers (so to speak)\n",
    "# an uncompressed, in-memory csv to 'pd.read_csv'.\n",
    "#\n",
    "# The 'with' block closes the HTTP response once the bytes have been read,\n",
    "# even if downloading, decompressing or parsing raises.\n",
    "with urllib.request.urlopen(\n",
    "    \"http://atlantides.org/downloads/pleiades/dumps/pleiades-places-latest.csv.gz\"\n",
    ") as response:\n",
    "    pleiades = pd.read_csv(io.BytesIO(gzip.decompress(response.read())))\n",
    "\n",
    "# One note: because it takes a long time to load the pleiades data,\n",
    "# avoid running this cell again unless necessary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html\n",
    "#\n",
    "# Chain: ramphs merged with chronogrps, then a 'path' column, then a merge with pleiades.\n",
    "# 'path' is computed by a callable so it is derived from the rows of the merged frame\n",
    "# itself. Passing 'ramphs.pleiades...' directly would align on the index labels of\n",
    "# 'ramphs', which need not match the fresh index that merge() gives its result, so\n",
    "# paths could be attached to the wrong rows.\n",
    "combined = (\n",
    "    ramphs.merge(chronogrps)\n",
    "    .assign(path=lambda df: df.pleiades.str.replace(\"https://pleiades.stoa.org\", \"\"))\n",
    "    .merge(pleiades, on='path', suffixes=('', '_pleiades'))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Show the first rows of a handful of columns from the combined table.\n",
    "combined.loc[:, ['id', 'startdate', 'minDate', 'title_pleiades']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Keep only the Pleiades rows whose featureTypes text contains \"amphitheatre\".\n",
    "# na=False counts rows with a missing featureTypes value as non-matches; without it\n",
    "# the mask would contain NaN and the boolean indexing would raise.\n",
    "pamphs = pleiades[pleiades.featureTypes.str.contains(\"amphitheatre\", na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Number of rows in pamphs.\n",
    "pamphs.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Number of rows in ramphs, for comparison with pamphs.\n",
    "ramphs.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Add a 'path' column to ramphs: the 'pleiades' URL with the site prefix removed,\n",
    "# so ramphs can be merged with other frames on 'path'.\n",
    "ramphs['path'] = ramphs['pleiades'].str.replace(\"https://pleiades.stoa.org\", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Left merge: every row of pamphs is kept, with ramphs columns attached where 'path' matches.\n",
    "pd.merge(pamphs, ramphs, how='left', on='path')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.duplicated.html\n",
    "#\n",
    "# Do the left merge once and reuse it, rather than recomputing it to build the mask.\n",
    "pamphs_ramphs = pamphs.merge(ramphs, on='path', how='left')\n",
    "\n",
    "# keep=False marks every row whose 'path' occurs more than once in the merged frame.\n",
    "pamphs_ramphs[pamphs_ramphs.path.duplicated(keep=False)]\n",
    "\n",
    "# [['id_y','featureTypes','minDate','maxDate','chronogroup']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Three small hand-made tables.\n",
    "\n",
    "# sites: a site id and its type.\n",
    "sites = pd.DataFrame({\n",
    "    'id': ['s01', 's02', 's03'],\n",
    "    'type': ['settlement', 'cave', 'temple'],\n",
    "})\n",
    "\n",
    "# periods: a period id with its start and end dates.\n",
    "periods = pd.DataFrame({\n",
    "    'id': ['early', 'middle', 'late', 'classic'],\n",
    "    'startdate': [100, 250, 725, 350],\n",
    "    'enddate': [250, 725, 850, 550],\n",
    "})\n",
    "\n",
    "# s_to_p: pairs of site id (s_id) and period id (p_id).\n",
    "s_to_p = pd.DataFrame({\n",
    "    's_id': ['s01', 's01', 's02', 's03'],\n",
    "    'p_id': ['early', 'middle', 'late', 'classic'],\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# let's work on these steps\n",
    "#  1) list the site types associated with periods \n",
    "#     hint: use s_to_p as left hand df (& keep it simple for now)\n",
    "#  2) with that dataframe join start & end dates for the periods\n",
    "#  3) for every site type, find max and min start dates\n",
    "\n",
    "\n",
    "#['startdate']['temple']['min']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# gist -u https://gist.github.com/e89e674dda2a065143ffc138c3774a3c"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import gzip # can uncompress gzipped files, useful for accessing the pleiades data\n",
	"import io # useful routines for input/output\n",
	"import pandas as pd # for working with \"rows/columns\" oriented data\n",
	"import urllib.request # for loading documents using http\n",
	"\n",
	"%matplotlib inline\n",
	"\n",
	"import matplotlib # plotting\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"pd.options.display.max_columns = 999"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Read the two csv files over HTTP into DataFrames.\n",
	"ramphs = pd.read_csv(\n",
	"    \"http://sebastianheath.com/roman-amphitheaters/roman-amphitheaters.csv\"\n",
	")\n",
	"chronogrps = pd.read_csv(\n",
	"    \"http://sebastianheath.com/roman-amphitheaters/chronogrps.csv\"\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Load the gzipped PLEIADES dump. The key expression is\n",
	"# \"io.BytesIO(gzip.decompress(response.read()))\", which delivers (so to speak)\n",
	"# an uncompressed, in-memory csv to 'pd.read_csv'.\n",
	"#\n",
	"# The 'with' block closes the HTTP response once the bytes have been read,\n",
	"# even if downloading, decompressing or parsing raises.\n",
	"with urllib.request.urlopen(\n",
	"    \"http://atlantides.org/downloads/pleiades/dumps/pleiades-places-latest.csv.gz\"\n",
	") as response:\n",
	"    pleiades = pd.read_csv(io.BytesIO(gzip.decompress(response.read())))\n",
	"\n",
	"# One note: because it takes a long time to load the pleiades data,\n",
	"# avoid running this cell again unless necessary"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html\n",
	"#\n",
	"# Chain: ramphs merged with chronogrps, then a 'path' column, then a merge with pleiades.\n",
	"# 'path' is computed by a callable so it is derived from the rows of the merged frame\n",
	"# itself. Passing 'ramphs.pleiades...' directly would align on the index labels of\n",
	"# 'ramphs', which need not match the fresh index that merge() gives its result, so\n",
	"# paths could be attached to the wrong rows.\n",
	"combined = (\n",
	"    ramphs.merge(chronogrps)\n",
	"    .assign(path=lambda df: df.pleiades.str.replace(\"https://pleiades.stoa.org\", \"\"))\n",
	"    .merge(pleiades, on='path', suffixes=('', '_pleiades'))\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Show the first rows of a handful of columns from the combined table.\n",
	"combined.loc[:, ['id', 'startdate', 'minDate', 'title_pleiades']].head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Keep only the Pleiades rows whose featureTypes text contains \"amphitheatre\".\n",
	"# na=False counts rows with a missing featureTypes value as non-matches; without it\n",
	"# the mask would contain NaN and the boolean indexing would raise.\n",
	"pamphs = pleiades[pleiades.featureTypes.str.contains(\"amphitheatre\", na=False)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Number of rows in pamphs.\n",
	"pamphs.shape[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Number of rows in ramphs, for comparison with pamphs.\n",
	"ramphs.shape[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Add a 'path' column to ramphs: the 'pleiades' URL with the site prefix removed,\n",
	"# so ramphs can be merged with other frames on 'path'.\n",
	"ramphs['path'] = ramphs['pleiades'].str.replace(\"https://pleiades.stoa.org\", \"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Left merge: every row of pamphs is kept, with ramphs columns attached where 'path' matches.\n",
	"pd.merge(pamphs, ramphs, how='left', on='path')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.duplicated.html\n",
	"#\n",
	"# Do the left merge once and reuse it, rather than recomputing it to build the mask.\n",
	"pamphs_ramphs = pamphs.merge(ramphs, on='path', how='left')\n",
	"\n",
	"# keep=False marks every row whose 'path' occurs more than once in the merged frame.\n",
	"pamphs_ramphs[pamphs_ramphs.path.duplicated(keep=False)]\n",
	"\n",
	"# [['id_y','featureTypes','minDate','maxDate','chronogroup']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Three small hand-made tables.\n",
	"\n",
	"# sites: a site id and its type.\n",
	"sites = pd.DataFrame({\n",
	"    'id': ['s01', 's02', 's03'],\n",
	"    'type': ['settlement', 'cave', 'temple'],\n",
	"})\n",
	"\n",
	"# periods: a period id with its start and end dates.\n",
	"periods = pd.DataFrame({\n",
	"    'id': ['early', 'middle', 'late', 'classic'],\n",
	"    'startdate': [100, 250, 725, 350],\n",
	"    'enddate': [250, 725, 850, 550],\n",
	"})\n",
	"\n",
	"# s_to_p: pairs of site id (s_id) and period id (p_id).\n",
	"s_to_p = pd.DataFrame({\n",
	"    's_id': ['s01', 's01', 's02', 's03'],\n",
	"    'p_id': ['early', 'middle', 'late', 'classic'],\n",
	"})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# let's work on these steps\n",
	"# 1) list the site types associated with periods \n",
	"# hint: use s_to_p as left hand df (& keep it simple for now)\n",
	"# 2) with that dataframe join start & end dates for the periods\n",
	"# 3) for every site type, find max and min start dates\n",
	"\n",
	"\n",
	"#['startdate']['temple']['min']"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}