Skip to content

Instantly share code, notes, and snippets.

@sfsheath
Last active February 21, 2018 18:48
Show Gist options
  • Save sfsheath/e89e674dda2a065143ffc138c3774a3c to your computer and use it in GitHub Desktop.
Save sfsheath/e89e674dda2a065143ffc138c3774a3c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# gist -u https://gist.github.com/e89e674dda2a065143ffc138c3774a3c"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import gzip # can uncompress gzipped files, useful for accessing the pleiades data\n",
"import io # useful routines for input/output\n",
"import pandas as pd # for working with \"rows/columns\" oriented data\n",
"import urllib.request # for loading documents using http\n",
"\n",
"%matplotlib inline\n",
"\n",
"import matplotlib # plotting\n",
"import matplotlib.pyplot as plt\n",
"\n",
"pd.options.display.max_columns = 999"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# load csv files\n",
"ramphs = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/roman-amphitheaters.csv\")\n",
"chronogrps = pd.read_csv(\"http://sebastianheath.com/roman-amphitheaters/chronogrps.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the gzipped Pleiades places dump. The key expression is\n",
"# \"io.BytesIO(gzip.decompress(response.read()))\", which delivers (so to speak)\n",
"# an uncompressed csv to 'pd.read_csv'.\n",
"# The 'with' block closes the HTTP response once its bytes have been consumed,\n",
"# even if decompression or parsing raises.\n",
"with urllib.request.urlopen(\"http://atlantides.org/downloads/pleiades/dumps/pleiades-places-latest.csv.gz\") as response:\n",
"    pleiades = pd.read_csv(io.BytesIO(gzip.decompress(response.read())))\n",
"\n",
"# One note: because it takes a long time to load the pleiades data,\n",
"# avoid running this cell again unless necessary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html\n",
"# Join amphitheaters to their chronological groups, derive the Pleiades 'path'\n",
"# from the 'pleiades' URL column, then join the Pleiades place records on 'path'.\n",
"# 'path' is computed with a callable so it is taken from the merged frame itself;\n",
"# passing ramphs.pleiades directly would be aligned on ramphs' index, which only\n",
"# matches the merged rows if the first merge keeps them one-to-one and in order.\n",
"combined = ramphs.merge(chronogrps) \\\n",
" .assign(path = lambda df: df.pleiades.str.replace \\\n",
" (\"https://pleiades.stoa.org\",\"\")) \\\n",
" .merge(pleiades, on = 'path', suffixes = ('','_pleiades'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"combined[['id','startdate','minDate','title_pleiades']].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Pleiades places whose featureTypes mention \"amphitheatre\".\n",
"# na = False treats rows with a missing featureTypes value as non-matches;\n",
"# otherwise the mask would contain NaN and could not be used for boolean indexing.\n",
"pamphs = pleiades[pleiades.featureTypes.str.contains(\"amphitheatre\", na = False)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"len(pamphs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"len(ramphs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ramphs['path'] = ramphs.pleiades.str.replace(\"https://pleiades.stoa.org\",\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pamphs.merge(ramphs, on = 'path', how = 'left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.duplicated.html\n",
"# Show every row whose 'path' occurs more than once after the left merge\n",
"# (keep = False flags all members of a duplicate group, not just the repeats).\n",
"# The callable given to .loc receives the merged frame, so the merge runs only once.\n",
"pamphs.merge(ramphs, on = 'path', how = 'left') \\\n",
" .loc[lambda df: df.path.duplicated(keep = False)]\n",
"\n",
"# [['id_y','featureTypes','minDate','maxDate','chronogroup']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sites = pd.DataFrame({'id':['s01','s02','s03'],\n",
" 'type':['settlement','cave','temple']})\n",
"\n",
"\n",
"periods = pd.DataFrame({'id':['early','middle','late','classic'],\n",
" 'startdate':[100,250,725,350],\n",
" 'enddate':[250,725,850,550]})\n",
"\n",
"s_to_p = pd.DataFrame({'s_id':['s01','s01','s02','s03'],\n",
" 'p_id':['early','middle','late','classic']})\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# let's work on these steps\n",
"# 1) list the site types associated with periods \n",
"# hint: use s_to_p as left hand df (& keep it simple for now)\n",
"# 2) with that dataframe join start & end dates for the periods\n",
"# 3) for every settlement type, find max and min start dates\n",
"\n",
"\n",
"#['startdate']['temple']['min']"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment