Skip to content

Instantly share code, notes, and snippets.

@bollwyvl
Last active September 27, 2017 00:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bollwyvl/4e2d9d601c4beb06f61252fc0617880e to your computer and use it in GitHub Desktop.
Save bollwyvl/4e2d9d601c4beb06f61252fc0617880e to your computer and use it in GitHub Desktop.
name: proximity2nature
channels:
- defaults
- conda-forge
dependencies:
- pandas
- lxml
- requests
- pycurl
- jupyterlab
- beautifulsoup4
- pyquery
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parsing scraped data\n",
"\n",
"Parses the HTML pages scraped by [Scraping.ipynb](Scraping.ipynb) (read from `data/`) into a table of events."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from requests import get\n",
"from lxml import etree\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Street address for each venue name used in the location column of the event tables.\n",
"locations = {\n",
"    \"Atlanta\": \"1345 Piedmont Avenue, Atlanta, GA 30309\",\n",
"    \"Gainesville\": \"1911 Sweetbay Drive, Gainesville, GA 30501\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# One HTML parser instance, reused for every page parsed below.\n",
"parser = etree.HTMLParser()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Sorted so pages are processed in a stable order (rglob order depends on the filesystem).\n",
"html_files = sorted(Path(\"data\").rglob(\"*.html\"))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>End Time</th>\n",
" <th>Location</th>\n",
" <th>Start Date</th>\n",
" <th>Start Time</th>\n",
" <th>URL of event</th>\n",
" <th>organizer</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>9:00 pm</td>\n",
" <td>1345 Piedmont Avenue, Atlanta, GA 30309</td>\n",
" <td>2017-10-24</td>\n",
" <td>6:00 pm</td>\n",
" <td>http://atlantabg.org/learn/adult-classes/botan...</td>\n",
" <td>Atlanta Botanical Gardens</td>\n",
" <td>Fine Pen for Botanical Illustration</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>12:30 pm</td>\n",
" <td>1345 Piedmont Avenue, Atlanta, GA 30309</td>\n",
" <td>2017-10-10</td>\n",
" <td>10:30 am</td>\n",
" <td>http://atlantabg.org/learn/kids-programs/garde...</td>\n",
" <td>Atlanta Botanical Gardens</td>\n",
" <td>Garden Playtime</td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>2:30 pm</td>\n",
" <td>1345 Piedmont Avenue, Atlanta, GA 30309</td>\n",
" <td>2017-10-21</td>\n",
" <td>2:00 pm</td>\n",
" <td>http://atlantabg.org/visit/events/garden-chef-...</td>\n",
" <td>Atlanta Botanical Gardens</td>\n",
" <td>Garden Chef Demos</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" End Time Location Start Date Start Time \\\n",
"150 9:00 pm 1345 Piedmont Avenue, Atlanta, GA 30309 2017-10-24 6:00 pm \n",
"69 12:30 pm 1345 Piedmont Avenue, Atlanta, GA 30309 2017-10-10 10:30 am \n",
"134 2:30 pm 1345 Piedmont Avenue, Atlanta, GA 30309 2017-10-21 2:00 pm \n",
"\n",
" URL of event \\\n",
"150 http://atlantabg.org/learn/adult-classes/botan... \n",
"69 http://atlantabg.org/learn/kids-programs/garde... \n",
"134 http://atlantabg.org/visit/events/garden-chef-... \n",
"\n",
" organizer title \n",
"150 Atlanta Botanical Gardens Fine Pen for Botanical Illustration \n",
"69 Atlanta Botanical Gardens Garden Playtime \n",
"134 Atlanta Botanical Gardens Garden Chef Demos "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events = []\n",
"skipped_files = []  # pages that contain no events table\n",
"skipped_rows = 0  # table rows that could not be turned into an event\n",
"for file_path in sorted(html_files):\n",
"    # The part of the file name before the first \".\" is used as the event date.\n",
"    date = file_path.name.split(\".\")[0]\n",
"    html = etree.fromstring(\n",
"        file_path.read_text(),\n",
"        parser=parser\n",
"    ).getroottree()\n",
"    tables = html.xpath(\"//table[@id='events-data-table']\")\n",
"    if not tables:\n",
"        # Skip this page instead of aborting the whole run with an IndexError.\n",
"        skipped_files.append(file_path)\n",
"        continue\n",
"    for tr in tables[0].xpath(\"tbody/tr\"):\n",
"        try:\n",
"            a = tr.xpath(\"td/a\")[0]\n",
"            location = tr.xpath(\"td[3]\")[0].text\n",
"            # Split \"start-end\" and collapse the whitespace/newlines around each\n",
"            # part, independent of how the cell text is indented in the page.\n",
"            raw_time = [\n",
"                \" \".join(part.split())\n",
"                for part in tr.xpath(\"td[2]\")[0].text.split(\"-\")\n",
"            ]\n",
"            event = {\n",
"                \"title\": a.text,\n",
"                \"URL of event\": a.attrib['href'],\n",
"                \"Start Date\": date,\n",
"                \"Location\": locations[location],\n",
"                \"organizer\": \"Atlanta Botanical Gardens\"\n",
"            }\n",
"        except (IndexError, KeyError, AttributeError):\n",
"            # Best effort: a row without a link, a time cell, or a known location\n",
"            # is skipped, but counted so the loss is visible in the summary.\n",
"            skipped_rows += 1\n",
"            continue\n",
"\n",
"        # Only rows with exactly one \"-\" separator carry a start and an end time.\n",
"        if len(raw_time) == 2:\n",
"            event.update({\n",
"                \"Start Time\": raw_time[0],\n",
"                \"End Time\": raw_time[1]\n",
"            })\n",
"\n",
"        events.append(event)\n",
"\n",
"print(f\"{len(events)} events; skipped {skipped_rows} rows and {len(skipped_files)} files\")\n",
"df = pd.DataFrame(events).fillna(\"\")\n",
"# Cap the sample size so the preview also works with fewer than three events.\n",
"df.sample(min(3, len(df)))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"82412"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pin the encoding so the export does not depend on the platform's locale default.\n",
"Path(\"events.html\").write_text(df.to_html(), encoding=\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment