-
-
Save bollwyvl/4e2d9d601c4beb06f61252fc0617880e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: proximity2nature | |
channels: | |
- defaults | |
- conda-forge | |
dependencies: | |
- pandas | |
- lxml | |
- requests | |
- pycurl | |
- jupyterlab | |
- beautifulsoup4 | |
- pyquery |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Parsing scraped data \n", | |
"\n", | |
"from [Scraping.ipynb](Scraping.ipynb)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from requests import get\n", | |
"from lxml import etree\n", | |
"from pathlib import Path" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"locations = {\n", | |
" \"Atlanta\": \"1345 Piedmont Avenue, Atlanta, GA 30309\",\n", | |
" \"Gainesville\": \"1911 Sweetbay Drive, Gainesville, GA 30501\"\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"parser = etree.HTMLParser()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"html_files = list(Path(\"data\").rglob(\"*.html\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>End Time</th>\n", | |
" <th>Location</th>\n", | |
" <th>Start Date</th>\n", | |
" <th>Start Time</th>\n", | |
" <th>URL of event</th>\n", | |
" <th>organizer</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>150</th>\n", | |
" <td>9:00 pm</td>\n", | |
" <td>1345 Piedmont Avenue, Atlanta, GA 30309</td>\n", | |
" <td>2017-10-24</td>\n", | |
" <td>6:00 pm</td>\n", | |
" <td>http://atlantabg.org/learn/adult-classes/botan...</td>\n", | |
" <td>Atlanta Botanical Gardens</td>\n", | |
" <td>Fine Pen for Botanical Illustration</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>69</th>\n", | |
" <td>12:30 pm</td>\n", | |
" <td>1345 Piedmont Avenue, Atlanta, GA 30309</td>\n", | |
" <td>2017-10-10</td>\n", | |
" <td>10:30 am</td>\n", | |
" <td>http://atlantabg.org/learn/kids-programs/garde...</td>\n", | |
" <td>Atlanta Botanical Gardens</td>\n", | |
" <td>Garden Playtime</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>134</th>\n", | |
" <td>2:30 pm</td>\n", | |
" <td>1345 Piedmont Avenue, Atlanta, GA 30309</td>\n", | |
" <td>2017-10-21</td>\n", | |
" <td>2:00 pm</td>\n", | |
" <td>http://atlantabg.org/visit/events/garden-chef-...</td>\n", | |
" <td>Atlanta Botanical Gardens</td>\n", | |
" <td>Garden Chef Demos</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" End Time Location Start Date Start Time \\\n", | |
"150 9:00 pm 1345 Piedmont Avenue, Atlanta, GA 30309 2017-10-24 6:00 pm \n", | |
"69 12:30 pm 1345 Piedmont Avenue, Atlanta, GA 30309 2017-10-10 10:30 am \n", | |
"134 2:30 pm 1345 Piedmont Avenue, Atlanta, GA 30309 2017-10-21 2:00 pm \n", | |
"\n", | |
" URL of event \\\n", | |
"150 http://atlantabg.org/learn/adult-classes/botan... \n", | |
"69 http://atlantabg.org/learn/kids-programs/garde... \n", | |
"134 http://atlantabg.org/visit/events/garden-chef-... \n", | |
"\n", | |
" organizer title \n", | |
"150 Atlanta Botanical Gardens Fine Pen for Botanical Illustration \n", | |
"69 Atlanta Botanical Gardens Garden Playtime \n", | |
"134 Atlanta Botanical Gardens Garden Chef Demos " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"events = []\n", | |
"for file_path in Path(\"data\").rglob(\"*.html\"):\n", | |
" date = file_path.name.split(\".\")[0]\n", | |
" html = etree.fromstring(\n", | |
" file_path.read_text(), \n", | |
" parser=parser\n", | |
" ).getroottree()\n", | |
" day_events = html.xpath(\"//table[@id='events-data-table']\")[0]\n", | |
" for tr in day_events.xpath(\"tbody/tr\"):\n", | |
" try:\n", | |
" a = tr.xpath(\"td/a\")[0]\n", | |
" location = tr.xpath(\"td[3]\")[0].text\n", | |
" raw_time = (\n", | |
" tr.xpath(\"td[2]\")[0].text\n", | |
" .replace(\"\\n\", \"\")\n", | |
" .replace(\" \", \"\")\n", | |
" ).split(\"-\")\n", | |
" \n", | |
" event = {\n", | |
" \"title\": a.text,\n", | |
" \"URL of event\": a.attrib['href'],\n", | |
" \"Start Date\": date,\n", | |
" \"Location\": locations[location],\n", | |
" \"organizer\": \"Atlanta Botanical Gardens\"\n", | |
" }\n", | |
" \n", | |
" if len(raw_time) == 2:\n", | |
" event.update({\n", | |
" \"Start Time\": raw_time[0],\n", | |
" \"End Time\": raw_time[1]\n", | |
" })\n", | |
"\n", | |
" events.append(event)\n", | |
" except:\n", | |
" pass\n", | |
"df = pd.DataFrame(events).fillna(\"\")\n", | |
"df.sample(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"82412" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Path(\"events.html\").write_text(df.to_html())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment