Skip to content

Instantly share code, notes, and snippets.

@bollwyvl
Last active September 27, 2017 00:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bollwyvl/4e2d9d601c4beb06f61252fc0617880e to your computer and use it in GitHub Desktop.
Save bollwyvl/4e2d9d601c4beb06f61252fc0617880e to your computer and use it in GitHub Desktop.
name: proximity2nature
channels:
- defaults
- conda-forge
dependencies:
- pandas
- lxml
- requests
- pycurl
- jupyterlab
- beautifulsoup4
- pyquery
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scraping Atlanta Botanical Garden Events\n",
"\n",
"Next, see [Parsing.ipynb](Parsing.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from requests import get\n",
"from lxml import etree\n",
"import datetime\n",
"from pathlib import Path\n",
"from time import sleep\n",
"from random import random"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# lxml HTML parser instance; no other cell visible in this notebook uses it.\n",
"parser = etree.HTMLParser()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Directory that holds the downloaded pages; created here if it does not exist yet.\n",
"DATA = Path(\"data\")\n",
"DATA.mkdir(exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Number of consecutive days, counted from start_date, to request.\n",
"num_days = 365"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's important to be a good citizen when scraping: wait a random interval of up to `delay_base` seconds before each request."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Upper bound, in seconds, on the random pause taken before each request.\n",
"delay_base = 3"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# First calendar day to request; the remaining days are offsets from this one.\n",
"start_date = datetime.datetime(year=2017, month=9, day=26)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One datetime per day: start_date, start_date + 1 day, ... for num_days entries in total.\n",
"date_list = [\n",
"    start_date + datetime.timedelta(days=offset)\n",
"    for offset in range(num_days)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fetching 2017-12-01\n",
"fetching 2017-12-02\n",
"fetching 2017-12-03\n",
"fetching 2017-12-04\n",
"fetching 2017-12-05\n",
"fetching 2017-12-06\n",
"fetching 2017-12-07\n",
"fetching 2017-12-08\n",
"fetching 2017-12-09\n",
"fetching 2017-12-10\n",
"fetching 2017-12-11\n",
"fetching 2017-12-12\n",
"fetching 2017-12-13\n",
"fetching 2017-12-14\n",
"fetching 2017-12-15\n",
"fetching 2017-12-16\n",
"fetching 2017-12-17\n",
"fetching 2017-12-18\n",
"fetching 2017-12-19\n",
"fetching 2017-12-20\n",
"fetching 2017-12-21\n",
"fetching 2017-12-22\n",
"fetching 2017-12-23\n",
"fetching 2017-12-24\n",
"fetching 2017-12-25\n",
"fetching 2017-12-26\n",
"fetching 2017-12-27\n",
"fetching 2017-12-28\n",
"fetching 2017-12-29\n",
"fetching 2017-12-30\n",
"fetching 2017-12-31\n",
"fetching 2018-01-01\n",
"fetching 2018-01-02\n",
"fetching 2018-01-03\n",
"fetching 2018-01-04\n",
"fetching 2018-01-05\n",
"fetching 2018-01-06\n",
"fetching 2018-01-07\n",
"fetching 2018-01-08\n",
"fetching 2018-01-09\n",
"fetching 2018-01-10\n",
"fetching 2018-01-11\n",
"fetching 2018-01-12\n",
"fetching 2018-01-13\n",
"fetching 2018-01-14\n",
"fetching 2018-01-15\n",
"fetching 2018-01-16\n",
"fetching 2018-01-17\n",
"fetching 2018-01-18\n",
"fetching 2018-01-19\n",
"fetching 2018-01-20\n",
"fetching 2018-01-21\n",
"fetching 2018-01-22\n",
"fetching 2018-01-23\n",
"fetching 2018-01-24\n",
"fetching 2018-01-25\n",
"fetching 2018-01-26\n",
"fetching 2018-01-27\n",
"fetching 2018-01-28\n",
"fetching 2018-01-29\n",
"fetching 2018-01-30\n",
"fetching 2018-01-31\n",
"fetching 2018-02-01\n",
"fetching 2018-02-02\n",
"fetching 2018-02-03\n",
"fetching 2018-02-04\n",
"fetching 2018-02-05\n",
"fetching 2018-02-06\n",
"fetching 2018-02-07\n",
"fetching 2018-02-08\n",
"fetching 2018-02-09\n",
"fetching 2018-02-10\n",
"fetching 2018-02-11\n",
"fetching 2018-02-12\n",
"fetching 2018-02-13\n",
"fetching 2018-02-14\n",
"fetching 2018-02-15\n",
"fetching 2018-02-16\n",
"fetching 2018-02-17\n",
"fetching 2018-02-18\n",
"fetching 2018-02-19\n",
"fetching 2018-02-20\n",
"fetching 2018-02-21\n"
]
}
],
"source": [
"url_template = \"http://atlantabg.org/calendar?date={}\"\n",
"request_timeout = 30  # seconds; without a timeout a stalled connection hangs the loop\n",
"\n",
"# Download one calendar page per day into DATA. Days whose file already exists are\n",
"# skipped, so the cell can be interrupted and re-run without re-fetching.\n",
"for date in [d.strftime('%Y-%m-%d') for d in date_list]:\n",
"    url = url_template.format(date)\n",
"    path = DATA / \"{}.html\".format(date)\n",
"    if not path.exists():\n",
"        print(\"fetching\", date)\n",
"        # Random pause of up to delay_base seconds so the server is not hammered.\n",
"        sleep(random() * delay_base)\n",
"        response = get(url, timeout=request_timeout)\n",
"        # Fail loudly on HTTP errors: otherwise an error page would be cached as if it\n",
"        # were real data and never fetched again.\n",
"        response.raise_for_status()\n",
"        # Store the raw bytes: no assumption that the page is UTF-8 and no dependence\n",
"        # on the platform's default text encoding when writing.\n",
"        path.write_bytes(response.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment