Created
January 24, 2014 16:04
-
-
Save robroc/8600265 to your computer and use it in GitHub Desktop.
Tourisme Quebec scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scraperwiki | |
import lxml.html | |
# Scraper for the Tourisme Quebec monthly lodging-occupancy tables.
# Each request URL is assembled as: base_url + month + year_url + year + tail_url.
base_url = "http://www.tourisme.gouv.qc.ca/publications/bulletins_info/bulletin/heb_frequentation.php?mois="
year_url = "&anneeHaut="
tail_url = "&anneeBas=2012&soumettre=Comparer"

# A data row is read up to td index 8, so it needs at least this many cells.
min_cells = 9

# Loop through years 2000-2013 and months 1-12.
for year in range(2000, 2014):
    for month in range(1, 13):
        # No data for Dec. 2013, so end the loop there -- checked *before*
        # the request so that page is never downloaded and parsed for nothing.
        if year == 2013 and month == 12:
            break
        html = scraperwiki.scrape(base_url + str(month) + year_url + str(year) + tail_url)
        dom = lxml.html.fromstring(html)
        # Loop through every row of the table nested inside the table with
        # attribute align="center", where the data is, starting on row 4.
        for tr in dom.cssselect("table[align='center'] table tr")[3:]:
            # Select the row's cells once instead of once per field.
            cells = tr.cssselect("td")
            # Skip regions that don't exist anymore.
            if len(cells) > 1 and "Ancienne" in cells[1].text_content():
                continue
            # Stop reading rows at the last rows with total summaries.
            if cells and "Ensemble" in cells[0].text_content():
                break
            # Rows too short to hold every field (presumably spacer or header
            # rows -- not verified against the live page) are skipped rather
            # than raising IndexError.
            if len(cells) < min_cells:
                continue
            # Save data to a dictionary from selected table cells (td), accessed by index numbers.
            # Remove asterisks from region names and replace commas with periods in decimals.
            data = {
                'year': year,
                'month': month,
                'region': cells[1].text_content().replace("*", ""),
                'available_units': cells[2].text_content(),
                'occup_units': cells[4].text_content(),
                'occup_rate': cells[6].text_content().replace(",", "."),
                'price': cells[8].text_content().replace(",", "."),
            }
            # The rest is writing the data to Scraperwiki
            # using their SQL interface.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment