Skip to content

Instantly share code, notes, and snippets.

@robroc
Created January 24, 2014 16:04
Show Gist options
  • Save robroc/8600265 to your computer and use it in GitHub Desktop.
Save robroc/8600265 to your computer and use it in GitHub Desktop.
Tourisme Quebec scraper
import scraperwiki
import lxml.html
# Pieces of the query URL for the Tourisme Quebec lodging-occupancy bulletin.
# The final URL is: base_url + <month> + year_url + <year> + tail_url
base_url = "http://www.tourisme.gouv.qc.ca/publications/bulletins_info/bulletin/heb_frequentation.php?mois="
year_url = "&anneeHaut="
tail_url = "&anneeBas=2012&soumettre=Comparer"

# Loop through years 2000-2013 and months 1-12.
for year in range(2000, 2014):
    for month in range(1, 13):
        # No data for Dec. 2013, so end the loop there. The check comes
        # before the fetch so no request is made for a page that would
        # never be read.
        if year == 2013 and month == 12:
            break

        html = scraperwiki.scrape(base_url + str(month) + year_url + str(year) + tail_url)
        dom = lxml.html.fromstring(html)

        # Loop through every row of the table nested inside the table with
        # attribute align="center", where the data is, starting on row 4.
        for tr in dom.cssselect("table[align='center'] table tr")[3:]:
            # Look the cells up once per row instead of once per field.
            cells = tr.cssselect("td")

            # Skip regions that don't exist anymore.
            if "Ancienne" in cells[1].text_content():
                continue
            # Stop reading rows at the trailing rows with total summaries.
            if "Ensemble" in cells[0].text_content():
                break

            # Save data to a dictionary from selected table cells (td),
            # accessed by index numbers. Remove asterisks from region names
            # and replace decimal commas with periods.
            data = {
                'year': year,
                'month': month,
                'region': cells[1].text_content().replace("*", ""),
                'available_units': cells[2].text_content(),
                'occup_units': cells[4].text_content(),
                'occup_rate': cells[6].text_content().replace(",", "."),
                'price': cells[8].text_content().replace(",", "."),
            }
            # The rest is writing the data to Scraperwiki
            # using their SQL interface.
            # NOTE(review): the save call itself is not part of the visible
            # code, so `data` is currently rebuilt per row and never stored.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment