Skip to content

Instantly share code, notes, and snippets.

@davidwtbuxton
Created May 9, 2012 13:51
Show Gist options
  • Save davidwtbuxton/2644638 to your computer and use it in GitHub Desktop.
Save davidwtbuxton/2644638 to your computer and use it in GitHub Desktop.
Scrape scotch prices, save as CSV
import csv
import requests
from BeautifulSoup import BeautifulSoup
from datetime import datetime
# http://www.reddit.com/r/learnpython/comments/tczgd/help_me_improve_this_code_webscraping_liquor/
# http://pastebin.com/TinHnCSp
# http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS
# Page that scrape() is pointed at by main(): the product list for the
# "SCOTCH MALTS" index on specsonline.com.
URL = (
    'http://www.specsonline.com/cgi-bin/snf'
    '?body=/cgi-bin/prodlist'
    '&index=Liquors%7C255%7CSCOTCH+MALTS'
)
def read_csv(filename):
    """Read the names and prices from an open CSV file.

    Despite its name, ``filename`` must be an open file (or any iterable of
    CSV lines), not a path: it is handed directly to ``csv.reader``.

    The first row holds the scotch names (its first cell is ignored). Every
    later row holds a time in its first column followed by one price per
    scotch, in the same order as the names. Blank rows are skipped.

    Returns a tuple of a list of names of scotches and a list of
    (time, prices) tuples, where ``prices`` is a dict mapping each scotch
    name to the price string read from that row. Both lists are empty when
    the input contains no rows at all.
    """
    rows = csv.reader(filename)

    # Get the first row which is the name of the scotches.
    try:
        scotches = next(rows)[1:]
    except StopIteration:
        # There is no first row! Return empty lists.
        return [], []

    prices_list = []
    # Read the time and the prices from the other rows.
    for row in rows:
        # csv.reader yields an empty list for a blank line; such a row has
        # no time column, so indexing it would raise IndexError.
        if not row:
            continue
        # First column is the time, other columns are prices for scotches.
        dt = row[0]
        prices = dict(zip(scotches, row[1:]))
        prices_list.append((dt, prices))
    return scotches, prices_list
def scrape(url):
    """Scrape HTML for scotch prices.

    Fetches ``url`` and reads one product from every <pre> element except
    the first. Only the first line of each element's text is used; its last
    four characters are dropped and the remainder is split on whitespace.
    The final three columns are taken as size, price and case price, and
    everything before them as the product name.

    Elements that cannot be parsed this way (no plain text, or fewer than
    three columns) are skipped.

    Returns a dictionary mapping scotch names + sizes ('NAME@SIZE') to the
    price text found on the page.

    Raises whatever ``requests`` raises when the request fails, times out,
    or the server answers with an HTTP error status.
    """
    # Requires third-party requests module.
    # Without a timeout a stalled server would hang the script forever.
    resp = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors rather than parsing an error page and
    # reporting that no scotch has a price.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content)

    data = {}
    # Ignore the first <pre> element.
    for pre in soup.findAll('pre')[1:]:
        text = pre.string
        # .string is None when the element is not a single text node
        # (e.g. it contains nested tags); nothing to parse in that case.
        if text is None:
            continue
        line = text.split('\n')[0]  # Ignore lower line
        cols = line[:-4].split()  # Chop off final '[12]'
        if len(cols) < 3:
            # Not a product line: size, price and case price are missing.
            continue
        size, price, _case = cols[-3:]  # Final 3 columns
        name = ' '.join(cols[:-3])  # All except final 3 columns
        label = name + '@' + size  # Make key from name and size
        data[label] = price
    return data
# Example of the text of one <pre> element, as read by scrape() via pre.string:
# u'TULLIBARDINE MALT * AGED OAK 750ML 35.78 199.52 [ 6]\n 6/CS [SCOTLAND]'
def main(argv):
    """Update a CSV file with prices for scotch scraped from the Web.

    ``argv`` is the command-line argument list; ``argv[1]`` must be the path
    of the CSV file. Existing rows are read first (a missing file is treated
    as empty), the current prices are scraped from ``URL``, and the file is
    then rewritten with a header row of all known scotch names in sorted
    order, the old rows, and one new row stamped with the current local
    time. Scotches with no price in a given row get an empty cell.

    Raises IndexError when ``argv`` has no second element, and IOError when
    the existing file cannot be read for a reason other than not existing.
    """
    import errno

    # The CSV file must be the first positional argument.
    csv_name = argv[1]
    old_data = []
    scotches = set()

    # Get existing data from CSV file (if it exists).
    try:
        in_fh = open(csv_name, 'rU')
    except IOError as err:
        # Only a missing file means "no history yet". Any other failure
        # must stop here, otherwise the rewrite below would replace the
        # file with just the new row and lose the old data.
        if err.errno != errno.ENOENT:
            raise
    else:
        # Close the input before the same path is reopened for writing.
        with in_fh:
            old_scotches, old_data = read_csv(in_fh)
        scotches.update(old_scotches)

    # Get new prices. This happens before the file is opened for writing,
    # so a failed scrape leaves the existing CSV untouched.
    new_prices = scrape(URL)
    # Add any new scotch names to the existing list of names.
    scotches.update(new_prices)
    # Put the scotch names in alphabetical order in a list.
    names = sorted(scotches)

    # 'wb' is the mode the Python 2 csv module expects for writing.
    with open(csv_name, 'wb') as out_fh:
        writer = csv.writer(out_fh)
        # First row is the names of the scotches.
        writer.writerow([''] + names)
        # Write the old price data to the CSV.
        for dt, prices in old_data:
            writer.writerow([dt] + [prices.get(s, '') for s in names])
        # Write the new price data to the CSV.
        new_dt = datetime.now()
        writer.writerow([new_dt.isoformat()] + [new_prices.get(s, '') for s in names])
if __name__ == '__main__':
    # Run as a script: hand the full argument list to main(), which reads
    # the CSV path from position 1.
    import sys

    script_args = sys.argv
    main(script_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment