davidwtbuxton/scotch.py

## scotch.py
import csv
import requests
from BeautifulSoup import BeautifulSoup
from datetime import datetime


# http://www.reddit.com/r/learnpython/comments/tczgd/help_me_improve_this_code_webscraping_liquor/
# http://pastebin.com/TinHnCSp
# http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS


# Page that main() passes to scrape() to obtain the current prices.
URL = r'http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS'


def read_csv(filename):
    """Read the scotch names and the price history from CSV data.

    Despite its name, ``filename`` is not a path: it is handed straight to
    ``csv.reader``, so it must be an open file (or any other iterable of
    lines).

    The first row is the header: its first cell is skipped (that column holds
    the time) and the remaining cells are the scotch names. Every later row
    is a time followed by one price per scotch. Blank rows are ignored.

    Returns a ``(scotches, prices_list)`` tuple. ``scotches`` is the list of
    names from the header; ``prices_list`` is a list of ``(time, prices)``
    tuples where ``prices`` maps a scotch name to its price cell exactly as
    read (a string). Both lists are empty when there is no header row.
    """
    rows = csv.reader(filename)

    # Get the first row which is the name of the scotches.
    try:
        scotches = next(rows)[1:]
    except StopIteration:
        # There is no first row! Return empty lists.
        return [], []

    prices_list = []

    # Read the time and the prices from the other rows.
    for row in rows:
        # csv.reader yields an empty list for a blank line. Such a row has no
        # time column, so skip it instead of failing on row[0].
        if not row:
            continue
        # First column is the time, the other columns are the prices. zip()
        # stops at the shorter sequence, so a short row simply has no entry
        # for the trailing scotches.
        prices = dict(zip(scotches, row[1:]))
        prices_list.append((row[0], prices))

    return scotches, prices_list


def scrape(url):
    """Scrape HTML for scotch prices.

    Fetches ``url`` and reads one product from every <pre> element after the
    first. Only the first line of an element's text is used: its last four
    characters (the '[12]' marker) are dropped and the rest is split on
    whitespace. The final three columns are taken as size, price and a third
    value that is not used; everything before them is the name.

    Returns a dictionary mapping 'name@size' labels to prices (strings, as
    found in the page). Elements without plain text, or with fewer than three
    columns, are skipped.

    The request uses a 30 second timeout, so a server that does not answer
    makes requests raise its timeout exception instead of blocking forever.
    """
    # Requires third-party requests module.
    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.content)

    data = {}

    # Ignore the first <pre> element.
    for pre in soup.findAll('pre')[1:]:
        text = pre.string
        if text is None:
            # No single text child to read (e.g. nested tags): nothing to parse.
            continue

        line = text.split('\n')[0]  # Ignore lower line
        cols = line[:-4].split()  # Chop off final '[12]'
        if len(cols) < 3:
            # Not a product line; unpacking the final 3 columns would fail.
            continue

        size, price = cols[-3], cols[-2]  # Last column is not needed
        name = ' '.join(cols[:-3])  # All except final 3 columns
        data[name + '@' + size] = price  # Key is made from name and size

    return data


# Example of the text of one product <pre> element, as parsed by scrape():
# u'TULLIBARDINE MALT * AGED OAK         750ML   35.78  199.52 [ 6]\n    6/CS  [SCOTLAND]'


def main(argv):
    """Update a CSV file with prices for scotch scraped from the Web.

    ``argv`` is the full command line as in ``sys.argv``; ``argv[1]`` is the
    path of the CSV file. Existing rows are read back (a missing file counts
    as no history), the current prices are scraped from ``URL``, and the file
    is rewritten with: a header of all scotch names, old and new, in sorted
    order; every old row; and one new row stamped with ``datetime.now()``.
    Prices a row does not have are written as empty cells.

    Raises IndexError when no CSV path is given, and IOError when the
    existing file cannot be read for any reason other than not existing.
    """
    import errno

    # The CSV file must be the first positional argument.
    csv_name = argv[1]

    old_data = []
    scotches = set()

    # Get existing data from CSV file (if it exists).
    try:
        # The with-block closes the handle before the file is reopened for
        # writing further down.
        with open(csv_name, 'rU') as fh:
            old_scotches, old_data = read_csv(fh)
    except IOError as err:
        # Only a missing file means "start from scratch". Any other failure
        # must not be ignored, or the rewrite below would drop the history.
        if err.errno != errno.ENOENT:
            raise
    else:
        scotches.update(old_scotches)

    # Get new prices
    new_prices = scrape(URL)
    # Add any new scotch names to the existing list of names.
    scotches.update(new_prices)
    # Put the scotch names in alphabetical order in a list.
    scotches = sorted(scotches)

    with open(csv_name, 'wb') as fh:
        writer = csv.writer(fh)
        # First row is the names of the scotches.
        writer.writerow([''] + scotches)

        # Write the old price data to the CSV.
        for dt, prices in old_data:
            writer.writerow([dt] + [prices.get(s, '') for s in scotches])

        # Write the new price data to the CSV.
        new_dt = datetime.now()
        writer.writerow([new_dt.isoformat()] + [new_prices.get(s, '') for s in scotches])


if __name__ == '__main__':
    # Run as a script: pass the complete command line (program name
    # included) to main(), which takes the CSV path from argv[1].
    import sys
    main(sys.argv)
	import csv
	import requests
	from BeautifulSoup import BeautifulSoup
	from datetime import datetime


	# http://www.reddit.com/r/learnpython/comments/tczgd/help_me_improve_this_code_webscraping_liquor/
	# http://pastebin.com/TinHnCSp
	# http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS


	# Page that main() passes to scrape() to obtain the current prices.
	URL = r'http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS'


	def read_csv(filename):
		"""Read the scotch names and the price history from CSV data.

		Despite its name, ``filename`` is not a path: it is handed straight to
		``csv.reader``, so it must be an open file (or any other iterable of
		lines).

		The first row is the header: its first cell is skipped (that column
		holds the time) and the remaining cells are the scotch names. Every
		later row is a time followed by one price per scotch. Blank rows are
		ignored.

		Returns a ``(scotches, prices_list)`` tuple. ``scotches`` is the list
		of names from the header; ``prices_list`` is a list of
		``(time, prices)`` tuples where ``prices`` maps a scotch name to its
		price cell exactly as read (a string). Both lists are empty when there
		is no header row.
		"""
		rows = csv.reader(filename)

		# Get the first row which is the name of the scotches.
		try:
			scotches = next(rows)[1:]
		except StopIteration:
			# There is no first row! Return empty lists.
			return [], []

		prices_list = []

		# Read the time and the prices from the other rows.
		for row in rows:
			# csv.reader yields an empty list for a blank line. Such a row has
			# no time column, so skip it instead of failing on row[0].
			if not row:
				continue
			# First column is the time, the other columns are the prices.
			prices = dict(zip(scotches, row[1:]))
			prices_list.append((row[0], prices))

		return scotches, prices_list


	def scrape(url):
		"""Scrape HTML for scotch prices.

		Fetches ``url`` and reads one product from every <pre> element after
		the first. Only the first line of an element's text is used: its last
		four characters (the '[12]' marker) are dropped and the rest is split
		on whitespace. The final three columns are taken as size, price and a
		third value that is not used; everything before them is the name.

		Returns a dictionary mapping 'name@size' labels to prices (strings, as
		found in the page). Elements without plain text, or with fewer than
		three columns, are skipped.

		The request uses a 30 second timeout, so a server that does not answer
		makes requests raise its timeout exception instead of blocking forever.
		"""
		# Requires third-party requests module.
		resp = requests.get(url, timeout=30)
		soup = BeautifulSoup(resp.content)

		data = {}

		# Ignore the first <pre> element.
		for pre in soup.findAll('pre')[1:]:
			text = pre.string
			if text is None:
				# No single text child to read: nothing to parse.
				continue

			line = text.split('\n')[0]  # Ignore lower line
			cols = line[:-4].split()  # Chop off final '[12]'
			if len(cols) < 3:
				# Not a product line; unpacking the final 3 columns would fail.
				continue

			size, price = cols[-3], cols[-2]  # Last column is not needed
			name = ' '.join(cols[:-3])  # All except final 3 columns
			data[name + '@' + size] = price  # Key is made from name and size

		return data


	# Example of the text of one product <pre> element, as parsed by scrape():
	# u'TULLIBARDINE MALT * AGED OAK 750ML 35.78 199.52 [ 6]\n 6/CS [SCOTLAND]'


	def main(argv):
		"""Update a CSV file with prices for scotch scraped from the Web.

		``argv`` is the full command line as in ``sys.argv``; ``argv[1]`` is
		the path of the CSV file. Existing rows are read back (a missing file
		counts as no history), the current prices are scraped from ``URL``,
		and the file is rewritten with: a header of all scotch names, old and
		new, in sorted order; every old row; and one new row stamped with
		``datetime.now()``. Prices a row does not have are written as empty
		cells.

		Raises IndexError when no CSV path is given, and IOError when the
		existing file cannot be read for any reason other than not existing.
		"""
		import errno

		# The CSV file must be the first positional argument.
		csv_name = argv[1]

		old_data = []
		scotches = set()

		# Get existing data from CSV file (if it exists).
		try:
			# The with-block closes the handle before the file is reopened
			# for writing further down.
			with open(csv_name, 'rU') as fh:
				old_scotches, old_data = read_csv(fh)
		except IOError as err:
			# Only a missing file means "start from scratch". Any other
			# failure must not be ignored, or the rewrite below would drop
			# the history.
			if err.errno != errno.ENOENT:
				raise
		else:
			scotches.update(old_scotches)

		# Get new prices
		new_prices = scrape(URL)
		# Add any new scotch names to the existing list of names.
		scotches.update(new_prices)
		# Put the scotch names in alphabetical order in a list.
		scotches = sorted(scotches)

		with open(csv_name, 'wb') as fh:
			writer = csv.writer(fh)
			# First row is the names of the scotches.
			writer.writerow([''] + scotches)

			# Write the old price data to the CSV.
			for dt, prices in old_data:
				writer.writerow([dt] + [prices.get(s, '') for s in scotches])

			# Write the new price data to the CSV.
			new_dt = datetime.now()
			writer.writerow([new_dt.isoformat()] + [new_prices.get(s, '') for s in scotches])


	if __name__ == "__main__":
		# Run as a script: pass the complete command line (program name
		# included) to main(), which takes the CSV path from argv[1].
		import sys

		main(sys.argv)