Avocado Scraper
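A small script that downloads USDA Market News retail avocado price reports and loads them into a pandas DataFrame.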
from __future__ import print_function
import requests
import pandas as pd
from six.moves.urllib_parse import urlencode

def download_avocado_data(dest, start_date, end_date):
    """Download USDA retail avocado report data for the given date range."""
    base_url = 'https://www.marketnews.usda.gov/mnp/fv-report-retail'
    query_params = {
        'class': ['FRUITS'],
        'commodity': ['AVOCADOS'],
        'compareLy': ['No'],
        'endDate': [end_date.strftime("%m/%d/%Y")],
        # API calls for format=excel actually return html tables.
        # Your guess as to why that's the case is as good as mine.
        'format': ['excel'],
        'organic': ['ALL'],
        'portal': ['fv'],
        'region': ['ALL'],
        'repDate': [start_date.strftime("%m/%d/%Y")],
        'type': ['retail'],
    }
    url = base_url + '?' + urlencode(query_params, doseq=True)
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    # Stream the response body to disk in 4KB chunks.
    with open(dest, 'wb') as f:
        print("Writing to [%s]" % dest)
        for block in resp.iter_content(chunk_size=4096):
            f.write(block)
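
If you re-run the script often, it can be convenient to skip the request when the report is already on disk. A minimal sketch; the wrapper name ensure_avocado_data is hypothetical, not part of the original gist:

import os

def ensure_avocado_data(dest, start_date, end_date):
    # Hypothetical convenience wrapper: only hit the USDA endpoint
    # if we haven't already saved the report to `dest`.
    if not os.path.exists(dest):
        download_avocado_data(dest, start_date, end_date)
    return dest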

def read_avocado_data(path):
    """Load a downloaded report into a cleaned-up DataFrame."""
    # read_html returns a list of all tables found in the document;
    # the report contains just one.
    frame = pd.read_html(path, header=0, parse_dates=['Date'])[0]
    # Cleanup: keep only per-unit prices, and convert the Y/N organic
    # flag into a proper boolean.
    frame = frame[frame['Unit'] == 'each']
    frame['Organic'] = (frame['Organic'] == 'Y')
    frame['Variety'].replace(
        {'VARIOUS GREENSKIN VARIETIES': 'GREENSKIN'},
        inplace=True,
    )
    # Drop useless columns.
    return frame.drop(
        ['Class', 'Commodity', 'Environment', 'Unit'],
        axis=1,
    )

download_avocado_data(
    'avocados.html',
    pd.Timestamp('2015-05-11'),
    pd.Timestamp('2017-05-10'),
)
df = read_avocado_data('avocados.html')
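
With the frame loaded, the cleaned-up columns can be used directly. For example, as one sanity check (using only columns the cleanup step above guarantees):

# Date range covered by the organic GREENSKIN rows.
greenskins = df[df['Organic'] & (df['Variety'] == 'GREENSKIN')]
print(greenskins['Date'].min(), greenskins['Date'].max())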