Avocado Scraper
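A small script that downloads USDA Market News retail avocado price reports and loads them into a pandas DataFrame.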
from __future__ import print_function
import requests
import pandas as pd
from six.moves.urllib_parse import urlencode

def download_avocado_data(dest, start_date, end_date):
    """Download USDA retail avocado report data for the given date range."""
    base_url = 'https://www.marketnews.usda.gov/mnp/fv-report-retail'
    query_params = {
        'class': ['FRUITS'],
        'commodity': ['AVOCADOS'],
        'compareLy': ['No'],
        'endDate': [end_date.strftime("%m/%d/%Y")],
        # API calls for format=excel actually return html tables.
        # Your guess as to why that's the case is as good as mine.
        'format': ['excel'],
        'organic': ['ALL'],
        'portal': ['fv'],
        'region': ['ALL'],
        'repDate': [start_date.strftime("%m/%d/%Y")],
        'type': ['retail'],
    }
    url = base_url + '?' + urlencode(query_params, doseq=True)
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    # Stream the response body to disk in 4KB chunks.
    with open(dest, 'wb') as f:
        print("Writing to [%s]" % dest)
        for block in resp.iter_content(chunk_size=4096):
            f.write(block)
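
If you re-run the script often, it can be convenient to skip the request when the report is already on disk. A minimal sketch; the wrapper name ensure_avocado_data is hypothetical, not part of the original gist:

import os

def ensure_avocado_data(dest, start_date, end_date):
    # Hypothetical convenience wrapper: only hit the USDA endpoint
    # if we haven't already saved the report to `dest`.
    if not os.path.exists(dest):
        download_avocado_data(dest, start_date, end_date)
    return dest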

def read_avocado_data(path):
    """Load a downloaded report into a cleaned-up DataFrame."""
    # read_html returns a list of all tables found in the document;
    # the report contains just one.
    frame = pd.read_html(path, header=0, parse_dates=['Date'])[0]
    # Cleanup: keep only per-unit prices, and convert the Y/N organic
    # flag into a proper boolean.
    frame = frame[frame['Unit'] == 'each']
    frame['Organic'] = (frame['Organic'] == 'Y')
    frame['Variety'].replace(
        {'VARIOUS GREENSKIN VARIETIES': 'GREENSKIN'},
        inplace=True,
    )
    # Drop useless columns.
    return frame.drop(
        ['Class', 'Commodity', 'Environment', 'Unit'],
        axis=1,
    )

download_avocado_data(
    'avocados.html',
    pd.Timestamp('2015-05-11'),
    pd.Timestamp('2017-05-10'),
)
df = read_avocado_data('avocados.html')
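
With the frame loaded, the cleaned-up columns can be used directly. For example, as one sanity check (using only columns the cleanup step above guarantees):

# Date range covered by the organic GREENSKIN rows.
greenskins = df[df['Organic'] & (df['Variety'] == 'GREENSKIN')]
print(greenskins['Date'].min(), greenskins['Date'].max())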