Skip to content

Instantly share code, notes, and snippets.

@priancho
Last active October 11, 2017 05:47
Show Gist options
  • Save priancho/2c66e98c22637dfe9aa9c378d8f26d3f to your computer and use it in GitHub Desktop.
Save priancho/2c66e98c22637dfe9aa9c378d8f26d3f to your computer and use it in GitHub Desktop.
Download Amazon Product Data (small version) directly into HDFS.
# Download Amazon Product Data (small version).
# For details, see: http://jmcauley.ucsd.edu/data/amazon/
#
# Caution!
# Before running this download script, create a directory in
# HDFS to save the files. Then, set its path in the BASE_PATH variable.
#
#---
# Downloading using ipython.
#---
import sys
from os import path
from urllib import request
from lxml import html
from hdfs3 import HDFileSystem
# Index page listing all of the "small" Amazon Product Data files.
ROOT_URL = 'http://snap.stanford.edu/data/amazon/'
# Stream downloads in 10 MiB chunks so large files never sit fully in memory.
CHUNK_SIZE = 10 * 1024 * 1024
# Target HDFS directory; must already exist (see the header comment above).
BASE_PATH = '/user/myaccount/data/amazon_product_data_small'
# XPath selecting the dataset <a> links in the index page's first table.
# (Bug fix: constant was misspelled DATASE_XPATH but referenced below as
# DATASET_XPATH, which raised NameError.)
DATASET_XPATH = '//*[@id="main"]/table[1]/tr/td[2]/a'

if __name__ == '__main__':
    # Connect to HDFS using the default (environment-provided) configuration.
    hdfs = HDFileSystem()

    # Fetch and parse the dataset index page.
    doc = html.parse(request.urlopen(ROOT_URL))

    # Download every dataset linked from the index table.
    # (Bug fix: the original iterated over the undefined name `node`; the
    # parsed document is `doc`.)
    for n in doc.xpath(DATASET_XPATH):
        # Each match is an <a> element; its href is the dataset URL.
        # NOTE(review): assumes hrefs are absolute URLs — confirm against the
        # page; relative links would need urljoin(ROOT_URL, href).
        data_url = n.get('href')
        filename = path.basename(data_url)

        # Stream the HTTP response straight into HDFS, chunk by chunk.
        print("Getting the data from %s" % (data_url))
        response = request.urlopen(data_url)
        with hdfs.open(path.join(BASE_PATH, filename), 'wb') as f:
            while True:
                chunk = response.read(CHUNK_SIZE)
                if not chunk:
                    # End of stream: finish the progress-dot line.
                    sys.stderr.write('\n')
                    break
                f.write(chunk)
                # One progress dot per chunk written.
                sys.stderr.write('.')
                sys.stderr.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment