# priancho/get_amazon_product_data_small.py

## get_amazon_product_data_small.py
# Download Amazon Product Data (small version).
#  For details, see: http://jmcauley.ucsd.edu/data/amazon/
#
# Caution!
#   Before running this download script, create a directory in
#   HDFS to save the files, then set the BASE_PATH variable to it.
#

#---
# Downloading using ipython.
#---
import sys
from os import path
from urllib import request
from lxml import html
from hdfs3 import HDFileSystem

# Index page whose table links to the individual dataset files.
ROOT_URL = 'http://snap.stanford.edu/data/amazon/'
# Number of bytes requested per read while streaming a dataset (10 MiB).
CHUNK_SIZE = 10 * 1024 * 1024
# HDFS directory the datasets are written into; it must already exist.
BASE_PATH = '/user/myaccount/data/amazon_product_data_small'
# XPath selecting the <a> elements that point at the dataset files.
DATASET_XPATH = '//*[@id="main"]/table[1]/tr/td[2]/a'
# Backward-compatible alias for the original, misspelled constant name.
DATASE_XPATH = DATASET_XPATH


def dataset_target(href, root_url=ROOT_URL, base_path=BASE_PATH):
    """Return ``(data_url, hdfs_path)`` for one dataset link.

    Args:
        href: The ``href`` attribute of a dataset anchor; may be absolute
            or relative to ``root_url``.
        root_url: URL of the page the link was found on.
        base_path: HDFS directory that receives the file.

    Returns:
        A tuple of the absolute download URL and the destination path,
        which is ``base_path`` joined with the URL's base name.
    """
    # Imported locally so this block does not depend on a new file-level import.
    from urllib.parse import urljoin

    # urljoin leaves absolute URLs untouched and resolves relative ones.
    data_url = urljoin(root_url, href)
    return data_url, path.join(base_path, path.basename(data_url))


def copy_stream(source, sink, chunk_size=CHUNK_SIZE):
    """Copy ``source`` to ``sink`` in chunks, reporting progress on stderr.

    One ``.`` is written to stderr per chunk copied and a newline once the
    source is exhausted.

    Args:
        source: Object with a ``read(size)`` method returning bytes.
        sink: Object with a ``write(data)`` method.
        chunk_size: Maximum number of bytes requested per read.

    Returns:
        The total number of bytes copied.
    """
    copied = 0
    while True:
        chunk = source.read(chunk_size)
        if not chunk:
            # An empty read means end of stream.
            break
        sink.write(chunk)
        copied += len(chunk)
        sys.stderr.write('.')
        sys.stderr.flush()
    sys.stderr.write('\n')
    return copied


def main():
    """Download every dataset linked from ROOT_URL into BASE_PATH on HDFS."""
    # Connect to HDFS.
    hdfs = HDFileSystem()

    # Open and parse the Web page; the response is closed once parsed.
    with request.urlopen(ROOT_URL) as index_page:
        doc = html.parse(index_page)

    # Download datasets.
    for anchor in doc.xpath(DATASET_XPATH):
        href = anchor.get('href')
        if not href:
            # Anchor without a target: nothing to download.
            continue
        data_url, hdfs_path = dataset_target(href)

        # Save it into HDFS directly, streaming so the whole file is
        # never held in memory.
        print("Getting the data from %s" % (data_url))
        with request.urlopen(data_url) as response:
            with hdfs.open(hdfs_path, 'wb') as f:
                copy_stream(response, f)


if __name__ == '__main__':
    main()