@brettpetch
Last active December 2, 2019 00:04
def data_check(url='https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_US_v1_00.tsv.gz',
               file='./data/amazon_reviews_multilingual_US_v1_00.tsv'):
    """
    Ensure the Amazon multilingual reviews dataset exists locally, then load it.

    1. Check whether the file already exists; if so, go straight to step 3.
    2. If it does not exist:
       a. Check for a ./data/ folder.
       b. If there is no data folder, create one.
       c. Download the gzipped data from the URL into the data folder.
       d. Decompress it with gzip and delete the archive.
    3. Load the data.

    :param url: download address of the gzipped dataset
    :param file: relative path of the decompressed TSV file
    :return: the loaded data, as returned by the external load_data() function
    """
    import gzip
    import os
    import shutil
    import urllib.request

    if os.path.isfile(file):
        print("Found data!")
    else:
        print("Data not found.")
        directory = "./data/"
        gz_path = file + ".gz"
        if not os.path.exists(directory):
            print("Creating directory...")
            os.makedirs(directory)
        print("Downloading data... This may take a while (~1.5 GB)")
        urllib.request.urlretrieve(url, gz_path)
        print("Unzipping data")
        # Stream-decompress the archive to the target TSV path.
        with gzip.open(gz_path, 'rb') as f_in, open(file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        print("Cleaning up...")
        os.remove(gz_path)
        print("Done.")
    return load_data(file)
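
# load_data() is called above but not defined anywhere in this gist; the sketch
# below is a minimal, hypothetical implementation, assuming the TSV should be
# read into a pandas DataFrame. The pandas dependency and the tab separator are
# assumptions, not part of the original snippet.
import pandas as pd

def load_data(file):
    """Hypothetical helper: read the tab-separated reviews file into a DataFrame."""
    return pd.read_csv(file, sep='\t')
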
data = data_check()