Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import os
import uuid
import pathlib
import zipfile
import pandas as pd
import urllib.request
from tqdm import tqdm
from memory_profiler import profile
# Absolute path of the directory that contains this script. The downloaded
# archive, the extracted dataset and the output files are all placed under it.
BASE_DIR = pathlib.Path(__file__).parent.absolute()
def download_url(url, output_path):
    """Download ``url`` to ``output_path`` while showing a tqdm progress bar.

    Args:
        url: Address of the resource to fetch. Its last ``/``-separated
            segment is used as the progress bar label.
        output_path: Local file name handed to ``urllib.request.urlretrieve``.
    """

    class DownloadProgressBar(tqdm):
        """tqdm bar with a method usable as a ``urlretrieve`` report hook."""

        def update_to(self, b=1, bsize=1, tsize=None):
            """Advance the bar to ``b * bsize`` units.

            ``urlretrieve`` calls the hook with the number of blocks
            transferred so far, the block size, and the total size.
            """
            if tsize is not None:
                self.total = tsize
            transferred = b * bsize
            # tqdm.update() takes an increment, so subtract what is already
            # accounted for in self.n.
            self.update(transferred - self.n)

    label = url.rsplit('/', 1)[-1]
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=label) as progress:
        urllib.request.urlretrieve(
            url,
            filename=output_path,
            reporthook=progress.update_to,
        )
# MovieLens movie rating dataset
movielens_data_set_url = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip'
file_name = movielens_data_set_url.split('/')[-1]
file_path = os.path.join(BASE_DIR, file_name)

# Download the archive (with a tqdm progress bar) only when it is not already
# cached next to this script.
if not os.path.isfile(file_path):
    download_url(movielens_data_set_url, file_path)

# Unpack only when the extracted ratings file is missing. This lets a run that
# was interrupted between download and extraction recover, and avoids
# re-extracting the archive on every start.
# NOTE(review): assumes the archive unpacks to ml-latest/ratings.csv, the path
# process_data() reads -- confirm against the archive layout.
if not os.path.isfile(os.path.join(BASE_DIR, 'ml-latest', 'ratings.csv')):
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)
def data_transform(data):
    """Placeholder transformation hook: accepts a chunk and does nothing.

    Args:
        data: The chunk to transform; currently ignored.

    Returns:
        None. The input is left untouched.
    """
    return None
def transfer_subset_data(data, output_dir=None):
    """Write ``data`` to a new, uniquely named CSV file.

    Args:
        data: Object exposing ``to_csv`` (in this script, a pandas DataFrame
            chunk).
        output_dir: Directory to write into. When omitted, ``output`` under
            ``BASE_DIR`` is used.

    Returns:
        Path of the file that was written.
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, 'output')
    # to_csv does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)

    target = os.path.join(output_dir, "new_file_" + str(uuid.uuid4()))
    # Each call uses a fresh uuid4 name, so mode='a' ends up creating a new
    # file rather than appending to an existing one.
    data.to_csv(target, header=True, mode='a')
    return target
@profile
def process_data():
    """Stream the ratings CSV in chunks and export every chunk.

    The ratings file is first loaded in full only to print its shape
    (presumably as a whole-file baseline for the memory profiler -- confirm),
    then re-read in 20,000-row chunks. Each chunk is passed to
    ``data_transform`` and then to ``transfer_subset_data``.
    """
    # The archive is extracted under BASE_DIR, so resolve the CSV there
    # instead of relying on the current working directory.
    ratings_csv = os.path.join(BASE_DIR, 'ml-latest', 'ratings.csv')

    full_ratings = pd.read_csv(ratings_csv)
    print(full_ratings.shape)
    # Release the whole-file frame before streaming so it does not stay
    # resident for the duration of the chunk loop.
    del full_ratings

    # header=0 must accompany names= when the file has its own header line;
    # otherwise that line is parsed as a data row of the first chunk. The
    # full read above already treats the first line as a header.
    chunk_reader = pd.read_csv(
        ratings_csv,
        chunksize=20000,
        header=0,
        names=['userId', 'movieId', 'rating', 'timestamp'],
    )
    # Each chunk is a DataFrame.
    for chunk in chunk_reader:
        # transform data if necessary
        data_transform(chunk)
        # process subset of data
        transfer_subset_data(chunk)
# Run the pipeline immediately. There is no __main__ guard, so importing this
# module triggers the processing as well.
process_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.