Skip to content

Instantly share code, notes, and snippets.

@timothymugayi
Created May 1, 2020 14:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save timothymugayi/85094fe69a881695d57302f49f6765a1 to your computer and use it in GitHub Desktop.
Save timothymugayi/85094fe69a881695d57302f49f6765a1 to your computer and use it in GitHub Desktop.
import os
import uuid
import pathlib
import zipfile
import pandas as pd
import urllib.request
from tqdm import tqdm
from memory_profiler import profile
# Directory containing this script; the dataset archive, its extracted
# contents and the 'output' folder are all located relative to it.
BASE_DIR = pathlib.Path(__file__).parent.absolute()
def download_url(url, output_path):
    """Download *url* to *output_path* while showing a tqdm progress bar.

    The payload is first written to ``<output_path>.part`` and only moved to
    ``output_path`` once the transfer has finished, so an interrupted
    download never leaves a truncated file that a later "does the file
    exist?" check would mistake for a complete one.

    Args:
        url: Address of the resource to fetch. The last path segment is used
            as the progress-bar description.
        output_path: Destination file (``str`` or path-like).

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        OSError: If the destination cannot be written.
    """

    class DownloadProgressBar(tqdm):
        """tqdm bar whose ``update_to`` matches urlretrieve's reporthook."""

        def update_to(self, b=1, bsize=1, tsize=None):
            # urlretrieve reports -1 when the server does not announce a
            # size; keep the bar open-ended instead of storing a bogus total.
            if tsize is not None and tsize > 0:
                self.total = tsize
            # The hook receives the cumulative block count, tqdm wants a delta.
            self.update(b * bsize - self.n)

    partial_path = os.fspath(output_path) + '.part'
    try:
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc=url.split('/')[-1]) as t:
            urllib.request.urlretrieve(url, filename=partial_path,
                                       reporthook=t.update_to)
        # Publish the finished download in a single step.
        os.replace(partial_path, output_path)
    finally:
        # After a successful replace the partial file is gone; after a
        # failure this removes whatever was written so far.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass
# MovieLens movie rating dataset (zip archive).
movielens_data_set_url = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip'
file_name = movielens_data_set_url.split('/')[-1]
file_path = os.path.join(BASE_DIR, file_name)

# Download the large archive (with a tqdm progress bar) only when it is not
# already present next to this script.
if not os.path.isfile(file_path):
    download_url(movielens_data_set_url, file_path)

# Extract only when the ratings file read by process_data() is missing, so
# repeated runs do not unpack the archive again and an interrupted or deleted
# extraction is retried.
# NOTE(review): assumes the archive contains 'ml-latest/ratings.csv' - confirm.
if not os.path.isfile(os.path.join(BASE_DIR, 'ml-latest', 'ratings.csv')):
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)
def data_transform(data):
    """Placeholder hook for per-chunk transformations.

    Currently a no-op: *data* is left untouched and ``None`` is returned.
    """
    return None
def transfer_subset_data(data):
    """Write *data* as CSV to a new, uniquely named file under ``output``.

    The file is created as ``BASE_DIR/output/new_file_<uuid4>``; the output
    directory is created on demand so the first write does not fail when it
    is missing.

    Args:
        data: Object exposing a pandas-style ``to_csv`` method (the caller
            passes one DataFrame chunk per call).
    """
    output_dir = os.path.join(BASE_DIR, 'output')
    # pandas does not create missing parent directories itself.
    os.makedirs(output_dir, exist_ok=True)
    target = os.path.join(output_dir, "new_file_" + str(uuid.uuid4()))
    # mode='a' asks pandas to append, but because every call gets a fresh
    # UUID-based name each chunk effectively lands in its own file.
    data.to_csv(target, header=True, mode='a')
@profile
def process_data():
    """Stream the ratings CSV in chunks, handing each chunk to the pipeline.

    The file ``ml-latest/ratings.csv`` under ``BASE_DIR`` is read 20000 rows
    at a time. Every chunk is passed to ``data_transform`` and then to
    ``transfer_subset_data``. Once all chunks are processed the overall
    ``(rows, columns)`` shape is printed; it is tallied while streaming so
    the whole file never has to be held in memory at once.
    """
    # Resolve against BASE_DIR (where the archive is extracted) rather than
    # the current working directory.
    ratings_path = os.path.join(BASE_DIR, 'ml-latest', 'ratings.csv')
    column_names = ['userId', 'movieId', 'rating', 'timestamp']

    # header=0 makes pandas replace the file's own header line with
    # column_names instead of returning that line as the first data row.
    chunks = pd.read_csv(ratings_path, chunksize=20000,
                         names=column_names, header=0)

    total_rows = 0
    # Each chunk is a DataFrame.
    for chunk in chunks:
        total_rows += len(chunk)
        # transform data if necessary
        data_transform(chunk)
        # process subset of data
        transfer_subset_data(chunk)

    print((total_rows, len(column_names)))


process_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment