Created
May 1, 2020 14:35
-
-
Save timothymugayi/85094fe69a881695d57302f49f6765a1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import uuid | |
import pathlib | |
import zipfile | |
import pandas as pd | |
import urllib.request | |
from tqdm import tqdm | |
from memory_profiler import profile | |
BASE_DIR = pathlib.Path(__file__).parent.absolute() | |
def download_url(url, output_path): | |
class DownloadProgressBar(tqdm): | |
def update_to(self, b=1, bsize=1, tsize=None): | |
if tsize is not None: | |
self.total = tsize | |
self.update(b * bsize - self.n) | |
with DownloadProgressBar(unit='B', unit_scale=True, | |
miniters=1, desc=url.split('/')[-1]) as t: | |
urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to) | |
# movielens movie rating dataset | |
movielens_data_set_url ='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip' | |
file_name = movielens_data_set_url.split('/')[-1] | |
file_path = os.path.join(BASE_DIR, file_name) | |
# download large dataset with tqdm | |
if not os.path.isfile(file_path): | |
download_url(movielens_data_set_url, os.path.join(BASE_DIR, file_name)) | |
with zipfile.ZipFile(file_path, 'r') as zip_ref: | |
zip_ref.extractall(BASE_DIR) | |
def data_transform(data): | |
pass | |
def transfer_subset_data(data): | |
# The mode='a' tells pandas to append | |
data.to_csv(os.path.join(BASE_DIR, 'output', "new_file_" + str(uuid.uuid4())), | |
header=True, mode='a') | |
@profile | |
def process_data(): | |
df_partition = pd.read_csv('ml-latest/ratings.csv') | |
print(df_partition.shape) | |
df_partition = pd.read_csv('ml-latest/ratings.csv', chunksize=20000, names=['userId', 'movieId', 'rating', 'timestamp']) | |
# Each chunk is in df format | |
for chunk in df_partition: | |
# transform data if necessary | |
data_transform(chunk) | |
# process subset of data | |
transfer_subset_data(chunk) | |
process_data() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment