Skip to content

Instantly share code, notes, and snippets.

@pythoninthegrass
Created May 9, 2024 18:51
Show Gist options
  • Save pythoninthegrass/d39352309b3becdfd96333707f2e725f to your computer and use it in GitHub Desktop.
Save pythoninthegrass/d39352309b3becdfd96333707f2e725f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# SOURCE: https://www.youtube.com/watch?v=xtFo1IiZqzM
import pandas as pd
from pathlib import Path
from tqdm import tqdm
fn = Path("../csv/anime_dataset/users-score-2023.csv")

# TODO: programmatic chunksize calculation
chunksize = 5000  # naive amount


def count_rows(path, chunksize):
    """Return the number of data rows (header excluded) in the CSV at *path*.

    Only the first column is parsed, and the file is read in blocks of
    *chunksize* rows rather than loaded in one go.
    """
    with pd.read_csv(path, usecols=[0], chunksize=chunksize) as reader:
        return sum(len(chunk) for chunk in reader)


def clean_chunk(chunk):
    """Return a tidied copy of one raw chunk; the input is left unmodified.

    Column names are stripped, lower-cased and have spaces replaced by
    underscores. The ``username`` and ``anime_title`` values are stripped of
    surrounding whitespace, the ``anime_id`` and ``user_id`` columns are
    dropped, and ``username``/``anime_title`` are renamed to
    ``name``/``anime``.

    Raises:
        KeyError: if one of those four columns is missing after the column
            names have been normalised.
    """
    cleaned = chunk.copy()
    cleaned.columns = (
        cleaned.columns.str.strip().str.replace(' ', '_', regex=False).str.lower()
    )
    for column in ('username', 'anime_title'):
        # The vectorised accessor passes missing values through, whereas
        # ``.apply(lambda x: x.strip())`` raises on a NaN cell.
        # NOTE(review): assumes both columns hold text -- confirm for the
        # real dataset.
        cleaned[column] = cleaned[column].str.strip()
    cleaned = cleaned.drop(columns=['anime_id', 'user_id'])
    return cleaned.rename(columns={'username': 'name', 'anime_title': 'anime'})


def main(path=fn, chunksize=chunksize, max_chunks=5, show_progress=True):
    """Clean the CSV at *path* chunk by chunk and return the combined frame.

    Args:
        path: CSV file to read.
        chunksize: Number of rows read per chunk.
        max_chunks: Stop after this many chunks; ``None`` processes the whole
            file. Defaults to 5, the limit the script has always used.
        show_progress: Wrap the chunk iterator in a ``tqdm`` progress bar.

    Returns:
        A single DataFrame holding every cleaned chunk that was processed
        (an empty DataFrame if no chunk was read).

    Raises:
        ValueError: if *max_chunks* is neither ``None`` nor at least 1.
    """
    if max_chunks is not None and max_chunks < 1:
        raise ValueError("max_chunks must be a positive integer or None")

    # Count in a separate pass: a chunked reader is a one-shot iterator, so
    # counting on the reader that is processed below would leave it empty.
    num_rows = count_rows(path, chunksize)
    num_chunks = -(-num_rows // chunksize)  # ceiling division
    if max_chunks is not None:
        num_chunks = min(num_chunks, max_chunks)

    cleaned_chunks = []
    rows_done = 0
    # The context manager closes the file even when the loop breaks early.
    with pd.read_csv(path, chunksize=chunksize) as reader:
        chunks = tqdm(reader, total=num_chunks) if show_progress else reader
        for counter, chunk in enumerate(chunks, start=1):
            cleaned_chunks.append(clean_chunk(chunk))
            # Use the real chunk length so a short final chunk is not
            # over-reported, and rows are compared against rows.
            rows_done += len(chunk)
            print(f"Progress: {rows_done}/{num_rows}")
            if counter == max_chunks:
                break

    # TODO: aggregate the combined frame, e.g. mean rating and number of
    # ratings per anime, sorted by number of ratings (top 10).
    if cleaned_chunks:
        results = pd.concat(cleaned_chunks, ignore_index=True)
    else:
        results = pd.DataFrame()
    print(results)
    return results


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment