Skip to content

Instantly share code, notes, and snippets.

@sneakers-the-rat
Created March 5, 2018 09:50
Show Gist options
  • Save sneakers-the-rat/dffe995c508c9ebf2593eb973f393d7a to your computer and use it in GitHub Desktop.
Save sneakers-the-rat/dffe995c508c9ebf2593eb973f393d7a to your computer and use it in GitHub Desktop.
multiprocessing cleaning ops
import os
import pandas as pd
import numpy as np
from itertools import cycle
from tqdm import tqdm
from multiprocessing import Pool
import time
def _clean_rank(rank):
    """Return *rank* with text trimmed, blanks as NaN, and floats if possible.

    Non-string cells are left untouched by the trimming step, and the
    column is only converted to float when every remaining value parses;
    otherwise the trimmed column is returned unchanged.
    """
    try:
        stripped = rank.str.strip()
    except AttributeError:
        # Not a text column (e.g. already numeric) -- nothing to trim.
        pass
    else:
        # The .str accessor yields NaN for cells that are not strings;
        # fall back to the original value there so no data is lost.
        rank = stripped.where(stripped.notna(), rank)

    # Blank strings must become NaN *before* the cast, otherwise a single
    # empty cell makes the float conversion fail for the whole column.
    rank = rank.replace("", np.nan)
    try:
        return rank.astype(float)
    except (ValueError, TypeError):
        # Non-numeric ranks are present; keep the cleaned values as they are.
        return rank


def concat_sport(sport, i):
    """Merge every pickled frame of one sport into a single cleaned pickle.

    Reads the module-level globals ``files`` (file names found in
    ``comp_dir``), ``comp_dir`` (input directory) and ``final_dir`` (output
    directory), so those must be defined before this function is called.

    Args:
        sport: Sport name; a file belongs to the sport when its name is
            ``sport`` itself or starts with ``sport + "_"``.
        i: Row index handed to tqdm as ``position`` so that concurrent
            workers draw their progress bars on separate lines.

    Returns:
        None. The merged frame is written to ``<final_dir>/<sport>.pkl``.
    """
    # Match on the full leading token, not a bare prefix, so a sport whose
    # name is the beginning of another sport's name does not absorb its files.
    sport_files = [s for s in files
                   if s == sport or s.startswith(sport + "_")]
    if not sport_files:
        print("{} has no files to concatenate".format(sport))
        return

    first, rest = sport_files[0], sport_files[1:]

    # NOTE: unpickling executes arbitrary code -- only point comp_dir at
    # files this pipeline produced itself.
    sport_df = pd.read_pickle(os.path.join(comp_dir, first))

    # The first file is already loaded, so the bar only counts the rest.
    for f in tqdm(rest, total=len(rest), position=i):
        try:
            temp_df = pd.read_pickle(os.path.join(comp_dir, f))
            sport_df = pd.concat([sport_df, temp_df], axis=0)
        except ValueError:
            # Skip the offending file and carry on with the others.
            print("{} concat error".format(sport))

    # Because of the way we've been grouping, there will be lots of
    # all-NA columns; drop them.
    sport_df = sport_df.dropna(axis=1, how="all")
    sport_df = sport_df.infer_objects()

    # Replace Nones with NaNs.
    sport_df = sport_df.replace([None], [np.nan])

    # Fix up ranks (only when the column survived the all-NA drop).
    if 'rank' in sport_df.columns:
        sport_df['rank'] = _clean_rank(sport_df['rank'])

    save_fn = os.path.join(final_dir, "{}.pkl".format(sport))
    sport_df.to_pickle(save_fn)
if __name__ == "__main__":
    # Driver: fan the per-sport concatenation out over a process pool and
    # show one overall progress bar (row 0) above the per-worker bars.
    #
    # NOTE: concat_sport reads comp_dir, final_dir and files as module
    # globals. Worker processes only see them when the pool is started by
    # fork; with the "spawn" start method they are undefined in the workers.
    comp_dir = '<dir>'
    final_dir = '<dir>'
    files = os.listdir(comp_dir)

    # The sport name is everything before the first underscore.
    sports = [s.split("_")[0] for s in files]
    uq_sports = np.unique(sports)

    # Rows 1..7 are handed out round-robin for the workers' progress bars.
    cyc = cycle(range(1, 8))
    p = Pool(processes=7)
    # next(cyc) instead of cyc.next(): the method form exists only on
    # Python 2, the builtin works on both.
    res = [p.apply_async(concat_sport, (chunk, next(cyc)))
           for chunk in uq_sports]
    # No further tasks will be submitted; lets join() return once all finish.
    p.close()

    pbar = tqdm(total=len(uq_sports), position=0)
    sports_done = 0
    while True:
        new_sports_done = sum(r.ready() for r in res)
        if new_sports_done > sports_done:
            pbar.update(new_sports_done - sports_done)
            sports_done = new_sports_done
        # Test for completion *after* updating so the final tasks are
        # counted and the bar actually reaches 100%.
        if new_sports_done == len(res):
            break
        time.sleep(1)
    pbar.close()
    p.join()

    # apply_async stores worker exceptions in the result object; surface
    # them here instead of letting failed sports disappear silently.
    for chunk, r in zip(uq_sports, res):
        try:
            r.get()
        except Exception as err:
            print("{} failed: {!r}".format(chunk, err))

    print('\ncompleted :)')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment