Last active
May 25, 2018 10:13
-
-
Save snakers4/b246de548669543dc3b5dbb49d4c2f0c to your computer and use it in GitHub Desktop.
Using tqdm with multiprocessing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tqdm | |
import pandas as pd | |
import numpy as np | |
from multiprocessing import Pool | |
import os | |
# Drop all rows with an unknown status and all rows with a closed status.
# For each SK_ID_BUREAU, count how much time was spent in each remaining status,
# then normalize by the longest period length (that we know of) covered by the meaningful statuses.
# Keep only the rows whose STATUS is one of '0'..'5'; every other status
# value is dropped. The filter is evaluated once and shared by the group-by
# object and the id list so both are guaranteed to describe the same rows.
_meaningful_rows = df_bb[df_bb.STATUS.isin(['0', '1', '2', '3', '4', '5'])]

# Group by the scalar column name (not a one-element list) so that
# get_group() can be called with a plain SK_ID_BUREAU value on every
# pandas version.
df_gr_by = _meaningful_rows.groupby(by='SK_ID_BUREAU')[['MONTHS_BALANCE', 'STATUS']]

# Every SK_ID_BUREAU that still has at least one row after filtering.
sk_list = list(_meaningful_rows.SK_ID_BUREAU.unique())
def produce_bb_indexes(sk_id, grouped=None):
    """Return the share of the observed period spent in each status for one id.

    Args:
        sk_id: Group key to look up (an SK_ID_BUREAU value).
        grouped: Group-by object exposing ``get_group`` whose groups have
            ``MONTHS_BALANCE`` and ``STATUS`` columns. Defaults to the
            module-level ``df_gr_by``.

    Returns:
        dict mapping each status ``'0'``..``'5'`` (plus any other status
        present in the group) to its count divided by the period length.
        Months inside the period that have no row in the group are credited
        to status ``'0'``.

    Raises:
        KeyError: if ``sk_id`` is not a key of the group-by object.
    """
    if grouped is None:
        grouped = df_gr_by
    group = grouped.get_group(sk_id)

    first_month = group.MONTHS_BALANCE.min()
    last_month = group.MONTHS_BALANCE.max()
    # The period is inclusive on both ends, so a single-row group has
    # length 1 and a group covering months -2..0 has length 3.
    length = last_month - first_month + 1

    value_counts = dict(group.STATUS.value_counts())
    # Zero-fill every meaningful status so all results share the same keys.
    for status in ('0', '1', '2', '3', '4', '5'):
        value_counts.setdefault(status, 0)

    # Months in the period without a row are counted as status '0'.
    # NOTE(review): assumes at most one row per month per id; duplicate
    # months would make this adjustment negative - confirm against the data.
    value_counts['0'] += length - sum(value_counts.values())

    return {status: count / length for status, count in value_counts.items()}
# Start the worker pool only when this file is run as the main module:
# under the "spawn" start method each worker re-imports the main module,
# and an unguarded Pool() would be started again inside every worker.
if __name__ == '__main__':
    with Pool(10) as p:
        # imap yields results lazily and in input order, which lets tqdm
        # advance the progress bar as each id finishes; total= is required
        # because the imap iterator has no length.
        count_data = list(
            tqdm.tqdm(p.imap(produce_bb_indexes, sk_list), total=len(sk_list))
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment