# Source: GitHub gist snakers4/b246de548669543dc3b5dbb49d4c2f0c
# Author: @snakers4 (last active May 25, 2018 10:13)
#
# Using tqdm with multiprocessing
import tqdm
import pandas as pd
import numpy as np
from multiprocessing import Pool
import os
# Drop all the unknown points and all closed points: only rows whose STATUS is
# one of '0'..'5' are kept.
# For each SK_ID_BUREAU the time spent in each status is counted and then
# normalised by the longest period (we know of) in any of the meaningful statuses.
# The filter is evaluated once and reused for both the groupby and the id list.
_df_bb_meaningful = df_bb[df_bb.STATUS.isin(['0', '1', '2', '3', '4', '5'])]
# Group by a scalar key (not a one-element list) so that get_group() can be
# called with a plain id; recent pandas releases expect a tuple when the
# grouper is a length-1 list.
df_gr_by = _df_bb_meaningful.groupby(by='SK_ID_BUREAU')[['MONTHS_BALANCE', 'STATUS']]
sk_list = list(_df_bb_meaningful.SK_ID_BUREAU.unique())
def produce_bb_indexes(sk_id, groups=None):
    """Return the fraction of the observed period spent in each status for one id.

    Args:
        sk_id: Group key to look up (an SK_ID_BUREAU value).
        groups: Optional groupby object exposing ``get_group`` whose groups have
            ``MONTHS_BALANCE`` and ``STATUS`` columns. Defaults to the
            module-level ``df_gr_by``.

    Returns:
        A dict mapping each status code ``'0'``..``'5'`` to its share of the
        period between the smallest and largest ``MONTHS_BALANCE`` (inclusive).
        Months inside that period that have no row are attributed to ``'0'``.

    Raises:
        KeyError: If ``sk_id`` is not a key of the groupby object.
    """
    if groups is None:
        # Looked up at call time so worker processes use the module-level object.
        groups = df_gr_by
    history = groups.get_group(sk_id)

    first_month = history.MONTHS_BALANCE.min()
    last_month = history.MONTHS_BALANCE.max()
    # Inclusive number of months covered. Counting both end points keeps the
    # length >= 1 (no zero-division special case) and prevents the padding
    # below from going negative when every month in the span has a row.
    length = last_month - first_month + 1

    value_counts = history.STATUS.value_counts().to_dict()
    # Every status accepted by the upstream filter must be present so that all
    # returned dicts share the same keys (including '5').
    for status in ('0', '1', '2', '3', '4', '5'):
        value_counts.setdefault(status, 0)

    # Months within the span that have no row are counted as status '0'.
    # NOTE(review): assumes at most one row per month per id - confirm upstream.
    value_counts['0'] += length - sum(value_counts.values())

    return {status: count / length for status, count in value_counts.items()}
# Start the worker pool only when this file is executed as the main module.
# On platforms that start workers by re-importing the main module, an unguarded
# Pool() at import time would be re-executed inside every worker.
if __name__ == '__main__':
    with Pool(10) as p:
        # imap yields results in the order of sk_list, so count_data[i]
        # corresponds to sk_list[i]; tqdm needs total= because imap is lazy.
        count_data = list(tqdm.tqdm(p.imap(produce_bb_indexes, sk_list), total=len(sk_list)))
# (GitHub page footer: Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.)