Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Per-column value frequencies, keyed by column label; (re)filled by CountAll().
valueCounts = {}


def CountAll():
    """Refresh the module-level summary statistics of the global ``df``.

    Sets three globals:
        all_columns: list of the column labels of ``df``.
        nanCounts:   result of ``df.isnull().sum()`` (NaN count per column).
        valueCounts: dict mapping each column label to
                     ``df[label].value_counts()``.

    Returns None.
    """
    global all_columns, nanCounts, valueCounts
    all_columns = list(df)
    nanCounts = df.isnull().sum()
    # Rebuild in place (same dict object, so existing references stay valid)
    # and drop entries for columns that are no longer present in df;
    # otherwise counts from an earlier frame would linger after a re-run.
    valueCounts.clear()
    valueCounts.update((x, df[x].value_counts()) for x in all_columns)
"""Random but proportional replacement(RBPR) of numeric"""
def Fill_NaNs_Numeric(col, bin_size=45):
    """Impute the NaNs of numeric column ``col`` of the global ``df`` in place.

    Random but proportional replacement (RBPR): the observed (non-NaN) values
    are binned over ``np.linspace(min, max, bin_size)``. Bin 0 holds the
    values ``<= edges[0]`` (i.e. equal to the minimum) and bin ``i`` holds the
    values in ``(edges[i-1], edges[i]]``. Each bin is then given a share of
    the NaNs proportional to its share of the observed values, and every NaN
    assigned to a bin is replaced with that bin's mean. Which NaN row lands in
    which bin is decided with the ``random`` module.

    The counts are taken from ``df`` directly, so the function does not need
    ``CountAll()`` to have been run (or to be up to date).

    Args:
        col: label of the column of ``df`` to fill.
        bin_size: number of bins / bin edges. Defaults to 45, the value that
            used to be hard-coded.

    Raises:
        ValueError: if ``bin_size`` is smaller than 1.

    Returns:
        None. ``df`` is modified in place. The call is a no-op when the
        column has no NaNs or has no observed values to learn from.
    """
    if bin_size < 1:
        raise ValueError("bin_size must be a positive integer")

    # Positional boolean mask: unlike index labels it stays correct when the
    # frame's index contains duplicate labels.
    is_missing = df[col].isnull().to_numpy()
    n_missing = int(is_missing.sum())
    observed = df.loc[~is_missing, col].to_numpy(dtype=float)
    if n_missing == 0 or observed.size == 0:
        return

    # Assign every observed value to its bin. side="left" yields the index i
    # with edges[i-1] < v <= edges[i]; the clip only matters for bin_size == 1,
    # where everything belongs to the single bin.
    edges = np.linspace(observed.min(), observed.max(), bin_size)
    bin_of = np.minimum(
        np.searchsorted(edges, observed, side="left"), bin_size - 1
    )
    non_NaNs_per_bin = np.bincount(bin_of, minlength=bin_size)
    sum_per_bin = np.bincount(bin_of, weights=observed, minlength=bin_size)

    # Mean of each bin. Empty bins fall back to the overall mean: the random
    # adjustment below may hand a NaN to an empty bin, and that NaN must still
    # receive a sensible value.
    mean_of_bins = np.full(bin_size, observed.mean())
    occupied = non_NaNs_per_bin > 0
    mean_of_bins[occupied] = sum_per_bin[occupied] / non_NaNs_per_bin[occupied]

    # Expected number of NaNs per bin, rounded to whole rows.
    proportion = np.around(
        non_NaNs_per_bin / observed.size * n_missing
    ).astype(int)

    # Rounding can leave the total off by a few rows; add or remove single
    # rows at random bins until the total equals the number of NaNs.
    diff = n_missing - int(proportion.sum())
    while diff > 0:
        proportion[random.randrange(bin_size)] += 1
        diff -= 1
    while diff < 0:
        idx = random.randrange(bin_size)
        if proportion[idx] > 0:
            proportion[idx] -= 1
            diff += 1

    # One fill value per NaN (bin mean repeated by the bin's quota), shuffled
    # so that the pairing of NaN rows and bins is random.
    fill_values = np.repeat(mean_of_bins, proportion).tolist()
    random.shuffle(fill_values)
    df.loc[is_missing, col] = np.asarray(fill_values, dtype=float)
"""-------------------------------------------------------------------------"""
@abcorep

This comment has been minimized.

Copy link

abcorep commented Dec 14, 2019

Hey, @Vernal-Inertia, before anything, thanks for sharing the code and writing the article on RBPR; it's quite elucidating!
I was wondering if you could explain the proportion calculation a little further? It's not totally clear to me yet.

@Vernal-Inertia

This comment has been minimized.

Copy link
Owner Author

Vernal-Inertia commented Dec 15, 2019

Hello @abcorep, thank you for taking the time to read the article. The mathematics is actually simple. Assume there are 2 boys and 3 girls, and you have to distribute 10 chocolates among them. One answer is to give 2 chocolates to each child. But from a different perspective there is another solution, wherein we distribute the chocolates in proportion to the size of each group:

(no. of girls)/(total children) = 3/5 --> this is the ratio of girls to total children.
So how many chocolates go to only girls? --> (3/5)*10 = 6
And how many chocolates go to only boys? --> (2/5)*10 = 4

Thus 10 (the number of chocolates) has been split 6:4, which is 3:2, that is, the ratio of girls to boys. Hope this resolves your doubt.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.