Skip to content

Instantly share code, notes, and snippets.

@papanisaicharan
Last active May 9, 2019 13:50
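"""
Handling noisy data during data preprocessing: the binning method.

Sorted values are partitioned into equal-depth bins and then smoothed,
either by bin means or by bin boundaries.
"""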
import math
import random
import statistics
def can_partition(size, bins):
    """Return True when *size* items split into *bins* equal, non-empty bins.

    Both values must be positive and *bins* must divide *size* exactly.
    Checking the sign first avoids the division by zero that ``size % 0``
    would raise and rejects negative bin counts, which would otherwise
    pass the divisibility test and produce an empty partition.
    """
    return size > 0 and bins > 0 and size % bins == 0


def partition_equi_depth(values, bins):
    """Split *values* into *bins* consecutive bins of equal length.

    Args:
        values: Sequence of numbers; it is sliced in the order given, so
            pass it already sorted for a meaningful equal-depth partition.
        bins: Number of bins; must be positive and divide ``len(values)``.

    Returns:
        A list of ``bins`` lists, each holding ``len(values) // bins`` items.

    Raises:
        ValueError: If ``len(values)`` cannot be split into *bins* equal,
            non-empty bins.
    """
    size = len(values)
    if not can_partition(size, bins):
        raise ValueError("bins must be positive and divide the number of values")
    depth = size // bins  # exact integer division; no float round-trip
    return [list(values[start:start + depth]) for start in range(0, size, depth)]


def smooth_by_bin_means(bins):
    """Replace every value in each bin with that bin's arithmetic mean.

    The mean is computed once per bin and repeated for each member.
    An empty bin stays empty.
    """
    smoothed = []
    for members in bins:
        if not members:
            smoothed.append([])
            continue
        centre = statistics.mean(members)
        smoothed.append([centre] * len(members))
    return smoothed


def smooth_by_bin_boundaries(bins):
    """Replace every value with the closer of its bin's first and last item.

    Each bin is expected to be sorted ascending, so its first and last
    items are the minimum and maximum. A value equally distant from both
    boundaries is mapped to the lower one. An empty bin stays empty.
    """
    smoothed = []
    for members in bins:
        if not members:
            smoothed.append([])
            continue
        low, high = members[0], members[-1]
        smoothed.append(
            [low if value - low <= high - value else high for value in members]
        )
    return smoothed


def main():
    """Prompt for a sample size and bin count, then print the smoothed bins.

    Draws ``size`` random integers below 100, sorts them, and shows the
    equal-depth partition together with both smoothing results. Prints
    "incorrect input" when the bin count cannot split the sample evenly
    (including zero or negative sizes and bin counts).
    """
    size = int(input("enter the size : "))
    # sorting is required: the bin boundaries are taken from each bin's ends
    numbers = sorted(random.randrange(100) for _ in range(size))
    print("chosen numbers : ", numbers)
    bins = int(input("How many bins(input should divide the size perfectly)? : "))

    if not can_partition(size, bins):
        print("incorrect input")
        return

    equi_depth = partition_equi_depth(numbers, bins)
    print("Partition into (equi-depth) bins : ", equi_depth)
    print("Smoothing by bin means : ", smooth_by_bin_means(equi_depth))
    print("Smoothing by bin boundaries : ", smooth_by_bin_boundaries(equi_depth))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment