abdel1979/gist:47b239fae381737ea2a210ec3ada18c5

## gistfile1.txt
import pandas as pd
import matplotlib.pyplot as plt
from scipy import special
import numpy as np


# Read the data: space-separated file without a header row, so the columns
# are labelled 0, 1, ... by pandas.
data = pd.read_csv('raw_scores.list1',sep=' ', header=None)
x = data[1]  # column 1 holds the values analysed in the rest of the script

# Print summary statistics of the loaded columns
print (data.describe())


# Check for duplicate lines: rows whose column 0 value repeats an earlier row
duplicateID = data[data.duplicated(subset=0)]
print("Duplicated lines count is :",duplicateID.shape[0])


## Check for outliers (values at or beyond the bounds 0 and 10)
outl=data[data[1] >= 10.0]
print("Outliers count (POI greater than 10) = ", outl.shape[0])
outl=data[data[1] <= 0.0]
print("Outliers count (POI less than 0) = ", outl.shape[0])

# Bar plot of the number of values per unit-wide bin.
# The bin edges run from 0 to 10 so the top bin (9, 10] is drawn as well, and
# sort=False keeps the bars in bin order instead of ordering them by count.
data[1].value_counts(bins=list(range(11)), sort=False).plot.bar(figsize=(10,5))
plt.xlabel('range of POI')
plt.ylabel('Occurences')
plt.tight_layout()
plt.show()


# Start experimenting to find the distribution + normalisation that best meets the requirements below:
# Requirement 1: Limit the values between a minimum constant value, noted "Min", and a maximum constant value, noted "Max".
# Requirement 2: The density of points should decrease significantly going from the "Min" value to the "Max" value.
#   In our experimentation we define: dif = minimum over k of (count in [k-1,k] - count in [k,k+1]); it should be greater than 0 (we raise a bingo if dif >= 5).
# Requirement 3: Very few values should be near Max.


# Definition of some useful functions
## Function 'countBin' to calculate how many POI in each [k,k+1]
def countBin(l,i):
    """Count the values of ``l`` that fall in the bin with edges ``i`` and ``i + 1``.

    Parameters:
        l: object providing pandas' ``value_counts(bins=...)`` (the script
            passes a ``pandas.Series``).
        i: lower edge of the bin; the upper edge is ``i + 1``.

    Returns:
        The count reported for that single bin, or 0 when ``value_counts``
        returns an empty result.
    """
    # Bin once and reuse the result instead of running value_counts twice.
    counts = l.value_counts(bins=[i, i + 1]).tolist()
    return counts[0] if counts else 0


# Create 'check_requirements' function to check results agree to above requirements.
def check_requirements(l, expected_total=91897):
    """Print the per-bin counts of ``l`` and announce a "BINGO" when they fit.

    The ten unit-wide bins [k, k+1] for k = 0..9 are counted with
    ``countBin``. Three figures are then printed:

    * ``total`` - sum of the ten bin counts,
    * ``dif``   - smallest drop between consecutive bins
      (count[k-1] - count[k]), capped at 10 by its starting value,
    * ``t2``    - count of the last bin [9, 10].

    The banner is printed when ``total == expected_total``, ``dif >= 5`` and
    ``5 <= t2 < 250``.

    Parameters:
        l: values to check; must be accepted by ``countBin``.
        expected_total: total the bin counts must add up to. Defaults to
            91897, the value previously hard-coded here.

    NOTE(review): adjacent bins share their edge value; confirm whether values
    lying exactly on an integer edge are counted twice in ``total``.
    """
    # Count every bin exactly once; each countBin call re-bins the whole input.
    counts = [countBin(l, k) for k in range(10)]

    print("range [ 0 - 1 ]", counts[0])
    for k in range(1, 10):
        print("range [",k,"-",k+1,"]", counts[k])

    total = sum(counts)
    t2 = counts[9]

    # Smallest decrease between neighbouring bins, starting from a cap of 10.
    dif = 10
    for previous, current in zip(counts, counts[1:]):
        dif = min(dif, previous - current)

    print("total=" ,total, "dif=", dif, "t2=", t2)
    if (total == expected_total) and (dif>=5) and (t2 in range(5,250)):
        print("==========================================")
        print("============== BINGO =====================")
        print("==========================================")


def Experiment_dis(distribution,l,n,m,step):
    """Try a range of shape parameters for one distribution formula.

    For every ``i`` in ``np.arange(n, m, step)`` the chosen formula is applied
    to ``l``, the result is inverted, min-max normalised to [0, 10], and
    passed to ``check_requirements`` (which prints its findings).

    Side effects: after the loop, the normalised values of the LAST ``i``
    tried are stored in column 2 of the module-level ``data`` frame and
    ``data.head()`` is printed. Nothing is returned.

    Parameters:
        distribution: one of "zipfian", "pareto", "binomial", "lomax",
            "weibull".
        l: the input values; the script passes a ``pandas.Series`` (required
            further down, where ``countBin`` calls ``value_counts``).
        n, m, step: start, stop (exclusive) and step handed to ``np.arange``.

    Raises:
        ValueError: if ``distribution`` is not a supported name, or if the
            parameter range is empty (previously both cases ended in an
            ``UnboundLocalError`` on ``y``).
    """
    supported = ("zipfian", "pareto", "binomial", "lomax", "weibull")
    if distribution not in supported:
        raise ValueError(
            "unknown distribution %r, expected one of %s" % (distribution, supported)
        )

    shape_params = np.arange(n, m, step)
    if len(shape_params) == 0:
        raise ValueError("empty parameter range: np.arange(%r, %r, %r)" % (n, m, step))

    for i in shape_params:
        if distribution == "zipfian":
            # zetac(i) is only a scalar factor here; a positive constant
            # factor cancels out in the min-max normalisation below.
            y = (l + 1) ** (-i) / special.zetac(i)
        elif distribution == "pareto":
            y = i / l ** (i + 1)
        elif distribution == "binomial":
            y = (1 / i) ** (0.4 * l)
        elif distribution == "lomax":
            y = 1 / (i + l) ** (4)
        else:  # "weibull"
            y = (5 / i) * (l / i) ** (5 - 1) * np.exp(-(l / i) ** 5)

        # Invert the result: per the original author's note the formulas above
        # reverse the ordering of the inputs, and inverting restores it.
        y = 1 / y
        # Normalisation to [0,10]; vectorised min/max instead of the Python
        # builtins, which walk the Series element by element.
        lowest = y.min()
        highest = y.max()
        y = 10 * (y - lowest) / (highest - lowest)
        print("i=", i)
        check_requirements(y)
        print("-----")

    # Keep the result of the last parameter tried in the shared data frame.
    data[2] = y
    print(data.head())


#Experiment_dis("zipfian",x,1,5,0.1)
#best score obtained is dif=10 t2=7 for i=2.6

#Experiment_dis("pareto",x,1,5,0.1)
#best score obtained is dif=9 t2=7 for i=1.2

#Experiment_dis("binomial",x,1,5,0.1)
#best score obtained is dif=10 t2=6 for i=1.8

#Experiment_dis("lomax",x,1,10,0.1)
#best score obtained is dif=9 t2=7 for i=7.7

#Experiment_dis("weibull",x,1,2,0.1)
# Did not give good results, hence not adopted


### We then choose Zipfian with shape parameter = 2.6 since it presents the best score with regard to the requirements.
# Experiment_dis stores only the last parameter it tries in data[2]. The range
# 2.6 .. 2.65 with step 0.1 contains exactly one value (2.6); the earlier
# bounds (2.5, 2.6) reached 2.6 only through np.arange's floating-point
# rounding of a non-integer step.
Experiment_dis("zipfian",x,2.6,2.65,0.1)


## Bar plot of the new distribution after normalisation using zipfian.
# The bin edges run from 0 to 10 so the top bin (9, 10] - the one Requirement 3
# is about - is drawn too; sort=False keeps the bars in bin order, so the plot
# shows the real shape instead of always looking like a decreasing staircase.
data[2].value_counts(bins=list(range(11)), sort=False).plot.bar(figsize=(10,5))
plt.xlabel('range of POI')
plt.ylabel('Occurences')
plt.tight_layout()
plt.show()


## Saving the output into CSV
data.to_csv(r'submission1.csv')
	import pandas as pd
	import matplotlib.pyplot as plt
	from scipy import special
	import numpy as np


	# Read the data: space-separated file without a header row, so the columns
	# are labelled 0, 1, ... by pandas.
	data = pd.read_csv('raw_scores.list1',sep=' ', header=None)
	x = data[1]  # column 1 holds the values analysed in the rest of the script

	# Print summary statistics of the loaded columns
	print (data.describe())



	# Check for duplicate lines: rows whose column 0 value repeats an earlier row
	duplicateID = data[data.duplicated(subset=0)]
	print("Duplicated lines count is :",duplicateID.shape[0])


	## Check for outliers (values at or beyond the bounds 0 and 10)
	outl=data[data[1] >= 10.0]
	print("Outliers count (POI greater than 10) = ", outl.shape[0])
	outl=data[data[1] <= 0.0]
	print("Outliers count (POI less than 0) = ", outl.shape[0])

	# Bar plot of the number of values per unit-wide bin.
	# The bin edges run from 0 to 10 so the top bin (9, 10] is drawn as well, and
	# sort=False keeps the bars in bin order instead of ordering them by count.
	data[1].value_counts(bins=list(range(11)), sort=False).plot.bar(figsize=(10,5))
	plt.xlabel('range of POI')
	plt.ylabel('Occurences')
	plt.tight_layout()
	plt.show()


	# Start experimenting to find the distribution + normalisation that best meets the requirements below:
	# Requirement 1: Limit the values between a minimum constant value, noted "Min", and a maximum constant value, noted "Max".
	# Requirement 2: The density of points should decrease significantly going from the "Min" value to the "Max" value.
	#   In our experimentation we define: dif = minimum over k of (count in [k-1,k] - count in [k,k+1]); it should be greater than 0 (we raise a bingo if dif >= 5).
	# Requirement 3: Very few values should be near Max.


	# Definition of some useful functions
	## Function 'countBin' to calculate how many POI in each [k,k+1]
	def countBin(l,i):
	    """Count the values of ``l`` that fall in the bin with edges ``i`` and ``i + 1``.

	    Parameters:
	        l: object providing pandas' ``value_counts(bins=...)`` (the script
	            passes a ``pandas.Series``).
	        i: lower edge of the bin; the upper edge is ``i + 1``.

	    Returns:
	        The count reported for that single bin, or 0 when ``value_counts``
	        returns an empty result.
	    """
	    # Bin once and reuse the result instead of running value_counts twice.
	    counts = l.value_counts(bins=[i, i + 1]).tolist()
	    return counts[0] if counts else 0



	# Create 'check_requirements' function to check results agree to above requirements.
	def check_requirements(l, expected_total=91897):
	    """Print the per-bin counts of ``l`` and announce a "BINGO" when they fit.

	    The ten unit-wide bins [k, k+1] for k = 0..9 are counted with
	    ``countBin``. Three figures are then printed:

	    * ``total`` - sum of the ten bin counts,
	    * ``dif``   - smallest drop between consecutive bins
	      (count[k-1] - count[k]), capped at 10 by its starting value,
	    * ``t2``    - count of the last bin [9, 10].

	    The banner is printed when ``total == expected_total``, ``dif >= 5`` and
	    ``5 <= t2 < 250``.

	    Parameters:
	        l: values to check; must be accepted by ``countBin``.
	        expected_total: total the bin counts must add up to. Defaults to
	            91897, the value previously hard-coded here.

	    NOTE(review): adjacent bins share their edge value; confirm whether values
	    lying exactly on an integer edge are counted twice in ``total``.
	    """
	    # Count every bin exactly once; each countBin call re-bins the whole input.
	    counts = [countBin(l, k) for k in range(10)]

	    print("range [ 0 - 1 ]", counts[0])
	    for k in range(1, 10):
	        print("range [",k,"-",k+1,"]", counts[k])

	    total = sum(counts)
	    t2 = counts[9]

	    # Smallest decrease between neighbouring bins, starting from a cap of 10.
	    dif = 10
	    for previous, current in zip(counts, counts[1:]):
	        dif = min(dif, previous - current)

	    print("total=" ,total, "dif=", dif, "t2=", t2)
	    if (total == expected_total) and (dif>=5) and (t2 in range(5,250)):
	        print("==========================================")
	        print("============== BINGO =====================")
	        print("==========================================")





	def Experiment_dis(distribution,l,n,m,step):
	    """Try a range of shape parameters for one distribution formula.

	    For every ``i`` in ``np.arange(n, m, step)`` the chosen formula is applied
	    to ``l``, the result is inverted, min-max normalised to [0, 10], and
	    passed to ``check_requirements`` (which prints its findings).

	    Side effects: after the loop, the normalised values of the LAST ``i``
	    tried are stored in column 2 of the module-level ``data`` frame and
	    ``data.head()`` is printed. Nothing is returned.

	    Parameters:
	        distribution: one of "zipfian", "pareto", "binomial", "lomax",
	            "weibull".
	        l: the input values; the script passes a ``pandas.Series`` (required
	            further down, where ``countBin`` calls ``value_counts``).
	        n, m, step: start, stop (exclusive) and step handed to ``np.arange``.

	    Raises:
	        ValueError: if ``distribution`` is not a supported name, or if the
	            parameter range is empty (previously both cases ended in an
	            ``UnboundLocalError`` on ``y``).
	    """
	    supported = ("zipfian", "pareto", "binomial", "lomax", "weibull")
	    if distribution not in supported:
	        raise ValueError(
	            "unknown distribution %r, expected one of %s" % (distribution, supported)
	        )

	    shape_params = np.arange(n, m, step)
	    if len(shape_params) == 0:
	        raise ValueError("empty parameter range: np.arange(%r, %r, %r)" % (n, m, step))

	    for i in shape_params:
	        if distribution == "zipfian":
	            # zetac(i) is only a scalar factor here; a positive constant
	            # factor cancels out in the min-max normalisation below.
	            y = (l + 1) ** (-i) / special.zetac(i)
	        elif distribution == "pareto":
	            y = i / l ** (i + 1)
	        elif distribution == "binomial":
	            y = (1 / i) ** (0.4 * l)
	        elif distribution == "lomax":
	            y = 1 / (i + l) ** (4)
	        else:  # "weibull"
	            y = (5 / i) * (l / i) ** (5 - 1) * np.exp(-(l / i) ** 5)

	        # Invert the result: per the original author's note the formulas above
	        # reverse the ordering of the inputs, and inverting restores it.
	        y = 1 / y
	        # Normalisation to [0,10]; vectorised min/max instead of the Python
	        # builtins, which walk the Series element by element.
	        lowest = y.min()
	        highest = y.max()
	        y = 10 * (y - lowest) / (highest - lowest)
	        print("i=", i)
	        check_requirements(y)
	        print("-----")

	    # Keep the result of the last parameter tried in the shared data frame.
	    data[2] = y
	    print(data.head())



	#Experiment_dis("zipfian",x,1,5,0.1)
	#best score obtained is dif=10 t2=7 for i=2.6

	#Experiment_dis("pareto",x,1,5,0.1)
	#best score obtained is dif=9 t2=7 for i=1.2

	#Experiment_dis("binomial",x,1,5,0.1)
	#best score obtained is dif=10 t2=6 for i=1.8

	#Experiment_dis("lomax",x,1,10,0.1)
	#best score obtained is dif=9 t2=7 for i=7.7

	#Experiment_dis("weibull",x,1,2,0.1)
	# Did not give good results, hence not adopted


	### We then choose Zipfian with shape parameter = 2.6 since it presents the best score with regard to the requirements.
	# Experiment_dis stores only the last parameter it tries in data[2]. The range
	# 2.6 .. 2.65 with step 0.1 contains exactly one value (2.6); the earlier
	# bounds (2.5, 2.6) reached 2.6 only through np.arange's floating-point
	# rounding of a non-integer step.
	Experiment_dis("zipfian",x,2.6,2.65,0.1)


	## Bar plot of the new distribution after normalisation using zipfian.
	# The bin edges run from 0 to 10 so the top bin (9, 10] - the one Requirement 3
	# is about - is drawn too; sort=False keeps the bars in bin order, so the plot
	# shows the real shape instead of always looking like a decreasing staircase.
	data[2].value_counts(bins=list(range(11)), sort=False).plot.bar(figsize=(10,5))
	plt.xlabel('range of POI')
	plt.ylabel('Occurences')
	plt.tight_layout()
	plt.show()



	## Saving the output into CSV
	data.to_csv(r'submission1.csv')