Last active
August 28, 2019 20:05
-
-
Save abdel1979/47b239fae381737ea2a210ec3ada18c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import matplotlib.pyplot as plt | |
from scipy import special | |
import numpy as np | |
# Read the data: a space-separated file without a header row, so pandas
# labels the columns 0, 1, ...
data = pd.read_csv('raw_scores.list1', sep=' ', header=None)
# Column 1 holds the values analysed below (called "POI" in the messages).
x = data[1]

# Check data description and shape
print(data.describe())

# Check if any duplicate lines (rows whose column-0 value was already seen)
duplicateID = data[data.duplicated(subset=0)]
print("Duplicated lines count is :", duplicateID.shape[0])

# Check if any outliers (on or outside the bounds 0 and 10).
# The comparisons are inclusive, so the messages say "or equal" to match
# what is actually counted.
outl = data[data[1] >= 10.0]
print("Outliers count (POI greater than or equal to 10) = ", outl.shape[0])
outl = data[data[1] <= 0.0]
print("Outliers count (POI less than or equal to 0) = ", outl.shape[0])

# Bar plot of the counts per [k, k+1] range to look at the data.
# The bin edges run from 0 to 10 inclusive so the last range (9, 10] is not
# dropped, and sort_index() keeps the bars in range order instead of the
# count-descending order value_counts() returns by default.
data[1].value_counts(bins=list(range(11))).sort_index().plot.bar(figsize=(10, 5))
plt.xlabel('range of POI')
plt.ylabel('Occurences')
plt.tight_layout()
plt.show()
# Start experimenting to find the best distribution + normalisation that meets the requirements below:
# Requirement 1: Limit the values between a minimum constant value, noted "Min", and a maximum constant value, noted "Max".
# Requirement 2: The density of points should decrease significantly going from the "Min" value to the "Max" value.
# In our experimentation we define: dif = minimum(difference between the counts in [k-1,k] and [k,k+1]), which should be greater than 0 (we raise a bingo if >= 5).
# Requirement 3: Very few values should be near Max.
# Definition of some useful functions | |
## Function 'countBin' to calculate how many POI in each [k,k+1] | |
def countBin(l, i):
    """Return how many values of ``l`` fall in the single bin ``[i, i + 1]``.

    Args:
        l: object exposing ``value_counts(bins=...)`` (a pandas Series in
            this script).
        i: lower edge of the bin; the upper edge is ``i + 1``.

    Returns:
        The count reported for that bin, or 0 when ``value_counts`` returns
        no entry at all.
    """
    # Bin once and reuse the result instead of running value_counts twice.
    counts = l.value_counts(bins=[i, i + 1]).tolist()
    return counts[0] if counts else 0
# Create 'check_requirements' function to check results agree to above requirements. | |
def check_requirements(l, expected_total=91897):
    """Print the per-range counts of ``l`` and flag a result meeting the requirements.

    Prints the number of values in each range [k, k+1] for k = 0..9, then a
    summary line with:
      * total: sum of the ten range counts,
      * dif:   smallest drop between two consecutive ranges
               (count[k-1] - count[k]), capped at 10,
      * t2:    count of the last range [9, 10].
    A "BINGO" banner is printed when total == expected_total, dif >= 5 and
    5 <= t2 < 250.

    Args:
        l: values to check; passed to ``countBin``.
        expected_total: total count required for a bingo. Defaults to 91897,
            the value previously hard-coded in the check.
    """
    # Count every range exactly once; the counts are reused for the
    # printout, the total and the consecutive differences.
    counts = [countBin(l, k) for k in range(10)]

    print("range [ 0 - 1 ]", counts[0])
    for k in range(1, 10):
        print("range [", k, "-", k + 1, "]", counts[k])

    t1 = sum(counts)
    t2 = counts[9]
    # Starting from 10 means dif never reports more than 10.
    dif = min([10] + [counts[k - 1] - counts[k] for k in range(1, 10)])

    print("total=", t1, "dif=", dif, "t2=", t2)
    if (t1 == expected_total) and (dif >= 5) and (t2 in range(5, 250)):
        print("==========================================")
        print("============== BINGO =====================")
        print("==========================================")
def Experiment_dis(distribution, l, n, m, step):
    """Sweep a shape parameter for one distribution formula and check each result.

    For every ``i`` in ``np.arange(n, m, step)`` the chosen formula is applied
    to ``l``, the result is inverted (``1 / y``), rescaled to [0, 10] and
    passed to ``check_requirements``. The result of the last ``i`` is stored
    in column 2 of the module-level ``data`` frame and its head is printed.

    Args:
        distribution: one of "zipfian", "pareto", "binomial", "lomax",
            "weibull".
        l: input values (a pandas Series in this script).
        n: first value of the swept parameter.
        m: end of the sweep, as given to ``np.arange``.
        step: increment of the sweep.

    Raises:
        ValueError: if ``distribution`` is not one of the supported names.
    """
    # One formula per distribution name; ``s`` is the swept parameter.
    # NOTE(review): scipy's zetac(s) is zeta(s) - 1; the constant factor
    # cancels in the min-max rescaling below, so it is kept as written.
    formulas = {
        "zipfian": lambda s: (l + 1) ** (-s) / special.zetac(s),
        "pareto": lambda s: s / l ** (s + 1),
        "binomial": lambda s: (1 / s) ** (0.4 * l),
        "lomax": lambda s: 1 / (s + l) ** (4),
        "weibull": lambda s: (5 / s) * (l / s) ** (5 - 1) * np.exp(-(l / s) ** 5),
    }
    if distribution not in formulas:
        # Previously an unknown name left ``y`` unbound (or stale from the
        # previous iteration); fail early with a clear message instead.
        raise ValueError(
            "Unknown distribution %r; expected one of %s"
            % (distribution, sorted(formulas))
        )
    formula = formulas[distribution]

    y = None
    for i in np.arange(n, m, step):
        y = formula(i)
        y = 1 / y  # to preserve order (Requirement4) since all distribution involved will inverse the order.
        lowest, highest = y.min(), y.max()
        y = 10 * (y - lowest) / (highest - lowest)  # Normalisation to [0,10]
        print("i=", i)
        check_requirements(y)
        print("-----")

    # NOTE(review): the original indentation was lost; storing the result is
    # assumed to happen once, after the sweep. Skipped when the sweep is empty.
    if y is not None:
        data[2] = y
        print(data.head())
# Parameter sweeps that were run for each candidate distribution, with the
# best result observed for each:
#Experiment_dis("zipfian",x,1,5,0.1)
#best score obtained is dif=10 t2=7 for i=2.6
#Experiment_dis("pareto",x,1,5,0.1)
#best score obtained is dif=9 t2=7 for i=1.2
#Experiment_dis("binomial",x,1,5,0.1)
#best score obtained is dif=10 t2=6 for i=1.8
#Experiment_dis("lomax",x,1,10,0.1)
#best score obtained is dif=9 t2=7 for i=7.7
#Experiment_dis("weibull",x,1,2,0.1)
# Did not give good result, hence not adapted

### We choose then Zipfian with shape parameter = 2.6 since it present the best score with regard to requirements.
# The sweep starts at 2.6 and stops before 2.65, so it contains exactly i = 2.6.
# The previous bounds (2.5, 2.6) only reached 2.6 because of floating-point
# rounding in np.arange's length computation with a 0.1 step.
Experiment_dis("zipfian", x, 2.6, 2.65, 0.1)

## Drawing Plot to see new distribution after normalisation using zipfian.
# The values were rescaled to [0, 10], so the bin edges run from 0 to 10
# inclusive (otherwise the (9, 10] range is left out), and sort_index() keeps
# the bars in range order.
data[2].value_counts(bins=list(range(11))).sort_index().plot.bar(figsize=(10, 5))
plt.xlabel('range of POI')
plt.ylabel('Occurences')
plt.tight_layout()
plt.show()

## Saving the output into CSV
data.to_csv(r'submission1.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment