Skip to content

Instantly share code, notes, and snippets.

@abdel1979
Last active August 28, 2019 20:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abdel1979/47b239fae381737ea2a210ec3ada18c5 to your computer and use it in GitHub Desktop.
Save abdel1979/47b239fae381737ea2a210ec3ada18c5 to your computer and use it in GitHub Desktop.
import pandas as pd
import matplotlib.pyplot as plt
from scipy import special
import numpy as np
# Load the raw scores. The file is space-separated and has no header row,
# so columns are addressed by position (0, 1, ...).
data = pd.read_csv('raw_scores.list1', sep=' ', header=None)
x = data[1]  # column 1 is the score series that gets re-distributed later

# First look at the data: summary statistics of every numeric column.
print(data.describe())

# Rows whose column-0 value repeats that of an earlier row.
duplicateID = data.loc[data.duplicated(subset=0)]
print("Duplicated lines count is :", len(duplicateID))

# Scores sitting at or beyond either end of the 0..10 scale.
# NOTE(review): the comparisons are inclusive (>= 10, <= 0) while the
# messages say "greater than" / "less than" - confirm which is intended.
outl = data.loc[data[1] >= 10.0]
print("Outliers count (POI greater than 10) = ", len(outl))
outl = data.loc[data[1] <= 0.0]
print("Outliers count (POI less than 0) = ", len(outl))
# Bar plot of how many raw scores fall in each [k, k+1] range.
# sort=False keeps the bars in bin order (the default orders them by
# frequency, which makes every histogram look like a decreasing one), and
# the edges run up to 10 so that scores in (9, 10] are not silently dropped.
data[1].value_counts(bins=list(range(11)), sort=False).plot.bar(figsize=(10, 5))
plt.xlabel('range of POI')
plt.ylabel('Occurences')
plt.tight_layout()
plt.show()
# Start experimenting to find the distribution + normalisation that best satisfies the requirements below:
# Requirement 1: Limit the values between a minimum constant value, noted "Min", and a maximum constant value, noted "Max".
# Requirement 2: The density of points should decrease significantly going from the "Min" value to the "Max" value.
# In our experimentation we define: dif = minimum over k of (count in [k-1,k] minus count in [k,k+1]); it should be greater than 0 (we raise a "bingo" if it is >= 5).
# Requirement 3: Very few values should be near Max.
# Definition of some useful functions
## Function 'countBin' to calculate how many POI in each [k,k+1]
def countBin(l, i):
    """Return how many values of ``l`` fall in the unit bin ``[i, i + 1]``.

    Parameters
    ----------
    l : pandas.Series
        Numeric values to count (must support ``value_counts(bins=...)``).
    i : int or float
        Lower edge of the bin; the upper edge is ``i + 1``.

    Returns
    -------
    int
        The count for that single bin, or 0 when ``value_counts`` yields an
        empty result (which it can do when nothing falls in the bin).
    """
    # Bin once and reuse the result: the counting pass over the whole series
    # is the expensive part and the original performed it twice per call.
    counts = l.value_counts(bins=[i, i + 1]).tolist()
    return counts[0] if counts else 0
# Create 'check_requirements' function to check results agree to above requirements.
def check_requirements(l, expected_total=91897):
    """Print the per-bin counts of ``l`` and report whether it meets the requirements.

    The ten unit bins [0,1], [1,2], ..., [9,10] are counted with ``countBin``
    and printed, followed by a summary line with:

    * ``total`` - sum of the ten bin counts,
    * ``dif``   - the smallest drop between two consecutive bins
      (count[k-1] - count[k]), capped at 10 because that is the starting value,
    * ``t2``    - the count of the last bin [9,10].

    A "BINGO" banner is printed when ``total == expected_total``,
    ``dif >= 5`` and ``5 <= t2 < 250``.

    Parameters
    ----------
    l : pandas.Series
        Normalised values, expected to lie in [0, 10].
    expected_total : int, optional
        Total the bin counts must add up to. The default, 91897, is the
        value that was hard-coded here - presumably the number of rows in the
        data set; confirm before reusing with other data.

    Returns
    -------
    None
    """
    # Count every bin exactly once; each countBin call scans the whole series.
    # NOTE(review): each bin is counted independently, so a value lying exactly
    # on an integer boundary may be counted in two neighbouring bins - confirm.
    counts = [countBin(l, k) for k in range(10)]

    print("range [ 0 - 1 ]", counts[0])
    for k in range(1, 10):
        print("range [", k, "-", k + 1, "]", counts[k])

    t1 = sum(counts)
    t2 = counts[9]
    # Smallest decrease between consecutive bins; 10 is the initial upper cap.
    dif = min([10] + [counts[k - 1] - counts[k] for k in range(1, 10)])

    print("total=", t1, "dif=", dif, "t2=", t2)
    if t1 == expected_total and dif >= 5 and 5 <= t2 < 250:
        print("==========================================")
        print("============== BINGO =====================")
        print("==========================================")
def Experiment_dis(distribution, l, n, m, step):
    """Sweep a shape parameter for one distribution and report each result.

    For every ``i`` in ``np.arange(n, m, step)`` the chosen density formula is
    applied to ``l``, inverted, min-max normalised to [0, 10] and passed to
    ``check_requirements``. The normalised values of the last iteration are
    stored in column 2 of the module-level ``data`` frame, whose head is then
    printed.

    Parameters
    ----------
    distribution : str
        One of "zipfian", "pareto", "binomial", "lomax", "weibull".
    l : pandas.Series
        Raw scores to transform.
    n, m, step : float
        Start, stop and step of the shape-parameter sweep (``np.arange``
        semantics, so ``m`` is nominally excluded).

    Raises
    ------
    ValueError
        If ``distribution`` is not one of the supported names.
    """
    supported = ("zipfian", "pareto", "binomial", "lomax", "weibull")
    if distribution not in supported:
        # Previously an unknown name only failed later, on the unset ``y``.
        raise ValueError(
            "unknown distribution %r; expected one of %s"
            % (distribution, ", ".join(supported))
        )

    y = None
    for i in np.arange(n, m, step):
        if distribution == "zipfian":
            # zetac(i) is zeta(i) - 1, not zeta(i); any positive constant
            # factor cancels in the min-max normalisation below anyway.
            y = (l + 1) ** (-i) / special.zetac(i)
        elif distribution == "pareto":
            y = i / l ** (i + 1)
        elif distribution == "binomial":
            y = (1 / i) ** (0.4 * l)
        elif distribution == "lomax":
            y = 1 / (i + l) ** (4)
        else:  # "weibull"
            y = (5 / i) * (l / i) ** (5 - 1) * np.exp(-(l / i) ** 5)

        # Invert to preserve the original ordering of the scores
        # (Requirement4), since every formula above reverses it.
        y = 1 / y

        # Min-max normalisation to [0, 10]; the extrema are computed once
        # with the vectorised reductions instead of the Python builtins.
        lowest = y.min()
        highest = y.max()
        y = 10 * (y - lowest) / (highest - lowest)

        print("i=", i)
        check_requirements(y)
        print("-----")

    # Keep the last sweep result; nothing to store if the sweep was empty.
    if y is not None:
        data[2] = y
        print(data.head())
# Record of the parameter sweeps that were run, each followed by the best
# score it produced. They are kept commented out so that the script only
# executes the final choice below.
#Experiment_dis("zipfian",x,1,5,0.1)
#best score obtained is dif=10 t2=7 for i=2.6
#Experiment_dis("pareto",x,1,5,0.1)
#best score obtained is dif=9 t2=7 for i=1.2
#Experiment_dis("binomial",x,1,5,0.1)
#best score obtained is dif=10 t2=6 for i=1.8
#Experiment_dis("lomax",x,1,10,0.1)
#best score obtained is dif=9 t2=7 for i=7.7
#Experiment_dis("weibull",x,1,2,0.1)
# Did not give a good result, hence not adopted
### We therefore choose Zipfian with shape parameter = 2.6, since it gives the best score with regard to the requirements.
# NOTE(review): np.arange with a non-integer step may or may not include a
# value close to the stop. This call relies on the last iteration of the
# sweep being i = 2.6, because Experiment_dis stores the last result in
# data[2] - confirm from the printed "i=" lines.
Experiment_dis("zipfian",x,2.5,2.6,0.1)
## Bar plot of the new distribution (column 2) after the Zipfian normalisation.
# sort=False keeps the bars in bin order (the default orders them by
# frequency, which would hide any bin that breaks the decreasing trend), and
# the edges run up to 10 so that the top bin (9, 10] - the one that should
# hold very few values - is actually shown.
data[2].value_counts(bins=list(range(11)), sort=False).plot.bar(figsize=(10, 5))
plt.xlabel('range of POI')
plt.ylabel('Occurences')
plt.tight_layout()
plt.show()
## Save the frame, including the new normalised column, to CSV
data.to_csv(r'submission1.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment