-
-
Save ashtom84/1847c2a621e9f9e5ab8d28aed956019d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd

# Load the bike listings and the per-city weather table.
bike = pd.read_csv("bike.csv")
df = pd.read_csv("df_weather.csv")
# Drop the first column of the weather file (presumably a saved index column;
# confirm against the CSV) and rename the city key so that it matches the
# column name used in the bike table.
df = df.iloc[:, 1:].rename(columns={'cty2': 'city'})
# Left join: every bike row is kept, even when its city has no weather record.
bikeW = pd.merge(bike, df, how='left', on='city')
# Divide a brand's distribution between nbS intervals of the weather feature var.
def sub_region(array, row='0', var='avg_days_precip', nbS=4, split="default",
               data=None):
    """Return the share of ``array`` that falls in each interval of ``var``.

    The intervals cover the range of ``var`` in the reference table, using
    only the rows where ``var > -1`` (presumably -1 flags a missing value;
    confirm against the data).

    Args:
        array: sized iterable of ``var`` values, e.g. the values of one brand.
        row: index label given to the single row of the result.
        var: name of the weather column in the reference table.
        nbS: number of equal-width intervals; ignored when ``split`` is a
            sequence of cut points.
        split: ``"default"`` for ``nbS`` equal-width intervals, otherwise an
            increasing sequence of interior cut points, which produces
            ``len(split) + 1`` intervals.
        data: reference table providing the range of ``var``; defaults to the
            module-level ``bikeW``.

    Returns:
        A one-row ``DataFrame`` indexed by ``row`` with one float column per
        interval, labelled ``"<var>: <lower>-<upper>"``, holding the fraction
        of ``array`` inside that interval. The fractions are NaN when
        ``array`` is empty.

    Raises:
        ValueError: if ``split`` is a string other than ``"default"``.
    """
    if data is None:
        data = bikeW
    valid = data.loc[data[var] > -1, var]
    lowest, highest = valid.min(), valid.max()

    if isinstance(split, str):
        if split != "default":
            raise ValueError(
                "split must be 'default' or a sequence of cut points, got %r"
                % (split,))
        span = float(highest - lowest)
        lowers = [lowest + k * span / nbS for k in range(nbS)]
        uppers = [lowest + k * span / nbS for k in range(1, nbS + 1)]
    else:
        cuts = list(split)
        lowers = [lowest] + cuts
        uppers = cuts + [highest]

    last = len(lowers) - 1
    counts = np.zeros(len(lowers))
    for elt in array:
        for pos, (lo, up) in enumerate(zip(lowers, uppers)):
            # Intervals are half-open [lo, up), except the last one, which is
            # closed so that the maximum of the range is counted as well.
            if lo <= elt and (elt < up or (pos == last and elt <= highest)):
                counts[pos] += 1

    total = len(array)
    if total:
        shares = (counts / total).tolist()
    else:
        # No observation at all: the shares are undefined rather than zero.
        shares = [float('nan')] * len(counts)

    labels = [var + ": " + str(lo) + "-" + str(up)
              for lo, up in zip(lowers, uppers)]
    return pd.DataFrame([shares], columns=labels, index=[row])
# Compute the proportion among a brand for a given weather feature:
# P(feature split | brand).
def prop_var(var, nbS=4, size=10, split="default"):
    """Tabulate, per brand, the share of its rows in each interval of ``var``.

    Only rows of the module-level ``bikeW`` with ``var > -1`` are used, and
    only brands with strictly more than ``size`` such rows are returned.

    Args:
        var: name of the weather column in ``bikeW``.
        nbS: number of equal-width intervals (forwarded to ``sub_region``).
        size: brands with ``makesize <= size`` are dropped from the result.
        split: ``"default"`` or a sequence of cut points (forwarded to
            ``sub_region``).

    Returns:
        A ``DataFrame`` indexed by brand (``make``) containing, in order:
        one column per interval (as labelled by ``sub_region``), ``makesize``
        (number of rows of the brand), then ``Highest``/``Lowest`` (largest
        and smallest share), ``High_Region``/``Low_Region`` (0-based position
        of the matching interval) and ``High_Int``/``Low_Int`` (the interval
        part of the matching column label).
    """
    bikeV = bikeW[bikeW[var] > -1]

    # One pass over the brands; groupby yields them in sorted order.
    frames, sizes = [], []
    for mk, values in bikeV.groupby('make')[var]:
        frames.append(sub_region(values, mk, var, nbS, split))
        sizes.append(len(values))
    prop = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    n_feat = prop.shape[1]
    prop['makesize'] = sizes
    prop = prop[prop.makesize > size]

    # Only the interval columns take part in the search for the extremes;
    # 'makesize' must never be mistaken for a share.
    feat_cols = list(prop.columns[:n_feat])
    Hi, Hi_R, Hi_Int = [], [], []
    Lo, Lo_R, Lo_Int = [], [], []
    for mk in prop.index:
        shares = np.asarray(prop.loc[mk, feat_cols], dtype=float)
        low, high = shares.min(), shares.max()
        low_idx = int(np.flatnonzero(shares == low)[0])
        high_hits = np.flatnonzero(shares == high)
        # NOTE(review): when several intervals tie for the highest share the
        # second tied interval is reported; this is kept as it was written.
        high_idx = int(high_hits[0] if len(high_hits) == 1 else high_hits[1])
        Lo.append(float(low))
        Lo_R.append(low_idx)
        # Labels look like "<var>: <lower>-<upper>"; keep what follows the
        # first "<var>:" (including the leading space).
        Lo_Int.append(feat_cols[low_idx][len(var) + 1:])
        Hi.append(float(high))
        Hi_R.append(high_idx)
        Hi_Int.append(feat_cols[high_idx][len(var) + 1:])

    extremes = pd.DataFrame({'Highest': Hi, 'Lowest': Lo,
                             'High_Region': Hi_R, 'Low_Region': Lo_R,
                             'High_Int': Hi_Int, 'Low_Int': Lo_Int},
                            index=prop.index)
    return pd.concat([prop, extremes], axis=1)
# Compute the probability of getting a brand given that the observation lies in
# a particular split of the weather feature: P(brand | feature split).
def proba_var(var, nbS=4, size=10, split="default"):
    """Estimate P(brand) and P(brand | interval of ``var``) for each brand.

    The per-brand counts are rebuilt from the shares and brand sizes returned
    by ``prop_var`` (share * makesize, rounded).

    Args:
        var: name of the weather column in the module-level ``bikeW``.
        nbS: number of equal-width intervals (forwarded to ``prop_var``).
        size: minimum brand size filter (forwarded to ``prop_var``).
        split: ``"default"`` or a sequence of cut points (forwarded to
            ``prop_var``).

    Returns:
        A float ``DataFrame`` indexed by brand with a ``proba_tot`` column
        (brand size over the number of rows of ``bikeW`` with ``var > -1``)
        followed by one ``"proba_<k>: <interval label>"`` column per interval.
        The conditional columns are normalised over the brands returned by
        ``prop_var`` only, and are NaN for an interval with no observation.
    """
    bikeV = bikeW[bikeW[var] > -1]
    prop = prop_var(var, nbS, size, split)

    # The interval columns are the ones that precede 'makesize'; counting them
    # here stays correct when a custom split yields len(split) + 1 intervals.
    n_intervals = list(prop.columns).index('makesize')
    interval_cols = list(prop.columns[:n_intervals])

    sizes = np.asarray(prop['makesize'], dtype=float)
    shares = np.asarray(prop[interval_cols], dtype=float)
    shares = shares.reshape(len(sizes), n_intervals)

    brand_counts = np.round(shares * sizes[:, None])
    interval_totals = np.round(shares.T.dot(sizes))

    with np.errstate(divide='ignore', invalid='ignore'):
        proba_tot = sizes / len(bikeV)
        conditional = np.where(interval_totals > 0,
                               brand_counts / interval_totals,
                               np.nan)

    columns = ['proba_tot'] + ['proba_' + str(i + 1) + ": " + col
                               for i, col in enumerate(interval_cols)]
    values = np.column_stack([proba_tot, conditional])
    return pd.DataFrame(values, columns=columns, index=prop.index)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment