Skip to content

Instantly share code, notes, and snippets.

@ashtom84
Created April 2, 2016 23:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashtom84/1847c2a621e9f9e5ab8d28aed956019d to your computer and use it in GitHub Desktop.
Save ashtom84/1847c2a621e9f9e5ab8d28aed956019d to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
# Load both CSV inputs and build ``bikeW``: every row of bike.csv with the
# columns of df_weather.csv attached through the shared ``city`` column.
bike = pd.read_csv("bike.csv")
# The first column of the weather file is dropped (presumably a saved index
# column -- confirm against the file), and ``cty2`` becomes the join key.
df = pd.read_csv("df_weather.csv").iloc[:, 1:]
df = df.rename(columns={'cty2': 'city'})
# Left join: rows of ``bike`` whose city has no weather match are kept.
bikeW = bike.merge(df, how='left', on='city')
# divide a brand's distribution between nbS different intervals for the weather feature var
def sub_region(array, row = '0', var = 'avg_days_precip', nbS = 4, split = "default"):
    """Return the share of the values of ``array`` falling in each interval of ``var``.

    The interval edges do not depend on ``array``: the overall range is the
    min and max of ``bikeW[var]`` (module-level frame) over the rows where
    that column is greater than -1, so every call made with the same ``var``
    and ``split`` uses the same intervals.

    Parameters
    ----------
    array : sized iterable of numbers
        Values to distribute over the intervals. ``len(array)`` is the
        denominator of every share.
    row : hashable
        Index label of the single row of the result.
    var : str
        Column of ``bikeW`` defining the range; also the prefix of the
        column labels of the result.
    nbS : int
        Number of equal-width intervals. Ignored when ``split`` is a
        sequence (the number of intervals is then ``len(split) + 1``).
    split : "default" or sequence of numbers
        ``"default"`` for equal-width intervals, otherwise the interior cut
        points, expected in ascending order.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``row`` and one column per interval, labelled
        ``"<var>: <lower>-<upper>"``, holding count / ``len(array)``.
        An empty ``array`` gives NaN shares (numpy division by zero).
    """
    bikeV = bikeW[bikeW[var] > -1]
    minA = min(bikeV[var])
    maxA = max(bikeV[var])

    # isinstance guard: comparing an array-like ``split`` to a string would
    # be element-wise and could not be used as an ``if`` condition.
    if isinstance(split, str) and split == "default":
        span = float(maxA - minA)
        splitL = [minA + k * span / nbS for k in range(nbS)]
        splitU = [minA + k * span / nbS for k in range(1, nbS + 1)]
    else:
        cuts = list(split)
        splitL = [minA] + cuts
        splitU = cuts + [maxA]
        nbS = len(cuts) + 1

    bounds = list(zip(splitL, splitU))
    last = len(bounds) - 1
    # Intervals are [lower, upper) except the last one, which is closed on
    # the right so that a value equal to the overall maximum is counted.
    # max() absorbs float rounding in the computed top edge.
    top = max(splitU[-1], maxA) if splitU else maxA

    count = np.zeros(nbS)
    for elt in array:
        for pos, (lo, up) in enumerate(bounds):
            if lo <= elt < up or (pos == last and lo <= elt <= top):
                count[pos] += 1
                break

    prop = count / len(array)
    colInt = [var + ": " + str(lo) + "-" + str(up) for lo, up in bounds]
    return pd.DataFrame([prop], columns = colInt, index = [row])
# compute the proportion among a brand for a given weather feature: P(feature split|brand)
def prop_var(var, nbS = 4, size = 10, split="default"):
    """Tabulate, per brand, the share of its observations in each interval of ``var``.

    Works on the rows of the module-level ``bikeW`` where ``var`` is greater
    than -1, grouped by the ``make`` column. Each brand's row is produced by
    ``sub_region``; brands with ``size`` observations or fewer are dropped.

    Parameters
    ----------
    var : str
        Column of ``bikeW`` to analyse.
    nbS : int
        Number of equal-width intervals, forwarded to ``sub_region``.
    size : number
        Only brands with strictly more than ``size`` observations are kept.
    split : "default" or sequence of numbers
        Forwarded to ``sub_region`` (custom interior cut points).

    Returns
    -------
    pandas.DataFrame
        Indexed by brand. Columns: one per interval (shares), ``makesize``
        (number of observations of the brand), then ``Highest``/``Lowest``
        (largest/smallest share), ``High_Region``/``Low_Region`` (position of
        that interval) and ``High_Int``/``Low_Int`` (text after the ``:`` in
        the interval's column label).
    """
    bikeV = bikeW[bikeW[var] > -1]
    # Observations per brand; groupby returns the brands in sorted order.
    make_counts = bikeV.groupby('make')[var].size()

    frames, mksize = [], []
    for mk in make_counts.index:
        frames.append(sub_region(bikeV[bikeV.make == mk][var], mk, var, nbS, split))
        mksize.append(make_counts.loc[mk])
    prop = pd.concat(frames, axis = 0) if frames else pd.DataFrame({})
    n_int = prop.shape[1]  # number of interval columns, before adding makesize
    prop['makesize'] = mksize
    prop = prop[prop.makesize > size]

    Hi, Hi_R, Hi_Int = [], [], []
    Lo, Lo_R, Lo_Int = [], [], []
    # Only the interval columns are scanned, so ``makesize`` can never be
    # mistaken for a share.
    for shares in prop.iloc[:, :n_int].values.tolist():
        m, M = min(shares), max(shares)
        indm = shares.index(m)  # first interval reaching the minimum
        at_max = [i for i, s in enumerate(shares) if s == M]
        # Tie-break kept from the original code: when several intervals
        # share the maximum, the second of them is reported.
        indM = at_max[0] if len(at_max) == 1 else at_max[1]
        Lo.append(m)
        Lo_R.append(indm)
        Lo_Int.append(prop.columns[indm].split(":")[1])
        Hi.append(M)
        Hi_R.append(indM)
        Hi_Int.append(prop.columns[indM].split(":")[1])

    extremes = pd.DataFrame({'Highest': Hi, 'Lowest': Lo,
                             'High_Region': Hi_R, 'Low_Region': Lo_R,
                             'High_Int': Hi_Int, 'Low_Int': Lo_Int},
                            index = prop.index)
    return pd.concat([prop, extremes], axis=1)
# compute the probability of getting a brand given that the observation lies in a
# particular split of the previous weather feature: P(brand|feature split)
def proba_var(var, nbS = 4, size = 10, split="default"):
    """Tabulate, per brand, the probability of the brand within each interval of ``var``.

    Built on ``prop_var(var, nbS, size, split)``: the per-interval counts of
    a brand are recovered as ``round(share * makesize)`` and divided by the
    total count of that interval over the brands ``prop_var`` kept.

    Parameters
    ----------
    var : str
        Column of the module-level ``bikeW`` to analyse.
    nbS : int
        Number of equal-width intervals, forwarded to ``prop_var``. The
        number of probability columns is read from the result of
        ``prop_var``, so it stays correct when ``split`` is a custom sequence.
    size : number
        Forwarded to ``prop_var`` (minimum brand size, exclusive).
    split : "default" or sequence of numbers
        Forwarded to ``prop_var``.

    Returns
    -------
    pandas.DataFrame
        Indexed by brand. ``proba_tot`` is ``makesize`` divided by the number
        of rows of ``bikeW`` with ``var`` greater than -1 (all brands, kept or
        not). Each ``"proba_<i>: <interval label>"`` column is the brand's
        count in interval ``i`` divided by the kept brands' total count in
        that interval; NaN when the interval has no observation.
    """
    bikeV = bikeW[bikeW[var] > -1]
    prop = prop_var(var, nbS, size, split)

    # The interval columns are exactly those located before ``makesize``.
    labels = list(prop.columns)
    n_int = labels.index('makesize')
    probaCol = ['proba_tot'] + ['proba_' + str(i + 1) + ": " + labels[i]
                                for i in range(n_int)]

    shares = prop.iloc[:, :n_int].values.astype(float)
    sizes = prop['makesize'].values.astype(float)
    # share * makesize is a count up to float error; rounding restores it.
    counts = np.round(shares * sizes[:, None])
    totals = counts.sum(axis=0)

    # Leave NaN where an interval is empty instead of dividing by zero.
    cond = np.full(counts.shape, np.nan)
    filled = totals > 0
    cond[:, filled] = counts[:, filled] / totals[filled]

    # float sizes: true division regardless of the Python version.
    values = np.column_stack([sizes / len(bikeV), cond])
    return pd.DataFrame(values, index = prop.index, columns = probaCol)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment