@gauravgola96
Last active January 28, 2018 07:00
Big mart sales prediction (Analytics Vidhya)
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# for plotting graphs inline in the notebook
%matplotlib inline
train = pd.read_csv("C:\\Users\\Gaurav_Gola\\Desktop\\project\\Big mart sales\\Train.csv")
test = pd.read_csv("C:\\Users\\Gaurav_Gola\\Desktop\\project\\Big mart sales\\test.csv")
train.head()
#combining train and test dataframes for data cleaning
train['source'] = 'train'
test['source'] = 'test'
data = pd.concat([train,test],ignore_index=True)
data.Item_Identifier.value_counts() #1559 items
train.isnull().sum()
data.isnull().sum() # 5681 nulls in Item_Outlet_Sales come from the test rows
data.describe()
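# Hedged check (not in the original gist): the 5681 missing Item_Outlet_Sales values
# should all come from the test rows, where the target is absent by design
data.groupby('source')['Item_Outlet_Sales'].apply(lambda s: s.isnull().sum())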
#Filter categorical variables
categorical_columns = [x for x in data.dtypes.index if data.dtypes[x]=='object']
categorical_columns # not all of these are required
categorical_columns = [x for x in categorical_columns if x not in ["Item_Identifier","Outlet_Identifier","source"]]
categorical_columns
for col in categorical_columns:
    print("frequency of categorical variable %s:" % col)
    print(data[col].value_counts())
item_avg_weight = data.pivot_table(values= 'Item_Weight', index='Item_Identifier')
item_avg_weight.head()
item_avg_weight.Item_Weight['DRA12'] #just a check
null_item_weight = data['Item_Weight'].isnull()
null_item_weight.head()
data.loc[null_item_weight,'Item_Weight'] = data.loc[null_item_weight,'Item_Identifier'].apply(lambda x: item_avg_weight.Item_Weight[x])
data.isnull().sum() # item_weight filled
# Outlet_Size has missing values
# impute them with the mode for each Outlet_Type
# pandas' own mode is used here (scipy.stats.mode can choke on string/object data)
outlet_size_mode = data.dropna(subset=['Outlet_Size']).pivot_table(
    values='Outlet_Size', columns='Outlet_Type', aggfunc=lambda x: x.mode().iat[0])
outlet_size_mode
#missing values of Outlet size
null_outlet_size = data['Outlet_Size'].isnull()
data.loc[null_outlet_size,'Outlet_Size'] = data.loc[null_outlet_size,'Outlet_Type'].apply(lambda x: outlet_size_mode.loc['Outlet_Size', x])
# filling missing values of Outlet size with mode value
#mode value is related to outlet type
data.isnull().sum()
#Feature engineering
# Outlet type
# consider combining two or more outlet types
data.pivot_table(values='Item_Outlet_Sales',index='Outlet_Type')
# should not combine any outlet types because their mean sales differ significantly
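# Sketch (added for illustration): mean sales and row counts per Outlet_Type,
# to back up the decision above not to merge any outlet types
data.groupby('Outlet_Type')['Item_Outlet_Sales'].agg(['mean', 'count'])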
data.isnull().sum()
# Item visibility
# zero visibility makes no sense
# replace zeros with the mean visibility of that Item_Identifier
avg_visb = data.pivot_table(values='Item_Visibility',index='Item_Identifier')
avg_visb
zero_value_visb = (data['Item_Visibility']==0)
zero_value_visb
data.loc[zero_value_visb,'Item_Visibility'] = data.loc[zero_value_visb,'Item_Identifier'].apply(lambda x:avg_visb.Item_Visibility[x])
# zeros in Item_Visibility replaced with the mean for that Item_Identifier
data.Item_Type.unique()
#create a new variable
# Item_Type has 16 categories; we can collapse them into 3 broad ones
# every Item_Identifier starts with FD, DR or NC
# e.g. for Item_Type 'Soft Drinks' the Item_Identifier always starts with DR
data['combined_item_type'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data['combined_item_type'] = data['combined_item_type'].map({'FD': 'Food',
                                                              'DR': 'Drinks',
                                                              'NC': 'Non-consumables'})
data.combined_item_type.value_counts()
# the age of an outlet matters for sales
# create a new variable: number of years in operation (the data is from 2013)
data['outlet_years'] = 2013 - data.Outlet_Establishment_Year
# Item_Fat_Content has inconsistent labels
data.Item_Fat_Content.unique()
data.Item_Fat_Content = data.Item_Fat_Content.map({'Low Fat':'Low Fat','Regular':'Regular','low fat':'Low Fat','LF':'Low Fat','reg':'Regular'})
data.Item_Fat_Content.value_counts()
# but what about non-consumables in combined_item_type?
# non-consumables cannot have a fat content
# mark them as Non-edible in Item_Fat_Content
data.loc[data.combined_item_type=='Non-consumables','Item_Fat_Content']='Non-edible'
data.Item_Fat_Content.value_counts()
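# Sanity check (added, not in the original gist): no Non-consumable item should
# remain tagged Low Fat or Regular after the recode above
pd.crosstab(data['combined_item_type'], data['Item_Fat_Content'])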
#dummy variable creation
#convert all categorical variables into numerical ones
#sklearn's LabelEncoder is used for the encoding
data.Outlet_Identifier.unique()
#create a new variable Outlet, same as Outlet_Identifier
#we will not remove Outlet_Identifier because it is required in the submission file
#so dummies are produced from the new Outlet variable instead
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#create new variable outlet
data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])
#fit_transform maps the labels to integer codes
type(data.Outlet)
convert_var = [x for x in data.dtypes.index if data.dtypes[x]=='object']
convert_var
var = ['Item_Fat_Content','Outlet_Location_Type','combined_item_type','Outlet_Size','Outlet_Type','Outlet']
for x in var:
    data[x] = le.fit_transform(data[x])
#all the columns in var are now numeric
#now we can create the dummy variables
data = pd.get_dummies(data,columns=var)
data.dtypes
data.head()
#Drop the columns which have been converted to different types:
data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)
#Divide into test and train:
train = data.loc[data['source']=="train"].copy()
test = data.loc[data['source']=="test"].copy()
#Drop unnecessary columns:
test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)
#Define target and ID columns:
target = 'Item_Outlet_Sales'
IDcol = ['Item_Identifier','Outlet_Identifier']
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def modelfit(alg, dtrain, dtest, predictors, target, IDcol, filename):
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])
    #Predict on the training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    #Perform cross-validation ('neg_mean_squared_error' is the current scoring name):
    cv_score = cross_val_score(alg, dtrain[predictors], dtrain[target], cv=20, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))
    #Print model report:
    print("\nModel Report")
    print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(dtrain[target].values, dtrain_predictions)))
    print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))
    #Predict on the test data:
    dtest[target] = alg.predict(dtest[predictors])
    #Export submission file (without mutating IDcol between calls):
    submission = pd.DataFrame({x: dtest[x] for x in IDcol + [target]})
    submission.to_csv(filename, index=False)
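# Baseline model (sketch added for comparison; 'baseline.csv' is an assumed filename):
# predict the overall mean training sales for every test row
mean_sales = train['Item_Outlet_Sales'].mean()
baseline = test[['Item_Identifier', 'Outlet_Identifier']].copy()
baseline['Item_Outlet_Sales'] = mean_sales
baseline.to_csv('baseline.csv', index=False)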
from sklearn.linear_model import LinearRegression, Ridge, Lasso
predictors = [x for x in train.columns if x not in [target]+IDcol]
# print predictors
alg1 = LinearRegression(normalize=True)  # note: 'normalize' was removed in scikit-learn 1.2; use a StandardScaler pipeline on newer versions
modelfit(alg1, train, test, predictors, target, IDcol, 'alg1.csv')
coef1 = pd.Series(alg1.coef_, predictors).sort_values()
coef1.plot(kind='bar', title='Model Coefficients')
coef1
#This is better than the baseline model.
#But the coefficients are very large in magnitude, which signals overfitting.
#Ridge regression
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg2 = Ridge(alpha=0.05, normalize=True)  # 'normalize' was likewise removed in scikit-learn 1.2
modelfit(alg2, train, test, predictors, target, IDcol, 'alg2.csv')
coef2 = pd.Series(alg2.coef_, predictors).sort_values()
coef2.plot(kind='bar', title='Model Coefficients')
#Now the regression coefficients are much smaller
#RMSE is about the same as the linear model
#but the smaller coefficients suggest less overfitting and better generalization
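# Lasso is imported above but never fit; a minimal sketch reusing the same modelfit helper
# (alpha=0.05 and the filename 'alg3.csv' are assumptions, not from the original gist)
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg3 = Lasso(alpha=0.05)
modelfit(alg3, train, test, predictors, target, IDcol, 'alg3.csv')
coef3 = pd.Series(alg3.coef_, predictors).sort_values()
coef3.plot(kind='bar', title='Model Coefficients')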