Skip to content

Instantly share code, notes, and snippets.

@manmohan24nov
Created September 30, 2020 10:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save manmohan24nov/7381a8f5f077d0846108290a8bd568cf to your computer and use it in GitHub Desktop.
Save manmohan24nov/7381a8f5f077d0846108290a8bd568cf to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Load the train and test data, tagging each row with its origin so the two
# sets can be separated again after the shared preprocessing below.
train_df = pd.read_csv('train.csv')
train_df['df_type'] = 'train'
test_df = pd.read_csv('test.csv')
test_df['df_type'] = 'test'

# Concatenate train and test so imputation statistics use every row.
combined_data = pd.concat([train_df, test_df], ignore_index=True)

# Report the number of null values per column in the training data.
print(train_df.isnull().sum())

# Impute missing item weights with the mean weight recorded for the same
# Item_Identifier.  The pivot table is a one-column DataFrame, so the column
# is selected explicitly and used as a lookup Series: `map` yields one scalar
# per row and leaves NaN (instead of raising KeyError) for items that have no
# recorded weight anywhere in the combined data.
avg_weight = combined_data.pivot_table(values='Item_Weight', index='Item_Identifier')
missing_bool = combined_data['Item_Weight'].isnull()
combined_data.loc[missing_bool, 'Item_Weight'] = (
    combined_data.loc[missing_bool, 'Item_Identifier'].map(avg_weight['Item_Weight'])
)

# A visibility of exactly 0 is treated as missing and replaced by the item's
# mean visibility.  The mean is computed before the replacement, so the zero
# entries themselves are included in it.
avg_visibility = combined_data.pivot_table(values='Item_Visibility', index='Item_Identifier')
missing_bool = combined_data['Item_Visibility'] == 0
combined_data.loc[missing_bool, 'Item_Visibility'] = (
    combined_data.loc[missing_bool, 'Item_Identifier'].map(avg_visibility['Item_Visibility'])
)

# Collapse alternative spellings of the fat-content labels.
# NOTE(review): 'reg' becomes 'Regular Fat'; if the data also contains a plain
# 'Regular' label the two will remain distinct categories -- confirm intent.
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'reg': 'Regular Fat',
    'low fat': 'Low Fat',
})

# Outlet age in years, measured relative to 2013.
combined_data['Outlet_Years'] = 2013 - combined_data['Outlet_Establishment_Year']

# Recover the training rows and drop the columns that are no longer needed.
# A non-inplace drop returns a new frame, which avoids mutating a filtered
# selection of combined_data in place (a pattern pandas may flag with
# SettingWithCopyWarning).
train = combined_data[combined_data['df_type'] == 'train'].drop(
    ['Outlet_Size', 'Outlet_Establishment_Year', 'df_type'], axis=1
)

# Summary of the prepared training data.
train.info()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment