Skip to content

Instantly share code, notes, and snippets.

@Zeptogreens
Created March 22, 2021 13:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Zeptogreens/353f29dde6321d606c443c94e1344df4 to your computer and use it in GitHub Desktop.
Save Zeptogreens/353f29dde6321d606c443c94e1344df4 to your computer and use it in GitHub Desktop.
Python Implementation:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
Importing Data
d1 = pd.read_excel("1_yield_data.xlsx")
d2 = pd.read_excel("4_marketyard_prices_data.xlsx")
d3 = pd.read_excel("8_market_yard_locations_data.xlsx")
d4 = pd.read_excel("9_warehouse_data.xlsx")
d1.head(3)
d2.head(3)
d3.head(3)
d4.head(3)
d5.head(3)
# --- EDA: combine the four datasets and clean missing values ---
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (same row-wise stacking).
final_data = pd.concat([d2, d1, d3, d4])
final_data.columns

# Drop serial-number / code columns that duplicate information already
# carried by the name columns.  NOTE: the original listed "S.No" twice;
# once is sufficient.
final_data.drop(["DDate", "S.No", "AmcCode", "YardCode", "CommCode",
                 "VarityCode", "Dist_id", "MarketYard code", "SI. No",
                 "District id", "AMC code", "S. no", "Wh_type_code",
                 "Warehouse_code", "District_id", "AmcName"],
                inplace=True, axis=1)

final_data.isnull().sum()
# numeric_only=True: the frame still contains object columns here, and
# DataFrame.mean() no longer silently skips them in pandas >= 2.0.
final_data.mean(numeric_only=True)
final_data.info()

# Data cleaning: impute every numeric column with its mean.
# Assign back instead of chained `col.fillna(..., inplace=True)`, which
# operates on a possible copy and is deprecated in modern pandas.
for num_col in ['Arrivals', 'Minimum', 'Maximum', 'Model', 'Season_yield',
                'Total', 'Lattitude', 'Longitude', 'Capacity', 'Occupancy',
                'Vacancy']:
    final_data[num_col] = final_data[num_col].fillna(final_data[num_col].mean())

final_data.isnull().sum()
final_data.tail(3)
final_data.head(3)

# Impute categorical columns with their most frequent value (mode).
# BUG FIX: the loop body was not indented in the original (SyntaxError).
for column in ['Crop', 'Season', 'MarketYard', 'AMC', 'Remarks',
               'Stage of Work', 'CommName', 'YardName', 'VarityName',
               'District', 'Region', 'Wh type', 'Warehouse', 'Address',
               'Status']:
    final_data[column] = final_data[column].fillna(final_data[column].mode()[0])

final_data.isnull().sum()
# 'Godown Capacity in MTs' has a mixed inferred dtype (numbers and
# strings), which downstream profiling tools cannot handle — drop it.
final_data.drop(["Godown Capacity in MTs"], inplace=True, axis=1)
final_data.info()
# --- Data Visualization ---
plt.figure(figsize=(10, 10))
# numeric_only=True: DataFrame.corr() raises on object columns in
# pandas >= 2.0 (the frame still holds categorical columns here).
sns.heatmap(final_data.corr(numeric_only=True), annot=True, cmap='coolwarm')

# Automated EDA reports (third-party: sweetviz, pandas-profiling).
import sweetviz as sv
my_report = sv.analyze(final_data)
my_report.show_html()

# pandas_profiling
from pandas_profiling import ProfileReport
prof = ProfileReport(final_data)
prof.to_file(output_file='output.html')
prof

# Interactive box plots with plotly + cufflinks.
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Per-crop slices for the three crops studied.
groundnut = final_data[final_data['Crop'] == 'groundnut']
bengalgram = final_data[final_data['Crop'] == 'bengal gram']
maize = final_data[final_data['Crop'] == 'maize']
groundnut['Total'].iplot(kind='box', title='Box plot of groundnut')
bengalgram['Total'].iplot(kind='box', title='Box plot of bengalgram')
maize['Total'].iplot(kind='box', title='Box plot of maize')

# Distribution of the target column ('Model' = modal price).
# sns.distplot was deprecated in seaborn 0.11 and later removed;
# histplot(..., kde=True) is the drop-in replacement.
sns.histplot(final_data['Model'], color='r', kde=True)
plt.title('Model Distribution', fontsize=16)
plt.xlabel('Model', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('distplot.png')
plt.show()
# Drop columns judged uninformative for modeling (per the profiling
# reports above), keeping only price/yield features plus the target.
final_data.drop(["Minimum", "Capacity", "Occupancy", "MarketYard",
                 "Arrivals", "YardName", "Region", "Year", "Stage of Work",
                 "Wh type", "Address", "Status", "District", "Warehouse",
                 "Remarks", "Vacancy", "Longitude", "Lattitude", "AMC"],
                inplace=True, axis=1)

# --- Label encoding ---
# Map each categorical column to integer codes 0..n_classes-1.
# A fresh encoder per column keeps each column's class mapping independent.
from sklearn.preprocessing import LabelEncoder
for cat_col in ['CommName', 'VarityName', 'Crop', 'Season']:
    final_data[cat_col] = LabelEncoder().fit_transform(final_data[cat_col])

final_data.isnull().sum()
final_data.info()

# --- Data Split ---
# 'Model' (modal price) is the dependent variable; everything else is X.
x = final_data.drop(['Model'], axis=1)
y = final_data.Model
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# BUG FIX: the original scaled only x_train (leaving x_test on the raw
# scale) and scaled y_train but never y_test, so models were trained and
# evaluated on incompatible scales and every r2_score below was
# meaningless.  Fit on the training features only, then apply the SAME
# fitted transform to the test features; leave y unscaled.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

from sklearn.metrics import r2_score
Importing the required packages into our python environment
Our primary packages for this project are pandas for data processing, NumPy for working with arrays, matplotlib, seaborn, cufflinks, sweetviz, and pandas-profiling for data visualization, and finally scikit-learn for building and evaluating our ML model. Let's import all the required packages into our Python environment.
Python Implementation:
[ ]
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
Importing Data
Here, we are going to work with four datasets(the yield_data, the marketyard_price data, the marketyard_location data, the warehouse_data) respectively, that contains various features and information across the e-marketplaces and the crops. Using the ‘read_excel’ function provided by the Pandas package, we can import the data into our python environment. After importing the data, we can use the ‘head’ function to get a glimpse of our dataset.
[ ]
d1 = pd.read_excel("1_yield_data.xlsx")
d2 = pd.read_excel("4_marketyard_prices_data.xlsx")
d3 = pd.read_excel("8_market_yard_locations_data.xlsx")
d4 = pd.read_excel("9_warehouse_data.xlsx")
[ ]
d1.head(3)
[ ]
d2.head(3)
[ ]
d3.head(3)
[ ]
d4.head(3)
EDA
Now let's move on to the EDA part. We begin our EDA process by appending the imported datasets using the 'append' function, then removing the null values contained in our dataset — we can do this in Python using the 'dropna' function — and also replacing NaN values with the mode of each column using the 'fillna' function, since dropping too many values (both NaN and null) may lose data.
#using append( ) function
[ ]
final_data = d2.append([d1, d3, d4])
final_data.columns
Index(['DDate', 'AmcCode', 'AmcName', 'YardCode', 'YardName', 'CommCode',
'CommName', 'VarityCode', 'VarityName', 'Arrivals', 'Minimum',
'Maximum', 'Model', 'S.No', 'Crop', 'Dist_id', 'District', 'Year',
'Season', 'Season_yield', 'Total', 'SI. No', 'District id',
'Godown Capacity in MTs', 'MarketYard code', 'MarketYard', 'AMC code',
'AMC', 'Stage of Work', 'Lattitude', 'Longitude', 'Remarks', 'S. no',
'Region', 'Wh_type_code', 'Wh type', 'Warehouse_code', 'Warehouse',
'District_id', 'Capacity', 'Occupancy', 'Vacancy', 'Address', 'Status'],
dtype='object')
Here, we've appended the four datasets the yield_data, the marketyard_price data, the marketyard_location data, the warehouse_data
#using drop() function
[1]
final_data.drop(["DDate", "S.No", "AmcCode", "YardCode", "CommCode", "VarityCode", "S.No", "Dist_id", "MarketYard code", "SI. No", "District id", "AMC code", "S. no", "Wh_type_code","Warehouse_code","District_id","AmcName"], inplace = True, axis=1)
Using drop(), we have removed the columns that are duplicated or negligible, as determined from the descriptions in the Excel data dictionary of the related dataset.
[2]
final_data.isnull().sum()
[3]
final_data.mean()
[ ]
final_data.info()
#replacing the NaN values with the mean value using "fillna" function
[4]
##data cleaning
final_data['Arrivals'].fillna(value=final_data['Arrivals'].mean(), inplace=True)
final_data['Minimum'].fillna(value=final_data['Minimum'].mean(), inplace=True)
final_data['Maximum'].fillna(value=final_data['Maximum'].mean(), inplace=True)
final_data['Model'].fillna(value=final_data['Model'].mean(), inplace=True)
final_data['Season_yield'].fillna(value=final_data['Season_yield'].mean(), inplace=True)
final_data['Total'].fillna(value=final_data['Total'].mean(), inplace=True)
final_data['Lattitude'].fillna(value=final_data['Lattitude'].mean(), inplace=True)
final_data['Longitude'].fillna(value=final_data['Longitude'].mean(), inplace=True)
final_data['Capacity'].fillna(value=final_data['Capacity'].mean(), inplace=True)
final_data['Occupancy'].fillna(value=final_data['Occupancy'].mean(), inplace=True)
final_data['Vacancy'].fillna(value=final_data['Vacancy'].mean(), inplace=True)
[6]
final_data.isnull().sum()
[ ]
final_data.tail(3)
[ ]
final_data.head(3)
#replacing the NaN values with the mode value using "fillna" function
[7]
for column in ['Crop', 'Season', 'MarketYard', 'AMC','Remarks','Stage of Work','CommName','YardName','VarityName','District','Region','Wh type','Warehouse','Address','Status']:
final_data[column].fillna(final_data[column].mode()[0], inplace=True)
[8]
final_data.isnull().sum()
Column [Godown Capacity in MTs] has to be dropped because it has a 'mixed' inferred type (as determined by pandas). This is not currently supported; column types should not contain mixed data — e.g. only floats or only strings, but not a combination.
[ ]
final_data.drop(["Godown Capacity in MTs"], inplace = True, axis=1)
[ ]
final_data.info()
Data Visualization
In this process, we are going to produce different types of charts including heatmaps, barcharts by using seaborn, plotly and sweetviz visualization libraries
[ ]
plt.figure(figsize=(10,10))
sns.heatmap(final_data.corr(),annot=True,cmap='coolwarm')
#sweetviz
[9]
import sweetviz as sv
[10]
my_report = sv.analyze(final_data)
my_report.show_html()
#pandas_profiling
[11]
from pandas_profiling import ProfileReport
prof = ProfileReport(final_data)
prof.to_file(output_file='output.html')
[ ]
prof
Here, there are a few warnings in the pandas-profiling report about columns that need to be dropped, as they affect the accuracy score.
[ ]
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
[ ]
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
[ ]
groundnut = final_data[final_data['Crop'] == 'groundnut']
bengalgram= final_data[final_data['Crop'] == 'bengal gram']
maize = final_data[final_data['Crop'] == 'maize']
[ ]
groundnut['Total'].iplot(kind='box', title='Box plot of groundnut')
[ ]
bengalgram['Total'].iplot(kind='box', title='Box plot of bengalgram')
[ ]
maize['Total'].iplot(kind='box', title='Box plot of maize')
[ ]
sns.distplot(final_data['Model'], color = 'r')
plt.title('Model Distribution', fontsize = 16)
plt.xlabel('Model', fontsize = 14)
plt.ylabel('Frequency', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.savefig('distplot.png')
plt.show()
[ ]
final_data.drop([ "Minimum", "Capacity", "Occupancy", "MarketYard", "Arrivals", "YardName", "Region","Year","Stage of Work","Wh type","Address","Status","District","Warehouse","Remarks","Vacancy","Longitude","Lattitude","AMC"], inplace = True, axis=1)
Label encoding
In label encoding in Python, we replace the categorical value with a numeric value between 0 and the number of classes minus 1. If the categorical variable value contains 5 distinct classes, we use (0, 1, 2, 3, and 4)
[ ]
from sklearn.preprocessing import LabelEncoder
[ ]
enc =LabelEncoder()
final_data.CommName=enc.fit_transform(final_data.CommName)
final_data.VarityName=enc.fit_transform(final_data.VarityName)
final_data.Crop=enc.fit_transform(final_data.Crop)
final_data.Season=enc.fit_transform(final_data.Season)
[ ]
final_data.isnull().sum()
CommName 0
VarityName 0
Maximum 0
Model 0
Crop 0
Season 0
Season_yield 0
Total 0
dtype: int64
[ ]
final_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 43676 entries, 0 to 338
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CommName 43676 non-null int32
1 VarityName 43676 non-null int32
2 Maximum 43676 non-null float64
3 Model 43676 non-null float64
4 Crop 43676 non-null int32
5 Season 43676 non-null int32
6 Season_yield 43676 non-null float64
7 Total 43676 non-null float64
dtypes: float64(4), int32(4)
memory usage: 2.3 MB
Feature Selection & Data Split
In this process we are going to define the ‘X’ variable (independent variable) and the ‘Y’ variable (dependent variable). After defining the variables, we will use them to split the data into a train set and test set. Splitting the data can be done using the ‘train_test_split’ function provided by scikit-learn in python.
[ ]
x = final_data.drop(['Model'],axis=1)
y= final_data.Model
[ ]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=0)
[ ]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train=scaler.fit_transform(x_train)
y_train = scaler.fit_transform(y_train.values.reshape(-1,1))
[ ]
from sklearn.metrics import r2_score
Now that we have all our required elements to build our linear regression models. So, let’s proceed to our next step which is building the model using scikit-learn in python.
# --- Modeling: fit several regressors and compare R^2 on the test set ---

# Linear Regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)

# XGBoost (third-party).
# NOTE: the original imported accuracy_score — a classification metric
# with no meaning for regression — and re-split the data twice (once
# before even redefining x and y); a single split is enough.
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as MSE
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0)
model = XGBRegressor(learning_rate=0.1, random_state=0, n_estimators=10)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)

# KNN
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=5)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)

# Support Vector Machine (duplicate `from sklearn.svm import SVR` removed)
from sklearn.svm import SVR
model = SVR()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)

# DecisionTree Regressor
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)

# AdaBoost
# BUG FIX: the original constructed `adaboost_regressor` but then called
# fit/predict on `model` — still the DecisionTree from the previous cell —
# so AdaBoost was never actually trained or evaluated.
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor(n_estimators=1500, learning_rate=0.001,
                          loss='exponential')
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)
# --- Graphical analysis of the raw yield data (d1) ---
# pandas / numpy / matplotlib / plotly were already imported above and are
# re-imported here only because this section was written as a standalone
# notebook cell.  The unused `calendar` import was dropped.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# BUG FIX: the original wrote all three figures to the SAME file
# ("total-yield.html"), so each write_html overwrote the previous chart
# and only the last one survived.  Each figure now gets its own file.

# Total yield per crop, coloured by district.
fig = px.bar(d1, x="Crop", y="Total", color="District")
fig.show()
fig.write_html("total-yield-by-crop.html")

# Total yield per season, coloured by crop.
fig = px.bar(d1, x="Season", y="Total", color="Crop")
fig.show()
fig.write_html("total-yield-by-season.html")

# Total yield per district, coloured by crop.
fig = px.bar(d1, x="District", y="Total", color="Crop")
fig.show()
fig.write_html("total-yield-by-district.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment