Created
March 22, 2021 13:02
-
-
Save Zeptogreens/353f29dde6321d606c443c94e1344df4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Python Implementation: | |
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot as plt | |
from matplotlib import pyplot as plt | |
import seaborn as sns | |
%matplotlib inline | |
Importing Data | |
# Load the four source datasets (yield, market-yard prices,
# market-yard locations, warehouses) from Excel.
d1 = pd.read_excel("1_yield_data.xlsx")
d2 = pd.read_excel("4_marketyard_prices_data.xlsx")
d3 = pd.read_excel("8_market_yard_locations_data.xlsx")
d4 = pd.read_excel("9_warehouse_data.xlsx")

# Quick glimpse of each frame.
d1.head(3)
d2.head(3)
d3.head(3)
d4.head(3)  # BUG FIX: was d5.head(3) — d5 is never defined (NameError)
EDA | |
# Stack the four frames into one dataset.
# pd.concat replaces DataFrame.append, which is deprecated and removed in
# pandas 2.0; row order (d2 first) is preserved.
final_data = pd.concat([d2, d1, d3, d4])
final_data.columns

# Drop identifier/code columns flagged as redundant in the Excel data
# dictionary.  (The original list contained "S.No" twice.)
final_data.drop(
    ["DDate", "S.No", "AmcCode", "YardCode", "CommCode", "VarityCode",
     "Dist_id", "MarketYard code", "SI. No", "District id", "AMC code",
     "S. no", "Wh_type_code", "Warehouse_code", "District_id", "AmcName"],
    inplace=True, axis=1)

# Missingness and numeric summaries before cleaning.
final_data.isnull().sum()
final_data.mean()
final_data.info()
## Data cleaning: impute missing values.
# Numeric columns -> column mean.  A loop with assignment replaces eleven
# copy-pasted fillna(..., inplace=True) calls (chained inplace fillna on a
# column slice is deprecated in pandas).
for column in ["Arrivals", "Minimum", "Maximum", "Model", "Season_yield",
               "Total", "Lattitude", "Longitude", "Capacity", "Occupancy",
               "Vacancy"]:
    final_data[column] = final_data[column].fillna(final_data[column].mean())

final_data.isnull().sum()
final_data.tail(3)
final_data.head(3)

# Categorical columns -> most frequent value (mode).
for column in ['Crop', 'Season', 'MarketYard', 'AMC', 'Remarks',
               'Stage of Work', 'CommName', 'YardName', 'VarityName',
               'District', 'Region', 'Wh type', 'Warehouse', 'Address',
               'Status']:
    final_data[column] = final_data[column].fillna(final_data[column].mode()[0])

final_data.isnull().sum()

# Mixed-dtype column breaks profiling/correlation tooling -> drop it.
final_data.drop(["Godown Capacity in MTs"], inplace=True, axis=1)
final_data.info()
Data Visualization | |
# Correlation heatmap of numeric columns.  numeric_only=True keeps
# DataFrame.corr from raising on the remaining object-dtype columns.
plt.figure(figsize=(10, 10))
sns.heatmap(final_data.corr(numeric_only=True), annot=True, cmap='coolwarm')

# Automated one-shot EDA report (sweetviz).
import sweetviz as sv
my_report = sv.analyze(final_data)
my_report.show_html()

# pandas_profiling report written to output.html.
from pandas_profiling import ProfileReport
prof = ProfileReport(final_data)
prof.to_file(output_file='output.html')
prof

# Interactive plotting stack: plotly + cufflinks (offline mode).
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Per-crop subsets for box plots of total yield.
groundnut = final_data[final_data['Crop'] == 'groundnut']
bengalgram = final_data[final_data['Crop'] == 'bengal gram']
maize = final_data[final_data['Crop'] == 'maize']
groundnut['Total'].iplot(kind='box', title='Box plot of groundnut')
bengalgram['Total'].iplot(kind='box', title='Box plot of bengalgram')
maize['Total'].iplot(kind='box', title='Box plot of maize')

# Distribution of the modal price.  sns.histplot(..., kde=True) replaces
# sns.distplot, which is deprecated and removed in recent seaborn.
sns.histplot(final_data['Model'], color='r', kde=True)
plt.title('Model Distribution', fontsize=16)
plt.xlabel('Model', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('distplot.png')  # save before show(); show() clears the figure
plt.show()

# Keep only the modelling features; drop everything else.
final_data.drop(
    ["Minimum", "Capacity", "Occupancy", "MarketYard", "Arrivals",
     "YardName", "Region", "Year", "Stage of Work", "Wh type", "Address",
     "Status", "District", "Warehouse", "Remarks", "Vacancy", "Longitude",
     "Lattitude", "AMC"],
    inplace=True, axis=1)
Label encoding | |
# Label-encode the remaining categorical columns: each distinct class is
# mapped to an integer in 0..n_classes-1.
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
for col in ("CommName", "VarityName", "Crop", "Season"):
    final_data[col] = enc.fit_transform(final_data[col])

# Sanity checks: no missing values; all columns now numeric.
final_data.isnull().sum()
final_data.info()
Data Split | |
# Feature/target split: predict the modal price ('Model') from the rest.
x = final_data.drop(['Model'], axis=1)
y = final_data.Model

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0)

# Standardise the FEATURES only: fit the scaler on the training split and
# apply the same transform to the test split.
# BUG FIX: the original scaled x_train but left x_test unscaled, and scaled
# y_train but not y_test — so every later r2_score(y_test, y_predict)
# compared raw targets against predictions from a model trained on scaled
# targets.  Target scaling is unnecessary for these regressors and is removed.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

from sklearn.metrics import r2_score
Importing the required packages into our python environment | |
Our primary packages for this project are pandas for data processing, NumPy for working with arrays, matplotlib, seaborn, cufflinks, sweetviz, and pandas-profiling for data visualization, and finally scikit-learn for building and evaluating our ML model. Let’s import all the required packages into our Python environment.
Python Implementation: | |
[ ] | |
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot as plt | |
from matplotlib import pyplot as plt | |
import seaborn as sns | |
%matplotlib inline | |
Importing Data | |
Here, we are going to work with four datasets(the yield_data, the marketyard_price data, the marketyard_location data, the warehouse_data) respectively, that contains various features and information across the e-marketplaces and the crops. Using the ‘read_excel’ function provided by the Pandas package, we can import the data into our python environment. After importing the data, we can use the ‘head’ function to get a glimpse of our dataset. | |
# Load the four Excel workbooks: crop yields, market-yard prices,
# market-yard locations, and warehouse data.
d1 = pd.read_excel("1_yield_data.xlsx")
d2 = pd.read_excel("4_marketyard_prices_data.xlsx")
d3 = pd.read_excel("8_market_yard_locations_data.xlsx")
d4 = pd.read_excel("9_warehouse_data.xlsx")
# First three rows of each frame, as a quick sanity check.
d1.head(3)
d2.head(3)
d3.head(3)
d4.head(3)
EDA | |
Now let’s move on to the EDA part. We begin our EDA process by appending the imported datasets with the 'append' function, then removing the null values in our dataset — which we can do in Python using the ‘dropna’ function — and replacing the remaining NaN values with the mode of each column using the 'fillna' function, since dropping too many rows (both NaN and null) would lose data.
# Stack the four frames into a single dataset.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# supported equivalent and preserves the same row order (d2 first).
final_data = pd.concat([d2, d1, d3, d4])
final_data.columns
Index(['DDate', 'AmcCode', 'AmcName', 'YardCode', 'YardName', 'CommCode', | |
'CommName', 'VarityCode', 'VarityName', 'Arrivals', 'Minimum', | |
'Maximum', 'Model', 'S.No', 'Crop', 'Dist_id', 'District', 'Year', | |
'Season', 'Season_yield', 'Total', 'SI. No', 'District id', | |
'Godown Capacity in MTs', 'MarketYard code', 'MarketYard', 'AMC code', | |
'AMC', 'Stage of Work', 'Lattitude', 'Longitude', 'Remarks', 'S. no', | |
'Region', 'Wh_type_code', 'Wh type', 'Warehouse_code', 'Warehouse', | |
'District_id', 'Capacity', 'Occupancy', 'Vacancy', 'Address', 'Status'], | |
dtype='object') | |
Here, we've appended the four datasets the yield_data, the marketyard_price data, the marketyard_location data, the warehouse_data | |
# Drop code/identifier columns flagged as redundant in the Excel data
# dictionary.  The original list contained "S.No" twice; the duplicate
# entry is removed here.
final_data.drop(
    ["DDate", "S.No", "AmcCode", "YardCode", "CommCode", "VarityCode",
     "Dist_id", "MarketYard code", "SI. No", "District id", "AMC code",
     "S. no", "Wh_type_code", "Warehouse_code", "District_id", "AmcName"],
    inplace=True, axis=1)
Using drop(), we have removed the columns that are repeated or negligible, based on the descriptions in the Excel data dictionary of the related dataset.
# Missing-value counts and per-column means before imputation.
final_data.isnull().sum()
final_data.mean()
final_data.info()

## Data cleaning: impute each numeric column's NaNs with its mean.
# A loop with assignment replaces eleven copy-pasted
# fillna(..., inplace=True) calls (chained inplace fillna on a column
# slice is deprecated in pandas).
for column in ["Arrivals", "Minimum", "Maximum", "Model", "Season_yield",
               "Total", "Lattitude", "Longitude", "Capacity", "Occupancy",
               "Vacancy"]:
    final_data[column] = final_data[column].fillna(final_data[column].mean())
# Re-check missingness and glance at both ends of the frame.
final_data.isnull().sum()
final_data.tail(3)
final_data.head(3)

# Impute the categorical columns with their most frequent value (mode).
# Assigning back avoids the deprecated fillna(..., inplace=True) on a
# column slice.
for column in ['Crop', 'Season', 'MarketYard', 'AMC', 'Remarks',
               'Stage of Work', 'CommName', 'YardName', 'VarityName',
               'District', 'Region', 'Wh type', 'Warehouse', 'Address',
               'Status']:
    final_data[column] = final_data[column].fillna(final_data[column].mode()[0])

final_data.isnull().sum()
The column [Godown Capacity in MTs] has to be dropped because Pandas infers it as a 'mixed' type. This is not currently supported; columns should contain a single type of data — e.g. only floats or only strings — not a combination.
# The mixed-dtype godown-capacity column breaks profiling tools; remove it
# and re-inspect the frame.
final_data.drop(columns=["Godown Capacity in MTs"], inplace=True)
final_data.info()
Data Visualization | |
In this process, we are going to produce different types of charts including heatmaps, barcharts by using seaborn, plotly and sweetviz visualization libraries | |
# Correlation heatmap of the numeric columns.  numeric_only=True keeps
# DataFrame.corr from raising on the remaining object-dtype columns.
plt.figure(figsize=(10, 10))
sns.heatmap(final_data.corr(numeric_only=True), annot=True, cmap='coolwarm')

# sweetviz: automated one-shot EDA report.
import sweetviz as sv
my_report = sv.analyze(final_data)
my_report.show_html()

# pandas_profiling report, written to output.html.
from pandas_profiling import ProfileReport
prof = ProfileReport(final_data)
prof.to_file(output_file='output.html')
prof
Here, there are a few warnings in the pandas-profiling report about columns that need to be dropped, as they affect the accuracy score.
# Interactive plotting stack: plotly + cufflinks in offline mode.
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Per-crop subsets for box plots of the 'Total' yield.
groundnut = final_data[final_data['Crop'] == 'groundnut']
bengalgram = final_data[final_data['Crop'] == 'bengal gram']
maize = final_data[final_data['Crop'] == 'maize']

groundnut['Total'].iplot(kind='box', title='Box plot of groundnut')
bengalgram['Total'].iplot(kind='box', title='Box plot of bengalgram')
maize['Total'].iplot(kind='box', title='Box plot of maize')

# Distribution of the modal price.  sns.histplot(..., kde=True) replaces
# sns.distplot, which is deprecated and removed in recent seaborn.
sns.histplot(final_data['Model'], color='r', kde=True)
plt.title('Model Distribution', fontsize=16)
plt.xlabel('Model', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('distplot.png')  # save before show(); show() clears the figure
plt.show()
# Keep only the modelling features; drop every other column in one pass.
final_data.drop(
    columns=["Minimum", "Capacity", "Occupancy", "MarketYard", "Arrivals",
             "YardName", "Region", "Year", "Stage of Work", "Wh type",
             "Address", "Status", "District", "Warehouse", "Remarks",
             "Vacancy", "Longitude", "Lattitude", "AMC"],
    inplace=True)
Label encoding | |
In label encoding in Python, we replace the categorical value with a numeric value between 0 and the number of classes minus 1. If the categorical variable value contains 5 distinct classes, we use (0, 1, 2, 3, and 4) | |
# Encode each categorical column as integer labels 0..n_classes-1.
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
for col in ("CommName", "VarityName", "Crop", "Season"):
    final_data[col] = enc.fit_transform(final_data[col])
# Verify the encoded frame has no remaining missing values.
final_data.isnull().sum()
CommName 0 | |
VarityName 0 | |
Maximum 0 | |
Model 0 | |
Crop 0 | |
Season 0 | |
Season_yield 0 | |
Total 0 | |
dtype: int64 | |
# Final structure check: dtypes, non-null counts, memory usage.
final_data.info()
<class 'pandas.core.frame.DataFrame'> | |
Int64Index: 43676 entries, 0 to 338 | |
Data columns (total 8 columns): | |
# Column Non-Null Count Dtype | |
--- ------ -------------- ----- | |
0 CommName 43676 non-null int32 | |
1 VarityName 43676 non-null int32 | |
2 Maximum 43676 non-null float64 | |
3 Model 43676 non-null float64 | |
4 Crop 43676 non-null int32 | |
5 Season 43676 non-null int32 | |
6 Season_yield 43676 non-null float64 | |
7 Total 43676 non-null float64 | |
dtypes: float64(4), int32(4) | |
memory usage: 2.3 MB | |
Feature Selection & Data Split | |
In this process we are going to define the ‘X’ variable (independent variable) and the ‘Y’ variable (dependent variable). After defining the variables, we will use them to split the data into a train set and test set. Splitting the data can be done using the ‘train_test_split’ function provided by scikit-learn in python. | |
# Feature/target split: predict the modal price ('Model') from the rest.
x = final_data.drop(['Model'], axis=1)
y = final_data.Model

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0)

# Standardise the FEATURES only: fit on the training split, then apply the
# same fitted transform to the test split.
# BUG FIX: the original scaled x_train but never x_test, and scaled y_train
# but not y_test — every later r2_score(y_test, y_predict) therefore
# compared raw targets against predictions in a scaled target space.
# Target scaling is unnecessary for these regressors and is removed.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

from sklearn.metrics import r2_score
Now that we have all our required elements to build our linear regression models. So, let’s proceed to our next step which is building the model using scikit-learn in python. | |
####Modeling
#####Linear Regression
# Baseline model: ordinary least squares on the training split.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train,y_train)
# R^2 on the held-out 30% split (1.0 = perfect, 0.0 = mean predictor).
y_predict=model.predict(x_test)
r2_score(y_test,y_predict)
###XGBoost
# Gradient-boosted trees.  Cleanups: the unused `import xgboost as xg` and
# `accuracy_score` (a classification metric, meaningless for regression)
# are removed, as is the redundant first train_test_split whose result was
# immediately overwritten.
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as MSE

# Re-split on the unscaled features; tree ensembles need no standardisation.
x = final_data.drop(['Model'], axis=1)
y = final_data.Model
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0)

model = XGBRegressor(learning_rate=0.1, random_state=0, n_estimators=10)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)
###KNN
# k-nearest-neighbours regression with k = 5 (the sklearn default).
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=5).fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)
###Random Forest Regressor
# Bagged ensemble of 10 trees; fixed seed for reproducibility.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10, random_state=0).fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)
###Support Vector Machine
# Epsilon-SVR with the default RBF kernel.  The duplicated
# `from sklearn.svm import SVR` line is removed.
from sklearn.svm import SVR

model = SVR()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)
###DecisionTree Regressor
# Single CART regression tree; random_state pinned for reproducible splits.
# NOTE(review): the AdaBoost cell below reuses the name `model`, so this
# binding is load-bearing for later cells — do not rename.
from sklearn.tree import DecisionTreeRegressor
model=DecisionTreeRegressor(random_state=42)
model.fit(x_train,y_train)
y_predict=model.predict(x_test)
r2_score(y_test,y_predict)
###AdaBoost
# Boosted ensemble of shallow regressors with exponential loss.
from sklearn.ensemble import AdaBoostRegressor

# BUG FIX: the original constructed `adaboost_regressor` but then called
# model.fit(...) — i.e. it re-fitted and re-evaluated the previous
# DecisionTree; the AdaBoost model was never trained.  Binding it to
# `model` keeps the rest of the cell identical and actually trains it.
model = AdaBoostRegressor(n_estimators=1500, learning_rate=0.001,
                          loss='exponential')
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
r2_score(y_test, y_predict)
####Graphical analysis of the raw yield data (d1)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import calendar  # NOTE(review): unused in this section — confirm before removing

# Total yield per crop, broken down by district.
fig = px.bar(d1, x="Crop", y="Total", color="District")
fig.show()
# BUG FIX: all three charts were written to "total-yield.html", each
# overwriting the previous one; each chart now gets its own file.
fig.write_html("total-yield-by-crop.html")

# Total yield per season, broken down by crop.
fig = px.bar(d1, x="Season", y="Total", color="Crop")
fig.show()
fig.write_html("total-yield-by-season.html")

# Total yield per district, broken down by crop.
fig = px.bar(d1, x="District", y="Total", color="Crop")
fig.show()
fig.write_html("total-yield-by-district.html")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment