# Install specific libraries
!pip install transformers
!pip install pycaret

import numpy as np
import pandas as pd
import pycaret
import transformers
from transformers import AutoModel, BertTokenizerFast
import matplotlib.pyplot as plt
# plot_confusion_matrix was removed in scikit-learn 1.2; use ConfusionMatrixDisplay instead
from sklearn.metrics import ConfusionMatrixDisplay
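A minimal sketch of how the imported AutoModel and BertTokenizerFast are typically instantiated; the 'bert-base-uncased' checkpoint mirrors the one used further down and is an assumption here:

# Load a pretrained BERT encoder and its matching fast tokenizer (checkpoint name assumed)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')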
  
    
# Set up environment
!pip install transformers

from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')

# To freak you out :)
unmasker("Artificial Intelligence [MASK] take over the world.")

# Understanding context...
unmasker("My wife is so obsessed with cleanliness, that [MASK] will throw me out of the house one day.")
  
    
# Library imports
import pandas as pd
import numpy as np
import spacy
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
import string
from spacy.lang.en.stop_words import STOP_WORDS
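These imports point toward a TF-IDF + LinearSVC text classifier with spaCy-based preprocessing. A minimal sketch of how the pieces typically fit together; the spacy_tokenizer helper, the en_core_web_sm model, and the fit/dump calls are assumptions, not the original code:

# Hypothetical assembly of the imported pieces into one text-classification pipeline
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(text):
    # Lemmatize, lowercase, and drop stop words and punctuation
    doc = nlp(text)
    return [tok.lemma_.lower() for tok in doc
            if tok.lemma_.lower() not in STOP_WORDS and tok.text not in string.punctuation]

clf = Pipeline([
    ("tfidf", TfidfVectorizer(tokenizer=spacy_tokenizer)),
    ("svc", LinearSVC()),
])
# clf.fit(train_texts, train_labels)           # train_texts/train_labels are placeholders
# joblib.dump(clf, "text_classifier.joblib")   # persist the fitted pipeline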
  
    
# Split data into an 80:20 train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Model training
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

# Model R2 on Training & Test
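To act on the last comment, the regressor's built-in score() already returns R²; a minimal check:

# R² of the fitted random forest on training and test data
print("Train R2:", rf_reg.score(X_train, y_train))
print("Test R2:", rf_reg.score(X_test, y_test))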
  
    
# Drop 'Source_Delhi'
X = data_train.loc[:, ['Total_Stops', 'journey_day', 'journey_month', 'dep_hour',
                       'dep_min', 'arrival_hour', 'arrival_min', 'Duration_hours',
                       'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
                       'Airline_Jet Airways', 'Airline_Multiple carriers', 'Airline_Other',
                       'Airline_SpiceJet', 'Airline_Vistara', 'Source_Chennai',
                       'Source_Kolkata', 'Source_Mumbai', 'Destination_Cochin',
                       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata']]
X.head()
  
    
# Checking for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(z):
    # Calculate the Variance Inflation Factor (VIF) for every column of z
    vif = pd.DataFrame()
    vif["variables"] = z.columns
    vif["VIF"] = [variance_inflation_factor(z.values, i) for i in range(z.shape[1])]
    return vif

# Compute VIF on X
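To act on the last comment, the helper can be called directly on the feature frame:

# VIF per feature; values above roughly 5-10 usually flag problematic collinearity
print(calc_vif(X))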
  
    
X = data_train.loc[:, ['Total_Stops', 'journey_day', 'journey_month', 'dep_hour',
                       'dep_min', 'arrival_hour', 'arrival_min', 'Duration_hours',
                       'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
                       'Airline_Jet Airways', 'Airline_Multiple carriers', 'Airline_Other',
                       'Airline_SpiceJet', 'Airline_Vistara', 'Source_Chennai', 'Source_Delhi',
                       'Source_Kolkata', 'Source_Mumbai', 'Destination_Cochin',
                       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata']]
y = data_train.iloc[:, 1]
print(X.shape, y.shape)
  
    
# Concatenate dataframes --> train_data + Airline + Source + Destination
data_train = pd.concat([dataset, Airline, Source, Destination], axis = 1)  # axis = 1 concatenates column-wise
data_train.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)
data_train.head()
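The Airline, Source and Destination frames concatenated above are one-hot encodings of the corresponding columns. A sketch of how they are typically produced with pd.get_dummies; the exact arguments are assumptions:

# Hypothetical construction of the dummy frames used in the concat above
Airline = pd.get_dummies(dataset[["Airline"]])
Source = pd.get_dummies(dataset[["Source"]])
Destination = pd.get_dummies(dataset[["Destination"]])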
  
    
# Additional_Info is almost 80% "No info"
# Route and Total_Stops carry the same information
dataset.drop(["Route", "Additional_Info"], axis = 1, inplace = True)

# Feature engineering on: Total_Stops
print(dataset["Total_Stops"].value_counts())

# Total_Stops is ordinal, so label-encode it: each category maps to its number of stops
dataset.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace = True)
dataset.head()
  
    
# Feature engineering on: Destination
print(dataset["Destination"].value_counts())

# Rename destination 'New Delhi' to 'Delhi' so it matches the Source labels
Destination = dataset[["Destination"]]
Current_Destination_List = Destination['Destination']
New_Destination_List = []
for value in Current_Destination_List:
    if value in ['New Delhi']:
        New_Destination_List.append('Delhi')
    else: