Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Pranjal-Soni/0298c968fa42bbdbf8215bcebcb84a79 to your computer and use it in GitHub Desktop.
Save Pranjal-Soni/0298c968fa42bbdbf8215bcebcb84a79 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fake Job Description Prediction\n",
"\n",
"### About Dataset :\n",
"This dataset contains 18K job descriptions out of which about 800 are fake. The data consists of both textual information and meta-information about the jobs. The dataset can be used to create classification models which can learn the job descriptions which are fraudulent."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import nltk\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"WNlemma = nltk.WordNetLemmatizer()\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#importing the data\n",
"data = pd.read_csv('fake_job_postings.csv')\n",
"target = data['fraudulent']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploratory Data Analysis"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"job_id 0.000000\n",
"title 0.000000\n",
"location 1.935123\n",
"department 64.580537\n",
"salary_range 83.959732\n",
"company_profile 18.501119\n",
"description 0.005593\n",
"requirements 15.072707\n",
"benefits 40.324385\n",
"telecommuting 0.000000\n",
"has_company_logo 0.000000\n",
"has_questions 0.000000\n",
"employment_type 19.412752\n",
"required_experience 39.429530\n",
"required_education 45.329978\n",
"industry 27.421700\n",
"function 36.101790\n",
"fraudulent 0.000000\n",
"dtype: float64\n"
]
}
],
"source": [
"print((data.isna().sum()/len(data))*100)\n",
"#drop department and salary_range because it have about 60 persent null values and removing irrelavent data from the dataset\n",
"data.drop(['job_id','salary_range','department','benefits'],axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>location</th>\n",
" <th>company_profile</th>\n",
" <th>description</th>\n",
" <th>requirements</th>\n",
" <th>telecommuting</th>\n",
" <th>has_company_logo</th>\n",
" <th>has_questions</th>\n",
" <th>employment_type</th>\n",
" <th>required_experience</th>\n",
" <th>required_education</th>\n",
" <th>industry</th>\n",
" <th>function</th>\n",
" <th>fraudulent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Marketing Intern</td>\n",
" <td>US, NY, New York</td>\n",
" <td>We're Food52, and we've created a groundbreaki...</td>\n",
" <td>Food52, a fast-growing, James Beard Award-winn...</td>\n",
" <td>Experience with content management systems a m...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Other</td>\n",
" <td>Internship</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Marketing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Customer Service - Cloud Video Production</td>\n",
" <td>NZ, , Auckland</td>\n",
" <td>90 Seconds, the worlds Cloud Video Production ...</td>\n",
" <td>Organised - Focused - Vibrant - Awesome!Do you...</td>\n",
" <td>What we expect from you:Your key responsibilit...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Full-time</td>\n",
" <td>Not Applicable</td>\n",
" <td>NaN</td>\n",
" <td>Marketing and Advertising</td>\n",
" <td>Customer Service</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Commissioning Machinery Assistant (CMA)</td>\n",
" <td>US, IA, Wever</td>\n",
" <td>Valor Services provides Workforce Solutions th...</td>\n",
" <td>Our client, located in Houston, is actively se...</td>\n",
" <td>Implement pre-commissioning and commissioning ...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Account Executive - Washington DC</td>\n",
" <td>US, DC, Washington</td>\n",
" <td>Our passion for improving quality of life thro...</td>\n",
" <td>THE COMPANY: ESRI – Environmental Systems Rese...</td>\n",
" <td>EDUCATION: Bachelor’s or Master’s in GIS, busi...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td>Bachelor's Degree</td>\n",
" <td>Computer Software</td>\n",
" <td>Sales</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Bill Review Manager</td>\n",
" <td>US, FL, Fort Worth</td>\n",
" <td>SpotSource Solutions LLC is a Global Human Cap...</td>\n",
" <td>JOB TITLE: Itemization Review ManagerLOCATION:...</td>\n",
" <td>QUALIFICATIONS:RN license in the State of Texa...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td>Bachelor's Degree</td>\n",
" <td>Hospital &amp; Health Care</td>\n",
" <td>Health Care Provider</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title location \\\n",
"0 Marketing Intern US, NY, New York \n",
"1 Customer Service - Cloud Video Production NZ, , Auckland \n",
"2 Commissioning Machinery Assistant (CMA) US, IA, Wever \n",
"3 Account Executive - Washington DC US, DC, Washington \n",
"4 Bill Review Manager US, FL, Fort Worth \n",
"\n",
" company_profile \\\n",
"0 We're Food52, and we've created a groundbreaki... \n",
"1 90 Seconds, the worlds Cloud Video Production ... \n",
"2 Valor Services provides Workforce Solutions th... \n",
"3 Our passion for improving quality of life thro... \n",
"4 SpotSource Solutions LLC is a Global Human Cap... \n",
"\n",
" description \\\n",
"0 Food52, a fast-growing, James Beard Award-winn... \n",
"1 Organised - Focused - Vibrant - Awesome!Do you... \n",
"2 Our client, located in Houston, is actively se... \n",
"3 THE COMPANY: ESRI – Environmental Systems Rese... \n",
"4 JOB TITLE: Itemization Review ManagerLOCATION:... \n",
"\n",
" requirements telecommuting \\\n",
"0 Experience with content management systems a m... 0 \n",
"1 What we expect from you:Your key responsibilit... 0 \n",
"2 Implement pre-commissioning and commissioning ... 0 \n",
"3 EDUCATION: Bachelor’s or Master’s in GIS, busi... 0 \n",
"4 QUALIFICATIONS:RN license in the State of Texa... 0 \n",
"\n",
" has_company_logo has_questions employment_type required_experience \\\n",
"0 1 0 Other Internship \n",
"1 1 0 Full-time Not Applicable \n",
"2 1 0 NaN NaN \n",
"3 1 0 Full-time Mid-Senior level \n",
"4 1 1 Full-time Mid-Senior level \n",
"\n",
" required_education industry function \\\n",
"0 NaN NaN Marketing \n",
"1 NaN Marketing and Advertising Customer Service \n",
"2 NaN NaN NaN \n",
"3 Bachelor's Degree Computer Software Sales \n",
"4 Bachelor's Degree Hospital & Health Care Health Care Provider \n",
"\n",
" fraudulent \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],\n",
" <a list of 20 Text xticklabel objects>)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#Plotting the bar plot jobs funtion counts\n",
"plt.figure(figsize=(12,6))\n",
"plt.bar(data.function.value_counts().index[:20],data.function.value_counts()[:20])\n",
"plt.title('Top 20 Job Functions')\n",
"plt.xlabel('Job function Name')\n",
"plt.ylabel(\"No. of Job Functions\")\n",
"degrees = 75\n",
"plt.xticks(rotation=degrees)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'Industry Name')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#plot top 20 industries which give most of job posting\n",
"plt.figure(figsize=(10,6))\n",
"plt.barh(data.industry.value_counts()[:20].index,data.industry.value_counts()[:20])\n",
"plt.title('Top 20 Industries With More Jobs')\n",
"plt.xlabel('No. of Jobs')\n",
"plt.ylabel(\"Industry Name\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of fake Job description: 866\n",
"Number of correct job description: 17014\n"
]
}
],
"source": [
"#separate both data to analyse them\n",
"fradulent_data = data[data.fraudulent==1]\n",
"non_fradulent_data = data[data.fraudulent==0]\n",
"\n",
"print('Number of fake Job description: ',len(fradulent_data))\n",
"print('Number of correct job description: ',len(non_fradulent_data))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"title 0.000000\n",
"location 2.193995\n",
"company_profile 67.782910\n",
"description 0.115473\n",
"requirements 17.782910\n",
"telecommuting 0.000000\n",
"has_company_logo 0.000000\n",
"has_questions 0.000000\n",
"employment_type 27.829099\n",
"required_experience 50.230947\n",
"required_education 52.078522\n",
"industry 31.755196\n",
"function 38.914550\n",
"fraudulent 0.000000\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Missing values in fake jobs data\n",
"fradulent_data.isna().sum()/866*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"title 0.000000\n",
"location 1.921947\n",
"company_profile 15.992712\n",
"description 0.000000\n",
"requirements 14.934760\n",
"telecommuting 0.000000\n",
"has_company_logo 0.000000\n",
"has_questions 0.000000\n",
"employment_type 18.984366\n",
"required_experience 38.879746\n",
"required_education 44.986482\n",
"industry 27.201128\n",
"function 35.958622\n",
"fraudulent 0.000000\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#missing values in correct jobs\n",
"non_fradulent_data.isna().sum()/len(non_fradulent_data)*100"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#removing target values form dataset\n",
"data.drop(['fraudulent'],axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['title', 'location', 'company_profile', 'description', 'requirements',\n",
" 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',\n",
" 'required_experience', 'required_education', 'industry', 'function'],\n",
" dtype='object')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cleaning and preprocessing the data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def remove_links(text):\n",
" text = re.sub(r'http://[\\w|\\S]+',' ',str(text))\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def lower_text(text):\n",
" text = str(text)\n",
" text = ' '.join(x.lower() for x in text.split())\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"stop = stopwords.words('english')\n",
"def remove_stopWords(text):\n",
" text = str(text)\n",
" text = ''.join( x for x in text if x.split() not in stop)\n",
" return(text)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def remove_non_imp_words(series):\n",
" most_freq = pd.Series(' '.join(series).split()).value_counts()[:30]\n",
" less_freq = pd.Series(' '.join(series).split()).value_counts()[-30:]\n",
" series = series.apply(lambda x: \" \".join(x for x in x.split() if x not in most_freq))\n",
" series = series.apply(lambda x: \" \".join(x for x in x.split() if x not in less_freq))\n",
" return series\n",
"\n",
"def remove_nan(text):\n",
" if text == 'nan' or text == '':\n",
" text = 'not given'\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def data_cleaning(feature):\n",
" feature = feature.apply(lambda x : remove_links(x))\n",
" feature = feature.apply(lambda x : lower_text(x))\n",
" feature = feature.apply(lambda x : remove_stopWords(x))\n",
" feature = feature.str.replace('[^\\w\\s]',' 111')\n",
" feature = remove_non_imp_words(feature)\n",
" feature = feature.apply(lambda x: ' '.join([WNlemma.lemmatize(word) for word in x.split()]))\n",
" feature = feature.apply(lambda x : remove_nan(x))\n",
" return feature\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"data['required_education'].fillna('no_info_about_education',inplace = True)\n",
"data['employment_type'].fillna('no_info_about_employment',inplace = True)\n",
"data['required_experience'].fillna('experience_not_asked',inplace = True)\n",
"data['industry'].fillna('industry_not_given',inplace = True)\n",
"data['function'].fillna('function_not_given',inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Pranjal Soni\\AppData\\Roaming\\Python\\Python37\\site-packages\\ipykernel_launcher.py:6: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \n"
]
}
],
"source": [
"#dealing with company profile feature\n",
"for i in range(len(data.company_profile)):\n",
" if data.company_profile[i]=='NaN':\n",
" data.company_profile[i] = 'company_profile_not_given'\n",
" else:\n",
" data.company_profile[i] = 'company_profile_given'"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"#dealing catogorical data\n",
"cat_cols = ['employment_type','required_experience','required_education','industry','function','company_profile']\n",
"for c in cat_cols:\n",
" encoded = pd.get_dummies(data[c])\n",
" data = pd.concat([data,encoded],axis = 1 )\n",
"cat_cols = ['employment_type','required_experience','required_education','industry','function','title','location','company_profile']\n",
"data.drop(cat_cols,axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"#dealing with text data\n",
"description = data['description']+ ' ' + data['requirements']\n",
"description = data_cleaning(description)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"tfidf = TfidfVectorizer( min_df = 0.05, ngram_range=(1,3))\n",
"tfidf_features = tfidf.fit_transform(description) \n",
"tfidf_vect_df = pd.DataFrame(tfidf_features.todense(), columns = tfidf.get_feature_names())\n",
"data = pd.concat([data, tfidf_vect_df], axis = 1)\n",
"data.drop(['description','requirements'],axis = 1 , inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(17880, 727)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Applying different classification algorithms"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.metrics import accuracy_score,classification_report\n",
"from sklearn.metrics import precision_score,recall_score\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.model_selection import StratifiedKFold\n",
"\n",
"X = data\n",
"y = target"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils import resample\n",
"\n",
"def oversampling(X_train,y_train,target_name):\n",
" \n",
" training_set = pd.concat([X_train, y_train], axis=1)\n",
" \n",
" # Separating classes\n",
" fraud = training_set[training_set[target_name] == 1]\n",
" not_fraud = training_set[training_set[target_name] == 0]\n",
" \n",
" oversample = resample(fraud, \n",
" replace=True, \n",
" n_samples=len(not_fraud),\n",
" random_state=42)\n",
" \n",
" # Returning to new training set\n",
" oversample_train = pd.concat([not_fraud, oversample])\n",
" oversample_train[target_name].value_counts(normalize=True)\n",
" \n",
" # Separate oversampled data into X and y sets\n",
" oversample_x_train = oversample_train.drop('fraudulent', axis=1)\n",
" oversample_y_train = oversample_train[target_name]\n",
" return(oversample_x_train, oversample_y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression Model"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.94967305855558\n",
"Test Data Accuracy : 0.9172259507829977\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[12518 1093]\n",
" [ 277 13334]]\n",
"Confusion Matrix of Test Data : \n",
" [[3134 269]\n",
" [ 27 146]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.92 0.95 3403\n",
" 1 0.35 0.84 0.50 173\n",
"\n",
" accuracy 0.92 3576\n",
" macro avg 0.67 0.88 0.73 3576\n",
"weighted avg 0.96 0.92 0.93 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9454485342737492\n",
"Test Data Accuracy : 0.9012863534675615\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[12502 1109]\n",
" [ 376 13235]]\n",
"Confusion Matrix of Test Data : \n",
" [[3076 327]\n",
" [ 26 147]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.90 0.95 3403\n",
" 1 0.31 0.85 0.45 173\n",
"\n",
" accuracy 0.90 3576\n",
" macro avg 0.65 0.88 0.70 3576\n",
"weighted avg 0.96 0.90 0.92 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9480567188303578\n",
"Test Data Accuracy : 0.9077181208053692\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[12552 1059]\n",
" [ 355 13256]]\n",
"Confusion Matrix of Test Data : \n",
" [[3094 309]\n",
" [ 21 152]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.91 0.95 3403\n",
" 1 0.33 0.88 0.48 173\n",
"\n",
" accuracy 0.91 3576\n",
" macro avg 0.66 0.89 0.71 3576\n",
"weighted avg 0.96 0.91 0.93 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.949636323561825\n",
"Test Data Accuracy : 0.9049217002237137\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[12535 1076]\n",
" [ 295 13316]]\n",
"Confusion Matrix of Test Data : \n",
" [[3090 313]\n",
" [ 27 146]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.91 0.95 3403\n",
" 1 0.32 0.84 0.46 173\n",
"\n",
" accuracy 0.90 3576\n",
" macro avg 0.65 0.88 0.70 3576\n",
"weighted avg 0.96 0.90 0.92 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9475095503967088\n",
"Test Data Accuracy : 0.9119127516778524\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[12538 1074]\n",
" [ 355 13257]]\n",
"Confusion Matrix of Test Data : \n",
" [[3121 281]\n",
" [ 34 140]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.92 0.95 3402\n",
" 1 0.33 0.80 0.47 174\n",
"\n",
" accuracy 0.91 3576\n",
" macro avg 0.66 0.86 0.71 3576\n",
"weighted avg 0.96 0.91 0.93 3576\n",
"\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"lr_clf = LogisticRegression(n_jobs=-1)\n",
"\n",
"kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)\n",
"for train_ix, test_ix in kfold.split(X_new,y):\n",
" print(' ********************************************** ')\n",
" #select rows\n",
" X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]\n",
" y_train, y_test = y[train_ix], y[test_ix]\n",
" \n",
" oversample_x_train, oversample_y_train = oversampling(X_train,y_train,'fraudulent')\n",
" \n",
" \n",
" #fit the model with training data\n",
" lr_clf.fit(oversample_x_train, oversample_y_train)\n",
" \n",
" #feature_selection\n",
" sfm = SelectFromModel(lr_clf,threshold=0.002)\n",
" sfm.fit(oversample_x_train, oversample_y_train)\n",
" \n",
" X_important_train = sfm.transform(oversample_x_train)\n",
" X_important_test = sfm.transform(X_test)\n",
" \n",
" lr_clf = LogisticRegression()\n",
" lr_clf.fit(X_important_train, oversample_y_train)\n",
" y_train_pred = lr_clf.predict(X_important_train)\n",
" y_test_pred = lr_clf.predict(X_important_test)\n",
" #evalute the model\n",
" print('Train Data Accuracy : ',accuracy_score( oversample_y_train,y_train_pred))\n",
" print('Test Data Accuracy : ',accuracy_score(y_test,y_test_pred))\n",
" \n",
" print()\n",
" print('Confusion Matrix of Train Data : \\n',confusion_matrix( oversample_y_train,y_train_pred))\n",
" print('Confusion Matrix of Test Data : \\n',confusion_matrix(y_test,y_test_pred))\n",
" print()\n",
" #pricision and recall for test and train data\n",
" print(classification_report(y_test,y_test_pred))\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## KNN Model"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9530159429872896\n",
"Test Data Accuracy : 0.9544183445190156\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13290 321]\n",
" [ 958 12653]]\n",
"Confusion Matrix of Test Data : \n",
" [[3278 125]\n",
" [ 38 135]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.96 0.98 3403\n",
" 1 0.52 0.78 0.62 173\n",
"\n",
" accuracy 0.95 3576\n",
" macro avg 0.75 0.87 0.80 3576\n",
"weighted avg 0.97 0.95 0.96 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9511791932995372\n",
"Test Data Accuracy : 0.9555369127516778\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13308 303]\n",
" [ 1026 12585]]\n",
"Confusion Matrix of Test Data : \n",
" [[3288 115]\n",
" [ 44 129]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.97 0.98 3403\n",
" 1 0.53 0.75 0.62 173\n",
"\n",
" accuracy 0.96 3576\n",
" macro avg 0.76 0.86 0.80 3576\n",
"weighted avg 0.96 0.96 0.96 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9570200573065902\n",
"Test Data Accuracy : 0.9538590604026845\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13294 317]\n",
" [ 853 12758]]\n",
"Confusion Matrix of Test Data : \n",
" [[3285 118]\n",
" [ 47 126]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.97 0.98 3403\n",
" 1 0.52 0.73 0.60 173\n",
"\n",
" accuracy 0.95 3576\n",
" macro avg 0.75 0.85 0.79 3576\n",
"weighted avg 0.96 0.95 0.96 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9548159576812872\n",
"Test Data Accuracy : 0.9530201342281879\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13302 309]\n",
" [ 921 12690]]\n",
"Confusion Matrix of Test Data : \n",
" [[3277 126]\n",
" [ 42 131]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.96 0.98 3403\n",
" 1 0.51 0.76 0.61 173\n",
"\n",
" accuracy 0.95 3576\n",
" macro avg 0.75 0.86 0.79 3576\n",
"weighted avg 0.96 0.95 0.96 3576\n",
"\n",
"\n",
" ********************************************** \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Data Accuracy : 0.9522847487511019\n",
"Test Data Accuracy : 0.9569351230425056\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13311 301]\n",
" [ 998 12614]]\n",
"Confusion Matrix of Test Data : \n",
" [[3287 115]\n",
" [ 39 135]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.97 0.98 3402\n",
" 1 0.54 0.78 0.64 174\n",
"\n",
" accuracy 0.96 3576\n",
" macro avg 0.76 0.87 0.81 3576\n",
"weighted avg 0.97 0.96 0.96 3576\n",
"\n",
"\n"
]
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"# Stratified 5-fold cross-validation; the fixed random_state makes the folds reproducible.\n",
"# NOTE(review): split() receives X_new while the rows below are taken from X - confirm both have the same number of rows.\n",
"kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)\n",
"for train_ix, test_ix in kfold.split(X_new,y):\n",
"    print(' ********************************************** ')\n",
"    # select the rows of this fold\n",
"    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]\n",
"    y_train, y_test = y[train_ix], y[test_ix]\n",
"\n",
"    # only the training rows go through oversampling(); the test fold is evaluated as-is\n",
"    oversample_x_train, oversample_y_train = oversampling(X_train,y_train,'fraudulent')\n",
"\n",
"    # Fit a fresh KNN for this fold. Only the KNN model is trained in this section:\n",
"    # lr_clf from the logistic-regression section is deliberately left untouched.\n",
"    # n_jobs=-1 only parallelises the neighbour search; it does not change predictions.\n",
"    knn_clf = KNeighborsClassifier(n_jobs=-1)\n",
"    knn_clf.fit(oversample_x_train, oversample_y_train)\n",
"    y_train_pred = knn_clf.predict(oversample_x_train)\n",
"    y_test_pred = knn_clf.predict(X_test)\n",
"\n",
"    # evaluate the model (train scores are measured on the oversampled training data)\n",
"    print('Train Data Accuracy : ',accuracy_score( oversample_y_train,y_train_pred))\n",
"    print('Test Data Accuracy : ',accuracy_score(y_test,y_test_pred))\n",
"\n",
"    print()\n",
"    print('Confusion Matrix of Train Data : \\n',confusion_matrix( oversample_y_train,y_train_pred))\n",
"    print('Confusion Matrix of Test Data : \\n',confusion_matrix(y_test,y_test_pred))\n",
"    print()\n",
"    # precision and recall on the test fold\n",
"    print(classification_report(y_test,y_test_pred))\n",
"    print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Model"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ********************************************** \n",
"Train Data Accuracy : 0.9865549922856514\n",
"Test Data Accuracy : 0.9711968680089486\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13431 180]\n",
" [ 186 13425]]\n",
"Confusion Matrix of Test Data : \n",
" [[3359 44]\n",
" [ 59 114]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.99 0.98 3403\n",
" 1 0.72 0.66 0.69 173\n",
"\n",
" accuracy 0.97 3576\n",
" macro avg 0.85 0.82 0.84 3576\n",
"weighted avg 0.97 0.97 0.97 3576\n",
"\n",
"\n",
" ********************************************** \n",
"Train Data Accuracy : 0.9805304533098229\n",
"Test Data Accuracy : 0.9541387024608501\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13355 256]\n",
" [ 274 13337]]\n",
"Confusion Matrix of Test Data : \n",
" [[3304 99]\n",
" [ 65 108]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.97 0.98 3403\n",
" 1 0.52 0.62 0.57 173\n",
"\n",
" accuracy 0.95 3576\n",
" macro avg 0.75 0.80 0.77 3576\n",
"weighted avg 0.96 0.95 0.96 3576\n",
"\n",
"\n",
" ********************************************** \n",
"Train Data Accuracy : 0.9814855631474543\n",
"Test Data Accuracy : 0.9661633109619687\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13335 276]\n",
" [ 228 13383]]\n",
"Confusion Matrix of Test Data : \n",
" [[3336 67]\n",
" [ 54 119]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.98 0.98 3403\n",
" 1 0.64 0.69 0.66 173\n",
"\n",
" accuracy 0.97 3576\n",
" macro avg 0.81 0.83 0.82 3576\n",
"weighted avg 0.97 0.97 0.97 3576\n",
"\n",
"\n",
" ********************************************** \n",
"Train Data Accuracy : 0.9812284181911689\n",
"Test Data Accuracy : 0.9619686800894854\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13330 281]\n",
" [ 230 13381]]\n",
"Confusion Matrix of Test Data : \n",
" [[3322 81]\n",
" [ 55 118]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.98 0.98 3403\n",
" 1 0.59 0.68 0.63 173\n",
"\n",
" accuracy 0.96 3576\n",
" macro avg 0.79 0.83 0.81 3576\n",
"weighted avg 0.96 0.96 0.96 3576\n",
"\n",
"\n",
" ********************************************** \n",
"Train Data Accuracy : 0.9840581839553335\n",
"Test Data Accuracy : 0.9600111856823266\n",
"\n",
"Confusion Matrix of Train Data : \n",
" [[13408 204]\n",
" [ 230 13382]]\n",
"Confusion Matrix of Test Data : \n",
" [[3334 68]\n",
" [ 75 99]]\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.98 0.98 3402\n",
" 1 0.59 0.57 0.58 174\n",
"\n",
" accuracy 0.96 3576\n",
" macro avg 0.79 0.77 0.78 3576\n",
"weighted avg 0.96 0.96 0.96 3576\n",
"\n",
"\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Untuned forest used only to rank features inside SelectFromModel.\n",
"# It has its own name so that the tuned model assigned to rf_clf inside the loop\n",
"# is never reused as the feature-selection estimator on later folds.\n",
"selector_rf = RandomForestClassifier(n_jobs=-1)\n",
"\n",
"# Stratified 5-fold cross-validation; the fixed random_state makes the folds reproducible.\n",
"# NOTE(review): split() receives X_new while the rows below are taken from X - confirm both have the same number of rows.\n",
"kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)\n",
"for train_ix, test_ix in kfold.split(X_new,y):\n",
"    print(' ********************************************** ')\n",
"    # select the rows of this fold\n",
"    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]\n",
"    y_train, y_test = y[train_ix], y[test_ix]\n",
"\n",
"    # only the training rows go through oversampling(); the test fold is evaluated as-is\n",
"    oversample_x_train, oversample_y_train = oversampling(X_train,y_train,'fraudulent')\n",
"\n",
"    # feature selection: keep the columns whose importance reaches the 0.002 threshold.\n",
"    # SelectFromModel.fit() fits its own clone of selector_rf, so no separate fit call is needed.\n",
"    sfm = SelectFromModel(selector_rf,threshold=0.002)\n",
"    sfm.fit(oversample_x_train, oversample_y_train)\n",
"\n",
"    X_important_train = sfm.transform(oversample_x_train)\n",
"    X_important_test = sfm.transform(X_test)\n",
"\n",
"    # final model for this fold, trained on the selected columns only\n",
"    rf_clf = RandomForestClassifier(n_estimators=459, max_depth=30,oob_score=True,n_jobs=-1)\n",
"    rf_clf.fit(X_important_train, oversample_y_train)\n",
"    y_train_pred = rf_clf.predict(X_important_train)\n",
"    y_test_pred = rf_clf.predict(X_important_test)\n",
"\n",
"    # evaluate the model (train scores are measured on the oversampled training data)\n",
"    print('Train Data Accuracy : ',accuracy_score( oversample_y_train,y_train_pred))\n",
"    print('Test Data Accuracy : ',accuracy_score(y_test,y_test_pred))\n",
"\n",
"    print()\n",
"    print('Confusion Matrix of Train Data : \\n',confusion_matrix( oversample_y_train,y_train_pred))\n",
"    print('Confusion Matrix of Test Data : \\n',confusion_matrix(y_test,y_test_pred))\n",
"    print()\n",
"    # precision and recall on the test fold\n",
"    print(classification_report(y_test,y_test_pred))\n",
"    print()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"has_company_logo, has_questions, Full-time, no_info_about_employment, Associate, Mid-Senior level, experience_not_asked, Bachelor's Degree, High School or equivalent, Accounting, Computer Software, Hospital & Health Care, Internet, Oil & Energy, industry_not_given, Administrative, Engineering, function_not_given, 111amp, 111re, 111you, ability, about, area, at, based, benefit, best, but, by, call, candidate, care, client, communication, company, computer, control, creative, degree, deliver, developer, development, duty, engineering, english, environment, excel, field, first, flexible, from, full, get, growing, growth, ha, high, home, hour, if, industry, information, internet, it, job, join, knowledge, looking, love, management, marketing, medium, mobile, month, more, multiple, must, new, no, not, office, only, other, own, passionate, per, perform, personal, phone, platform, play, position, preferred, problem, process, product, provide, required, requirement, responsibility, right, role, sale, schedule, school diploma, seeking, service, social, software, strategy, system, technical, their, them, this, time, university, user, web, who, within, without, word, working, year, your, "
]
}
],
"source": [
"# Positions of the columns kept by sfm, the selector fitted in the last cross-validation fold.\n",
"selected_positions = sfm.get_support(indices=True)\n",
"# NOTE(review): names are looked up in data.columns, while sfm was fitted on oversample_x_train\n",
"# (built from X) - confirm that data and X share the same column order.\n",
"for column_name in data.columns[selected_positions]:\n",
"    print(column_name,end= \", \")"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"#Saving the models\n",
"import pickle\n",
"\n",
"# Serialise each fitted classifier; the byte strings stay available under their original names.\n",
"# NOTE: rf_clf was fitted on the columns kept by sfm, so new data has to go through\n",
"# sfm.transform() before it is passed to the unpickled random forest.\n",
"logistic_model = pickle.dumps(lr_clf)\n",
"knn_model = pickle.dumps(knn_clf)\n",
"randonForest_model = pickle.dumps(rf_clf)\n",
"\n",
"# pickle.dumps() only returns bytes in memory, so write them to disk as well;\n",
"# otherwise nothing is actually saved once the kernel stops.\n",
"# Only unpickle these files from a trusted location: pickle.load can execute arbitrary code.\n",
"for model_file_name, model_bytes in [\n",
"    ('logistic_model.pkl', logistic_model),\n",
"    ('knn_model.pkl', knn_model),\n",
"    ('random_forest_model.pkl', randonForest_model),\n",
"]:\n",
"    with open(model_file_name, 'wb') as model_file:\n",
"        model_file.write(model_bytes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment