Pros:
- Easy to interpret and understand
- Does not assume the distribution of classes
- Can be extended to multi-class problems
- Runs quickly
- Model coefficients can be used as indicators of feature importance
- Overfitting can be minimized via regularization of the C parameter (see the sketch after this list)

Cons:
- Sensitive to the input data range; requires normalization/standardization
- Has a linear decision surface, so it cannot be used for non-linear problems
- Requires that there be no multicollinearity among the input features
- Cannot be used to understand complex relationships between variables
- Overfitting can occur if the number of features is larger than the dataset size
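The regularization point is easiest to see in code. The sketch below is illustrative only: it assumes scikit-learn's LogisticRegression on the iris data, and the C values are arbitrary examples (smaller C means stronger L2 regularization).

# Illustrative sketch (not from the gists below): the effect of the C parameter.
# Smaller C = stronger L2 regularization in scikit-learn's LogisticRegression.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
for C in (0.01, 1.0, 100.0):                      # arbitrary example values
    clf = LogisticRegression(C=C, max_iter=1000)
    scores = cross_val_score(clf, X, y, cv=5)     # 5-fold cross-validated accuracy
    print(f"C={C}: mean CV accuracy = {scores.mean():.3f}")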
shedoesdatascience / iris_logistic_regression.py
Last active April 14, 2021 02:34
Logistic regression applied to iris dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc
# from sklearn.metrics import plot_precision_recall_curve  # removed in newer scikit-learn

# Load the iris dataset; X holds the four flower measurements
iris = datasets.load_iris()
X = iris.data
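The preview above stops after loading X. Below is a hedged sketch of how the fit and evaluation could continue; the split ratio, random_state and max_iter are assumptions, not values taken from the original gist.

# Assumed continuation (not shown in the gist preview): split, fit, evaluate
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=iris.target_names))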
shedoesdatascience / iris_roc.py
Created April 13, 2021 05:09
ROC curve on Iris dataset
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Compute ROC curve and area under the curve.
# probas_ is assumed to be the output of a fitted classifier's predict_proba(X_test);
# column 1 holds the positive-class probabilities (binary target).
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# Plot ROC curve
plt.clf()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
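The preview does not show where y_test and probas_ come from, or how the figure is finished. The self-contained sketch below assumes a two-class subset of iris and a logistic regression classifier; none of these choices are confirmed by the original gist.

# Assumed setup (not from the original gist): binary subset of iris, logistic regression
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

iris = datasets.load_iris()
mask = iris.target != 2                          # keep classes 0 and 1 only
X, y = iris.data[mask], iris.target[mask]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

probas_ = LogisticRegression(max_iter=1000).fit(X_train, y_train).predict_proba(X_test)

fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='Chance')  # diagonal reference line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()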
shedoesdatascience / iris_knn.py
Created April 9, 2021 00:45
k-nearest neighbours for iris dataset
import numpy as np                      # linear algebra
import pandas as pd                     # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import confusion_matrix, precision_recall_curve
# from sklearn.metrics import plot_precision_recall_curve  # removed in newer scikit-learn

# Load the iris dataset: X holds the four measurements, y the species labels
iris = datasets.load_iris()
X = iris.data
y = iris.target
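The k-NN preview also stops after loading the data. Below is a hedged sketch of a typical continuation, consistent with the imports above; k=5, the split ratio and random_state are assumptions, not values from the original gist.

# Assumed continuation (not shown in the gist preview): scale, fit k-NN, evaluate
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on training data only
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)        # k=5 is an illustrative default
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))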
+------------+----------------+----------------+
|            | 1 (Predicted)  | 0 (Predicted)  |
+------------+----------------+----------------+
| 1 (Actual) | True Positive  | False Negative |
| 0 (Actual) | False Positive | True Negative  |
+------------+----------------+----------------+
shedoesdatascience / confusion_matrix_covid-test.txt
Last active October 16, 2020 03:12
confusion_matrix_covid-test
+------------------+---------------------+---------------------+-------+
| n=165            | Predicted: Negative | Predicted: Positive | Total |
+------------------+---------------------+---------------------+-------+
| Actual: Negative | TN = 50             | FP = 10             | 60    |
| Actual: Positive | FN = 5              | TP = 100            | 105   |
| Total            | 55                  | 110                 | 165   |
+------------------+---------------------+---------------------+-------+
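The usual diagnostic metrics follow directly from these counts; a quick check in Python (the formulas are the standard definitions, not something stated in the gist).

# Metrics computed from the confusion matrix above (TN=50, FP=10, FN=5, TP=100, n=165)
TN, FP, FN, TP = 50, 10, 5, 100

accuracy    = (TP + TN) / (TP + TN + FP + FN)   # 150/165 ~ 0.909
sensitivity = TP / (TP + FN)                    # recall: 100/105 ~ 0.952
specificity = TN / (TN + FP)                    # 50/60   ~ 0.833
precision   = TP / (TP + FP)                    # 100/110 ~ 0.909

print(f"accuracy={accuracy:.3f}, sensitivity={sensitivity:.3f}, "
      f"specificity={specificity:.3f}, precision={precision:.3f}")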
##### Twitter Analysis #####
#### 1. Load packages ####
# Install everything that is attached below (rtweet, dplyr and tidyr added so the
# library() calls succeed), then load the packages.
Needed <- c("twitteR", "SentimentAnalysis", "quanteda", "tm", "EGAnet",
            "tidytext", "wordcloud", "rtweet", "dplyr", "tidyr")
install.packages(Needed, dependencies = TRUE)
library(rtweet)
library(twitteR)
library(dplyr)
library(tidyr)
library(tidytext)
+------------------+---------+----------+
| clicked_on_email | mean    | sd       |
+------------------+---------+----------+
| 0                | $37,392 | $74,210  |
| 1                | $43,615 | $76,403  |
+------------------+---------+----------+