Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save MachineLearningIsEasy/31ac2499083f7952e908da6a40a2329b to your computer and use it in GitHub Desktop.
Save MachineLearningIsEasy/31ac2499083f7952e908da6a40a2329b to your computer and use it in GitHub Desktop.
NLP grid search, feature selection, grid search
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "3 Тюнинг модели отбор фичей.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "5K1WlUmN6pJN"
},
"source": [
"![logo.png]()\n",
"\n",
"[перейти](https://www.bigdataschool.ru/)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vI0rSTxNfMXu",
"outputId": "457c44b1-a5c3-4189-ea73-1a12138e6117",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 122
}
},
"source": [
"!pip install pymorphy2\n",
"import pandas as pd\n",
"import numpy as np\n",
"import nltk\n",
"import re\n",
"import csv\n",
"from nltk.stem import WordNetLemmatizer\n",
"import sklearn\n",
"import codecs\n",
"import pymorphy2\n",
"import seaborn as sns\n",
"sns.set_style(\"darkgrid\")\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: pymorphy2 in /usr/local/lib/python3.6/dist-packages (0.9.1)\n",
"Requirement already satisfied: dawg-python>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from pymorphy2) (0.7.2)\n",
"Requirement already satisfied: docopt>=0.6 in /usr/local/lib/python3.6/dist-packages (from pymorphy2) (0.6.2)\n",
"Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in /usr/local/lib/python3.6/dist-packages (from pymorphy2) (2.4.417127.4579844)\n",
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LBF3ntXC5TWG",
"outputId": "c128d7ff-7e77-464c-cb02-ed770f628981",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
}
},
"source": [
"import nltk\n",
"nltk.download('stopwords')"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NrdXlozI5Hev"
},
"source": [
"### Функции"
]
},
{
"cell_type": "code",
"metadata": {
"id": "dRIUfW88fI6A"
},
"source": [
"from nltk.corpus import stopwords\n",
"stopWords = set(stopwords.words('russian'))\n",
"\n",
"def csv_to_list(arr):\n",
" arr_list = []\n",
" for row in arr:\n",
" arr_list.append(list_to_str(row))\n",
" return arr_list\n",
"\n",
"def list_to_str(arr):\n",
" str_ = ''\n",
" for rec in arr:\n",
" str_+=rec\n",
" return str_\n",
"\n",
"def df_preprocess(text): \n",
" reg = re.compile('[^а-яА-яa-zA-Z0-9 ]') #\n",
" text = text.lower().replace(\"ё\", \"е\")\n",
" text = text.replace(\"ъ\", \"ь\")\n",
" text = text.replace(\"й\", \"и\")\n",
" text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', 'сайт', text)\n",
" text = re.sub('@[^\\s]+', 'пользователь', text)\n",
" text = reg.sub(' ', text)\n",
" \n",
" # Лемматизация\n",
" #morph = pymorphy2.MorphAnalyzer()\n",
" #text =[morph.parse(word)[0].normal_form for word in text.split()]\n",
"\n",
" # Стемминг\n",
" # stemmer = SnowballStemmer(\"russian\")\n",
" # text =[stemmer.stem(word) for word in text.split()]\n",
"\n",
" # Стемминг + удаление стоп слов\n",
" stemmer = SnowballStemmer(\"russian\")\n",
" #text =[stemmer.stem(word) for word in text.split() if word not in stopWords]\n",
" text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stopWords])\n",
"\n",
" return text"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0dCTRpn55ONd"
},
"source": [
"### Считываем данные\n",
"\n",
"Используем корпус с сайта https://study.mokoron.com/#download"
]
},
{
"cell_type": "code",
"metadata": {
"id": "87Chu_tOYxW2"
},
"source": [
"positive_recalls = csv_to_list(csv.reader(codecs.open('/content/drive/My Drive/Colab Notebooks/NLP/positive_recalls.csv', 'rU', 'utf-8', errors='ignore')))\n",
"negative_recalls = csv_to_list(csv.reader(codecs.open('/content/drive/My Drive/Colab Notebooks/NLP/negative_recalls.csv', 'rU', 'utf-8', errors='ignore')))"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "wF5HZPIb957U"
},
"source": [
"### Формируем датасет "
]
},
{
"cell_type": "code",
"metadata": {
"id": "KjrfTaD_7wo1",
"outputId": "6fb12b40-5ed6-4ffc-8244-fb73821a8ae7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"df_positive_recalls = pd.DataFrame(positive_recalls, columns=['recall'])\n",
"df_positive_recalls['type']=1\n",
"df_positive_recalls.head()"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>recall</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>@first_timee хоть я и школота но поверь у нас ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Да все-таки он немного похож на него. Но мой м...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>RT @KatiaCheh: Ну ты идиотка) я испугалась за ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>RT @digger2912: \"Кто то в углу сидит и погибае...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>@irina_dyshkant Вот что значит страшилка :D\\nН...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" recall type\n",
"0 @first_timee хоть я и школота но поверь у нас ... 1\n",
"1 Да все-таки он немного похож на него. Но мой м... 1\n",
"2 RT @KatiaCheh: Ну ты идиотка) я испугалась за ... 1\n",
"3 RT @digger2912: \"Кто то в углу сидит и погибае... 1\n",
"4 @irina_dyshkant Вот что значит страшилка :D\\nН... 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "C3oNsEsg-2Yb",
"outputId": "e350344a-d31c-415e-c3af-117fe157bd91",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"df_negative_recalls = pd.DataFrame(negative_recalls, columns=['recall'])\n",
"df_negative_recalls['type']=0\n",
"df_negative_recalls.head()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>recall</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>на работе был полный пиддес :| и так каждое за...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Коллеги сидят рубятся в Urban terror а я из-за...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>@elina_4post как говорят обещаного три года жд...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Желаю хорошего полёта и удачной посадкия буду ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Обновил за каким-то лешим surf теперь не работ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" recall type\n",
"0 на работе был полный пиддес :| и так каждое за... 0\n",
"1 Коллеги сидят рубятся в Urban terror а я из-за... 0\n",
"2 @elina_4post как говорят обещаного три года жд... 0\n",
"3 Желаю хорошего полёта и удачной посадкия буду ... 0\n",
"4 Обновил за каким-то лешим surf теперь не работ... 0"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "2H5O4MG1-8uT",
"outputId": "371c897f-d089-4c7d-eef5-09349c967ae6",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"# Объединяем два датафрейма вместе\n",
"df_recalls = pd.concat((df_negative_recalls, df_positive_recalls),axis = 0).sample(frac = 1.0) # объединяем и перемешиваем\n",
"df_recalls.index = range(0,len(df_recalls))\n",
"df_recalls.head()"
],
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>recall</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAAW!!! МНЕ НРАВИТСЯ МОЙ ГОЛОС СЕЙЧАС! НЕМНОГО...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>@krissmurr аххаха;D\\nОбрадовала блин меня;(</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>@Doll_perian @RastaPushka в четверг ему будет ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>@JulikaKing у меня с телефона норм, а вот с ко...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>RT @locorti69: Как романтично писал Вольтер да...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" recall type\n",
"0 AAAW!!! МНЕ НРАВИТСЯ МОЙ ГОЛОС СЕЙЧАС! НЕМНОГО... 0\n",
"1 @krissmurr аххаха;D\\nОбрадовала блин меня;( 0\n",
"2 @Doll_perian @RastaPushka в четверг ему будет ... 1\n",
"3 @JulikaKing у меня с телефона норм, а вот с ко... 0\n",
"4 RT @locorti69: Как романтично писал Вольтер да... 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-mTrgM0E_eAW"
},
"source": [
"### Очитска текста приведение слов к стандартному виду"
]
},
{
"cell_type": "code",
"metadata": {
"id": "gHI2wqsn_kMB",
"outputId": "fe638ba9-79dc-44c6-f82b-c75138c71e0b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"source": [
"%time df_recalls['recall'] = df_recalls['recall'].apply(df_preprocess)"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"CPU times: user 1min 40s, sys: 107 ms, total: 1min 40s\n",
"Wall time: 1min 40s\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Kca4OwK8BRPt",
"outputId": "367707c4-9ee6-451a-e262-1318d874ebc7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"df_recalls.head()"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>recall</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>aaaw нрав мо голос сеичас немн хриплы кароч вл...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>пользовател аххах d обрадова блин</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>пользовател пользовател четверг некогд</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>пользовател телефон норм комп ел</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>rt пользовател романтичн писа вольтер дам сво ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" recall type\n",
"0 aaaw нрав мо голос сеичас немн хриплы кароч вл... 0\n",
"1 пользовател аххах d обрадова блин 0\n",
"2 пользовател пользовател четверг некогд 1\n",
"3 пользовател телефон норм комп ел 0\n",
"4 rt пользовател романтичн писа вольтер дам сво ... 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "O2NIMMwd_Cpi"
},
"source": [
"### Train/test split"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_kkes5A7_CAl"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(df_recalls['recall'], df_recalls['type'], test_size=.15, random_state=42)\n"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZBy6f8q6-xzv"
},
"source": [
"### Формируем векторное представление (BOW)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "oeXBgN5q9gH5"
},
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score\n"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hDFo5A8wcinG",
"outputId": "4187c6fc-d53f-4562-8439-5e4372b56b6d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 207
}
},
"source": [
"#-----------------------------------------------\n",
"vectorizer = CountVectorizer(ngram_range=(1, 2))\n",
"#-----------------------------------------------\n",
"X_train_BOW_bi = vectorizer.fit_transform(X_train)\n",
"X_test_BOW_bi = vectorizer.transform(X_test)\n",
"#-----------------------------------------------\n",
"print(X_train_BOW_bi.shape, X_test_BOW_bi.shape)\n",
"#-----------------------------------------------\n",
"clf = LogisticRegression(random_state=0).fit(X_train_BOW_bi, y_train)\n",
"#-----------------------------------------------\n",
"y_predict_BOW_bi = clf.predict(X_test_BOW_bi)\n",
"#-----------------------------------------------\n",
"accuracy_score(y_predict_BOW_bi, y_test)"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"(192808, 893869) (34026, 893869)\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.753071180861694"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YnHStNfuBBEG"
},
"source": [
"### Подбор гиперпараметров / отбор фичей / grid search / cross-validation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rGXGKrGNUps_"
},
"source": [
"#### Отбор фичей"
]
},
{
"cell_type": "code",
"metadata": {
"id": "CVdqq4pdBLn6",
"outputId": "e25b0869-7981-43e4-c398-fbd9a67db487",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 139
}
},
"source": [
"from sklearn.svm import LinearSVC\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"lsvc = LinearSVC(C = .5) # C = 0.5\n",
"selective_model = SelectFromModel(lsvc, max_features = None)\n",
"\n",
"X_train_BOW_bi_select_features = selective_model.fit_transform(X_train_BOW_bi,y_train)\n",
"X_test_BOW_bi_select_features = selective_model.transform(X_test_BOW_bi)\n",
"print('\\nNew shapes: ', X_train_BOW_bi.shape, X_test_BOW_bi.shape)\n",
"print('\\nNew shapes: ', X_train_BOW_bi_select_features.shape, X_test_BOW_bi_select_features.shape)\n"
],
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": [
"\n",
"New shapes: (192808, 893869) (34026, 893869)\n",
"\n",
"New shapes: (192808, 361229) (34026, 361229)\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/svm/_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
" \"the number of iterations.\", ConvergenceWarning)\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZHzMnCWl_QwY",
"outputId": "241f134c-de6e-4089-84c8-eb2c5e46bb64",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 547
}
},
"source": [
"clf = LogisticRegression(random_state=0)\n",
"scores = cross_val_score(clf, X_train_BOW_bi_select_features, y_train, cv=3, scoring='accuracy')\n",
"print(clf,'\\n Cross-validate: ', scores)"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
" multi_class='auto', n_jobs=None, penalty='l2',\n",
" random_state=0, solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False) \n",
" Cross-validate: [0.77186868 0.76962455 0.77066704]\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0MlQnqAHVve3",
"outputId": "829e889e-26bb-4775-8b95-f42067590cd8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 190
}
},
"source": [
"#-----------------------------------------------\n",
"clf = LogisticRegression(random_state=0).fit(X_train_BOW_bi_select_features, y_train)\n",
"#-----------------------------------------------\n",
"y_predict_BOW_bi = clf.predict(X_test_BOW_bi_select_features)\n",
"#-----------------------------------------------\n",
"accuracy_score(y_predict_BOW_bi, y_test)"
],
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.7490448480573679"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FpYQg7MBUst-"
},
"source": [
"#### GridSearch"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Z8p1o4C5I6LY"
},
"source": [
"from sklearn.model_selection import GridSearchCV"
],
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "7EI_0sL5UwaV",
"outputId": "ac6f04d3-cde4-4515-8113-6df8dec60079",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
}
},
"source": [
"grid={\n",
" \"C\":np.logspace(-3,3,4), \n",
" \"penalty\":[\"l2\"],\n",
" \"solver\":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']\n",
" }\n",
"\n",
"clf=LogisticRegression()\n",
"logreg_cv=GridSearchCV(clf,grid,cv=3)\n",
"%time logreg_cv.fit(X_train_BOW_bi_select_features, y_train)\n",
"\n",
"print(\"tuned hpyerparameters :(best parameters) \",logreg_cv.best_params_)\n",
"print(\"accuracy :\",logreg_cv.best_score_)"
],
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"CPU times: user 13min 38s, sys: 7min 25s, total: 21min 3s\n",
"Wall time: 12min 24s\n",
"tuned hpyerparameters :(best parameters) {'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}\n",
"accuracy : 0.7846717905762045\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "edmsQBeZKFsF",
"outputId": "4c1d2ff7-49d0-4d1e-9d33-ca7e5040ab73",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"logreg_cv.best_params_"
],
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cal6cdTaKBCJ",
"outputId": "0eb1cca9-80db-488f-f79c-a6022e5092ef",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"#-----------------------------------------------\n",
"clf = LogisticRegression(C=10.0, penalty='l2', solver='liblinear').fit(X_train_BOW_bi_select_features, y_train)\n",
"#-----------------------------------------------\n",
"y_predict_BOW_bi = clf.predict(X_test_BOW_bi_select_features)\n",
"#-----------------------------------------------\n",
"accuracy_score(y_predict_BOW_bi, y_test)"
],
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.7483982836654323"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment