Created
July 6, 2022 12:50
-
-
Save rhiskey/5297b5ce38effed8ddf20ab1c599dd67 to your computer and use it in GitHub Desktop.
Seminarus_Article_recommender.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Seminarus_Article_recommender.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyM8cghO/jqdFlJ1wp7wUK5G", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/rhiskey/5297b5ce38effed8ddf20ab1c599dd67/seminarus_article_recommender.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Рекомендательная система статей для сайта\n", | |
"Задачи:\n", | |
"1. Понять содержание статьи\n", | |
"2. Сопоставить содержание со всеми другими статьями\n", | |
"3. Рекомендовать наиболее подходящие статьи к статье, которую читатель сейчас читает\n", | |
"\n", | |
"**Метод:** косинусное подобие между TF-IDF-векторами статей\n", | |
"\n", | |
"## Загрузка данных" | |
], | |
"metadata": { | |
"id": "Zkr5zfY4UbGX" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn.feature_extraction import text\n", | |
"from sklearn.metrics.pairwise import cosine_similarity\n", | |
"\n", | |
"ARTICLES_URL = \"https://gist.githubusercontent.com/rhiskey/3f8b7bf17f44567237a8f852b5e8c01e/raw/1dea35a323f9c28b568de7cdec8b9ff5bb538a8b/cyberkoala_articles.csv\"\n", | |
"\n", | |
"# Download the CSV of articles and preview its first rows.\n", | |
"data = pd.read_csv(ARTICLES_URL)\n", | |
"data.head()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 206 | |
}, | |
"id": "33BCmY8dVMS1", | |
"outputId": "8c4c52d6-47cb-48b3-d71c-4caf4caa15e8" | |
}, | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" Article \\\n", | |
"0 Впусти Мстителей в свой дом! Рассмотрим как ра... \n", | |
"1 В чем секрет выбора функции активации?Несмотря... \n", | |
"2 Геометрическое глубокое обучение (Geometric De... \n", | |
"3 Кластеризация музыкальных жанров в Spotify. В ... \n", | |
"4 Попытка объединить способность рекуррентной не... \n", | |
"\n", | |
" Title \n", | |
"0 Распознавание лиц для умного дома \n", | |
"1 Тайна постоянно растущего числа функций активации \n", | |
"2 Геометрические интерпретации глубокого обучения \n", | |
"3 Кластеризация музыкальных жанров в Spotify \n", | |
"4 Архитектура модели Wikinet. Игра по гиперссылкам " | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-6953dac8-25e8-4543-9aa7-cc7a1bae2412\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Article</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Впусти Мстителей в свой дом! Рассмотрим как ра...</td>\n", | |
" <td>Распознавание лиц для умного дома</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>В чем секрет выбора функции активации?Несмотря...</td>\n", | |
" <td>Тайна постоянно растущего числа функций активации</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Геометрическое глубокое обучение (Geometric De...</td>\n", | |
" <td>Геометрические интерпретации глубокого обучения</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Кластеризация музыкальных жанров в Spotify. В ...</td>\n", | |
" <td>Кластеризация музыкальных жанров в Spotify</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Попытка объединить способность рекуррентной не...</td>\n", | |
" <td>Архитектура модели Wikinet. Игра по гиперссылкам</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-6953dac8-25e8-4543-9aa7-cc7a1bae2412')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-6953dac8-25e8-4543-9aa7-cc7a1bae2412 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-6953dac8-25e8-4543-9aa7-cc7a1bae2412');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 1 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Косинусное подобие" | |
], | |
"metadata": { | |
"id": "t4UbtDZsecVh" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Number of recommendations stored for each article.\n", | |
"TOP_N = 4\n", | |
"\n", | |
"articles = data[\"Article\"].tolist()\n", | |
"\n", | |
"# `input` only accepts 'filename', 'file' or 'content' (the default, used for raw\n", | |
"# strings); the corpus itself goes to fit_transform, not to the constructor.\n", | |
"uni_tfidf = text.TfidfVectorizer()\n", | |
"uni_matrix = uni_tfidf.fit_transform(articles)\n", | |
"\n", | |
"# Pairwise matrix: uni_sim[i][j] is the cosine similarity of articles i and j.\n", | |
"uni_sim = cosine_similarity(uni_matrix)\n", | |
"\n", | |
"\n", | |
"def recommend_articles(x, self_index=None, top_n=TOP_N):\n", | |
"    \"\"\"Return the titles of the articles most similar to one article.\n", | |
"\n", | |
"    Args:\n", | |
"        x: 1-D sequence of similarity scores between one article and every\n", | |
"            article in `data`, in row order.\n", | |
"        self_index: Row position of the article itself, excluded from the\n", | |
"            result. When None, the single highest-scoring entry is assumed\n", | |
"            to be the article itself and is dropped instead.\n", | |
"        top_n: Maximum number of titles to return.\n", | |
"\n", | |
"    Returns:\n", | |
"        The selected titles joined with \", \", ordered from least to most\n", | |
"        similar; an empty string when nothing is selected.\n", | |
"    \"\"\"\n", | |
"    order = np.argsort(x)  # row positions, ascending by similarity\n", | |
"    if self_index is None:\n", | |
"        order = order[:-1]\n", | |
"    else:\n", | |
"        # Drop the article by position so ties or duplicate texts cannot\n", | |
"        # push it into its own recommendations.\n", | |
"        order = order[order != self_index]\n", | |
"    # order[-0:] would select everything, so guard non-positive top_n.\n", | |
"    best = order[-top_n:] if top_n > 0 else order[:0]\n", | |
"    # argsort yields positions, not index labels, hence .iloc.\n", | |
"    return \", \".join(data[\"Title\"].iloc[best])\n", | |
"\n", | |
"\n", | |
"data[\"Recommended Articles\"] = [\n", | |
"    recommend_articles(x, self_index=i) for i, x in enumerate(uni_sim)\n", | |
"]\n", | |
"data.head()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 206 | |
}, | |
"id": "mLJ5_JRPXq2j", | |
"outputId": "0740cb96-b2f7-4b31-d128-09ab48edd6d7" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" Article \\\n", | |
"0 Впусти Мстителей в свой дом! Рассмотрим как ра... \n", | |
"1 В чем секрет выбора функции активации?Несмотря... \n", | |
"2 Геометрическое глубокое обучение (Geometric De... \n", | |
"3 Кластеризация музыкальных жанров в Spotify. В ... \n", | |
"4 Попытка объединить способность рекуррентной не... \n", | |
"\n", | |
" Title \\\n", | |
"0 Распознавание лиц для умного дома \n", | |
"1 Тайна постоянно растущего числа функций активации \n", | |
"2 Геометрические интерпретации глубокого обучения \n", | |
"3 Кластеризация музыкальных жанров в Spotify \n", | |
"4 Архитектура модели Wikinet. Игра по гиперссылкам \n", | |
"\n", | |
" Recommended Articles \n", | |
"0 Кластеризация музыкальных жанров в Spotify, Ар... \n", | |
"1 Архитектура модели Wikinet. Игра по гиперссылк... \n", | |
"2 Архитектура модели Wikinet. Игра по гиперссылк... \n", | |
"3 Геометрические интерпретации глубокого обучени... \n", | |
"4 Кластеризация музыкальных жанров в Spotify, \"... " | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-80488245-07ee-4eb7-aaee-ffe95e97e539\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Article</th>\n", | |
" <th>Title</th>\n", | |
" <th>Recommended Articles</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Впусти Мстителей в свой дом! Рассмотрим как ра...</td>\n", | |
" <td>Распознавание лиц для умного дома</td>\n", | |
" <td>Кластеризация музыкальных жанров в Spotify, Ар...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>В чем секрет выбора функции активации?Несмотря...</td>\n", | |
" <td>Тайна постоянно растущего числа функций активации</td>\n", | |
" <td>Архитектура модели Wikinet. Игра по гиперссылк...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Геометрическое глубокое обучение (Geometric De...</td>\n", | |
" <td>Геометрические интерпретации глубокого обучения</td>\n", | |
" <td>Архитектура модели Wikinet. Игра по гиперссылк...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Кластеризация музыкальных жанров в Spotify. В ...</td>\n", | |
" <td>Кластеризация музыкальных жанров в Spotify</td>\n", | |
" <td>Геометрические интерпретации глубокого обучени...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Попытка объединить способность рекуррентной не...</td>\n", | |
" <td>Архитектура модели Wikinet. Игра по гиперссылкам</td>\n", | |
" <td>Кластеризация музыкальных жанров в Spotify, \"...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-80488245-07ee-4eb7-aaee-ffe95e97e539')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-80488245-07ee-4eb7-aaee-ffe95e97e539 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-80488245-07ee-4eb7-aaee-ffe95e97e539');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 3 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Recommendation string for the row labelled 4.\n", | |
"data.loc[4, \"Recommended Articles\"]" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"id": "_4r1mP9xcQ6U", | |
"outputId": "938c6430-6149-4715-c913-25c3f81fa372" | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'Кластеризация музыкальных жанров в Spotify, \"Кластеризация рекомендательная система\", Геометрические интерпретации глубокого обучения, Распознавание лиц для умного дома'" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 4 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment