Skip to content

Instantly share code, notes, and snippets.

@mananai
Created November 4, 2020 12:25
Show Gist options
  • Save mananai/03e446675b2729e52e3bce5e90a40c47 to your computer and use it in GitHub Desktop.
Save mananai/03e446675b2729e52e3bce5e90a40c47 to your computer and use it in GitHub Desktop.
User classifier full code
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.7.7\n"
]
}
],
"source": [
"from platform import python_version\n",
"\n",
"# Print the Python version of the running kernel.\n",
"print(python_version())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<sqlite3.Cursor at 0x12199dbb20>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pathlib import Path\n",
"\n",
"# Both SQLite files are expected under ~/sqlitedb/.\n",
"db_dir = str(Path.home()) + '/sqlitedb/'\n",
"classifier_file = db_dir + 'TwitterUserClassifier.db'\n",
"friend_file = db_dir + 'TwitterFriends.db'\n",
"\n",
"# The classifier database is the main connection; the friends database is attached\n",
"# under the schema name `friends` so one query can join tables from both files.\n",
"conn = sqlite3.connect(classifier_file)\n",
"# Bind the file name as a parameter instead of splicing it into the SQL text, so a\n",
"# path that contains quote characters cannot break or alter the ATTACH statement.\n",
"conn.execute('ATTACH DATABASE ? AS friends', (friend_file,))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"User class1 shape: \n",
"(3038, 2)\n",
"User class0 shape: \n",
"(5261, 2)\n"
]
}
],
"source": [
"# Labelled user ids: only users owning at least one non-null friend row in the attached\n",
"# friends database are kept, and SQLite returns them in random order.\n",
"labelled_user_sql = ('select distinct u.id, {label} class '\n",
"                     'from {table} u '\n",
"                     'join friends.user_friend uf '\n",
"                     'on u.id=uf.user_id and uf.friend_id is not null '\n",
"                     'order by random();')\n",
"\n",
"dfUserClass1 = pd.read_sql_query(labelled_user_sql.format(label=1, table='user_class1'), conn)\n",
"dfUserClass0 = pd.read_sql_query(labelled_user_sql.format(label=0, table='user_class0'), conn)\n",
"\n",
"print(\"User class1 shape: \")\n",
"print(dfUserClass1.shape)\n",
"print(\"User class0 shape: \")\n",
"print(dfUserClass0.shape)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Cap the classes at 3000 (class 1) and 5000 (class 0) rows, stack them, then\n",
"# shuffle all rows so the two classes are interleaved.\n",
"dfUserClass1 = dfUserClass1.iloc[:3000]\n",
"dfUserClass0 = dfUserClass0.iloc[:5000]\n",
"dfUserClassAll = pd.concat([dfUserClass1, dfUserClass0]).sample(frac=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"User class all shape: \n",
"(8000, 2)\n",
"Data Set Train shape: \n",
"(5600, 2)\n",
"Data Set Test shape: \n",
"(2400, 2)\n"
]
}
],
"source": [
"# The first 70% of the shuffled rows form the training set; the rest is held out.\n",
"splitPos = int(len(dfUserClassAll) * 0.70)\n",
"dfTrain = dfUserClassAll.iloc[:splitPos]\n",
"dfTest = dfUserClassAll.iloc[splitPos:]\n",
"\n",
"print(\"User class all shape: \")\n",
"print(dfUserClassAll.shape)\n",
"print(\"Data Set Train shape: \")\n",
"print(dfTrain.shape)\n",
"print(\"Data Set Test shape: \")\n",
"print(dfTest.shape)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 3417929 entries, 0 to 3417928\n",
"Data columns (total 2 columns):\n",
" # Column Dtype\n",
"--- ------ -----\n",
" 0 user_id int64\n",
" 1 friend_id int64\n",
"dtypes: int64(2)\n",
"memory usage: 52.2 MB\n"
]
}
],
"source": [
"# Every (user_id, friend_id) pair of the attached friends database whose friend id is set.\n",
"dfUserFriend = pd.read_sql_query(\n",
"    sql='select user_id, friend_id '\n",
"        'from friends.user_friend '\n",
"        'where friend_id is not null;',\n",
"    con=conn)\n",
"dfUserFriend.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 709151 entries, 0 to 709150\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 709151 non-null int64 \n",
" 1 screen_name 709151 non-null object\n",
" 2 name 709151 non-null object\n",
" 3 followers_count 709151 non-null int64 \n",
"dtypes: int64(2), object(2)\n",
"memory usage: 21.6+ MB\n"
]
}
],
"source": [
"# Profile fields (id, screen name, display name, follower count) of every row in friends.user.\n",
"dfAllUsers = pd.read_sql_query('select u.id, u.screen_name, u.name, u.followers_count '\n",
"                               'from friends.user u;', conn)\n",
"# `id` stays an ordinary column. DataFrame.set_index returns a new frame, so a bare\n",
"# `dfAllUsers.set_index(['id'])` statement would only build and discard a copy.\n",
"dfAllUsers.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def buildTopUser(_df_user_friend, _df_user, _numTopUsers):\n",
"    \"\"\"Rank friend ids by the number of given users that list them as a friend.\n",
"\n",
"    Rows of ``_df_user_friend`` whose ``user_id`` occurs in ``_df_user['id']`` are kept,\n",
"    counted per ``friend_id`` and ordered by that count, largest first.\n",
"\n",
"    Returns a DataFrame with the columns ``friend_id`` and ``count`` holding the first\n",
"    ``_numTopUsers`` rows of that ranking.\n",
"    \"\"\"\n",
"    _owned_by_users = _df_user_friend['user_id'].isin(_df_user['id'])\n",
"    _counts = (_df_user_friend[_owned_by_users]\n",
"               .groupby(['friend_id'])['user_id']\n",
"               .count()\n",
"               .reset_index(name='count'))\n",
"    _ranked = _counts.sort_values(['count'], ascending=False)\n",
"    return _ranked.iloc[:_numTopUsers]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def buildFeatureVectors(_df_user_friend, _df_user, _df_top_user):\n",
"    \"\"\"Build one feature row per user, counting its links to each top friend id.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    _df_user_friend : DataFrame with the columns ``user_id`` and ``friend_id``.\n",
"    _df_user : DataFrame with an ``id`` column; its columns are appended to the result.\n",
"    _df_top_user : DataFrame with a ``friend_id`` column naming the feature columns.\n",
"\n",
"    Returns\n",
"    -------\n",
"    DataFrame with one column per distinct ``_df_top_user['friend_id']`` (ascending),\n",
"    followed by the columns of ``_df_user``. The merge is an inner join, so a user\n",
"    without any link to a top friend has no row.\n",
"    \"\"\"\n",
"    _top_ids = pd.Index(_df_top_user['friend_id']).unique().sort_values()\n",
"    _wanted = (_df_user_friend['user_id'].isin(_df_user['id'])\n",
"               & _df_user_friend['friend_id'].isin(_top_ids))\n",
"    _edges = _df_user_friend[_wanted]\n",
"\n",
"    _feature_vectors = pd.crosstab(index=_edges['user_id'], columns=_edges['friend_id'], dropna=False)\n",
"    # crosstab only emits columns for friend ids present in this user subset. Reindexing to\n",
"    # the full top-friend list (zero-filled) gives every call the same feature columns, so a\n",
"    # train and a test frame built from the same _df_top_user always line up.\n",
"    _feature_vectors = _feature_vectors.reindex(columns=_top_ids, fill_value=0)\n",
"    _feature_vectors = pd.merge(_feature_vectors, _df_user, left_on='user_id', right_on='id')\n",
"    return _feature_vectors"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"from sklearn.metrics import roc_auc_score"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"numTopUser = 500\n",
"# The top friends are ranked on the training users only.\n",
"dfTopUser = buildTopUser(dfUserFriend, dfTrain, numTopUser)\n",
"# Fewer than the requested number of friends may exist; keep the actual count.\n",
"numTopUser = dfTopUser.shape[0]\n",
"feature_vectors_train = buildFeatureVectors(dfUserFriend, dfTrain, dfTopUser)\n",
"feature_vectors_test = buildFeatureVectors(dfUserFriend, dfTest, dfTopUser)\n",
"# Give the test frame exactly the training columns, in the same order. reindex zero-fills a\n",
"# friend column that no test user has, where plain column selection would raise KeyError.\n",
"feature_vectors_test = feature_vectors_test.reindex(columns=feature_vectors_train.columns, fill_value=0)\n",
"# Pick features and label by column name rather than by position, so the split does not\n",
"# depend on how many friend columns the crosstab produced.\n",
"X_train = feature_vectors_train.drop(columns=['id', 'class']).values\n",
"y_train = feature_vectors_train['class'].values\n",
"X_test = feature_vectors_test.drop(columns=['id', 'class']).values\n",
"y_test = feature_vectors_test['class'].values"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier()"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Random forest with the library's default settings, fitted on the training matrix.\n",
"# NOTE(review): no random_state is passed, so repeated fits can differ; confirm that is acceptable.\n",
"classifier = RandomForestClassifier()\n",
"classifier.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ROC AUC: 0.9346447723041871\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.88 0.96 0.92 1395\n",
" 1 0.92 0.79 0.85 903\n",
"\n",
" accuracy 0.89 2298\n",
" macro avg 0.90 0.87 0.88 2298\n",
"weighted avg 0.89 0.89 0.89 2298\n",
"\n",
"Accuracy: 0.8920800696257616\n",
"Confusion Matrix:\n",
"[[1336 59]\n",
" [ 189 714]]\n"
]
}
],
"source": [
"# Column 1 of predict_proba feeds the ROC AUC; the hard predictions feed the other metrics.\n",
"y_prob = classifier.predict_proba(X_test)[:, 1]\n",
"y_pred = classifier.predict(X_test)\n",
"\n",
"print(\"ROC AUC: \", roc_auc_score(y_test, y_prob))\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred))\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"print(\"Confusion Matrix:\")\n",
"print(confusion_matrix(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature</th>\n",
" <th>importance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>496</th>\n",
" <td>1301760719906512896</td>\n",
" <td>0.066097</td>\n",
" </tr>\n",
" <tr>\n",
" <th>202</th>\n",
" <td>199992361</td>\n",
" <td>0.051649</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>92531902</td>\n",
" <td>0.040005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>50916567</td>\n",
" <td>0.039931</td>\n",
" </tr>\n",
" <tr>\n",
" <th>487</th>\n",
" <td>1269106344528736256</td>\n",
" <td>0.028700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>278</th>\n",
" <td>953670151</td>\n",
" <td>0.000104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>78871747</td>\n",
" <td>0.000100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>3968042414</td>\n",
" <td>0.000100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>206</th>\n",
" <td>213612447</td>\n",
" <td>0.000078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>830434249</td>\n",
" <td>0.000030</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>500 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" feature importance\n",
"496 1301760719906512896 0.066097\n",
"202 199992361 0.051649\n",
"142 92531902 0.040005\n",
"69 50916567 0.039931\n",
"487 1269106344528736256 0.028700\n",
".. ... ...\n",
"278 953670151 0.000104\n",
"125 78871747 0.000100\n",
"348 3968042414 0.000100\n",
"206 213612447 0.000078\n",
"270 830434249 0.000030\n",
"\n",
"[500 rows x 2 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pair every feature column (all but the trailing two) with its importance, highest first.\n",
"fi = (\n",
"    pd.DataFrame({'feature': list(feature_vectors_train.columns[:-2]),\n",
"                  'importance': classifier.feature_importances_})\n",
"    .sort_values('importance', ascending=False)\n",
")\n",
"fi"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x1226b1f110>"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1008x504 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"# Bar chart of the first 40 rows of fi, which is sorted by importance in descending order.\n",
"fi.head(40).plot.bar(x='feature', y='importance', figsize=(14, 7))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment