Skip to content

Instantly share code, notes, and snippets.

@onurmatik
Last active May 25, 2023 11:13
Show Gist options
  • Save onurmatik/8fa3ed3c31904584d5531a67609580d3 to your computer and use it in GitHub Desktop.
Save onurmatik/8fa3ed3c31904584d5531a67609580d3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "796b9ea6-176a-47ef-b5e2-929be2af1374",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.cluster import DBSCAN\n",
"from scipy.spatial import distance\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "23844b25-220e-4c2e-9529-d38c8e7045dc",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('cb.csv')\n",
"\n",
"candidates = ['RTE', 'KK', 'Mİ', 'SO']\n",
"\n",
"# Candidate with the most votes in each ballot box\n",
"df['EnÇokOy'] = df[candidates].idxmax(axis='columns')\n",
"\n",
"# Convert raw vote counts into each candidate's share of the box total\n",
"df['ToplamOy'] = df[candidates].sum(axis='columns')\n",
"total_votes = df['ToplamOy']\n",
"for name in candidates:\n",
"    df[name] = df[name] / total_votes\n",
"\n",
"# Ballot boxes with zero votes end up with NaN shares (0/0); drop them.\n",
"# Note: dropna() has no subset, so a missing value in any column drops the row.\n",
"df.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3cc1668c-679c-438c-8c52-61ca0913712e",
"metadata": {},
"outputs": [],
"source": [
"# Cluster ballot boxes district by district to find deviating ones.\n",
"# The same district name can occur in more than one province, so rows are\n",
"# grouped on the (İl, İlçe) pair rather than on İlçe alone.\n",
"\n",
"DBSCAN_EPS = 0.3            # neighbourhood radius in vote-share space\n",
"DBSCAN_MIN_SAMPLES = 2      # neighbours (incl. the point itself) needed for a core point\n",
"OUTLIER_STD_FACTOR = 3      # distance threshold, in multiples of the std norm\n",
"\n",
"outlier_frames = []\n",
"\n",
"for _, district_rows in df.groupby(['İl', 'İlçe'], sort=False):\n",
"    # Copy so the helper columns below are never written into df\n",
"    district_data = district_rows.copy()\n",
"\n",
"    # Cluster the vote-share vectors; DBSCAN labels noise points -1\n",
"    clustering = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(district_data[candidates])\n",
"    district_data['cluster'] = clustering.labels_\n",
"\n",
"    # Per-row mean and standard deviation of the row's own cluster\n",
"    shares_by_cluster = district_data.groupby('cluster')[candidates]\n",
"    cluster_mean = shares_by_cluster.transform('mean').to_numpy(dtype=float)\n",
"    cluster_std = shares_by_cluster.transform('std').to_numpy(dtype=float)\n",
"\n",
"    # A row is an outlier when it is DBSCAN noise, or when its Euclidean\n",
"    # distance to the cluster mean exceeds the factor times the norm of the\n",
"    # cluster's per-candidate standard deviations\n",
"    shares = district_data[candidates].to_numpy(dtype=float)\n",
"    distance_to_mean = np.linalg.norm(shares - cluster_mean, axis=1)\n",
"    threshold = OUTLIER_STD_FACTOR * np.linalg.norm(cluster_std, axis=1)\n",
"    is_noise = district_data['cluster'].to_numpy() == -1\n",
"    district_data['is_outlier'] = is_noise | (distance_to_mean > threshold)\n",
"\n",
"    # Keep this district's outliers\n",
"    outlier_frames.append(district_data[district_data['is_outlier']])\n",
"\n",
"# Concatenate once instead of growing a frame inside the loop\n",
"outliers = pd.concat(outlier_frames) if outlier_frames else df.iloc[:0]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "711ded1a-aaa1-48dc-afcd-10fc3383d5f5",
"metadata": {},
"outputs": [],
"source": [
"# Mark the suspicious ballot boxes in the main table:\n",
"# True for rows whose index label appears among the outliers, False otherwise\n",
"\n",
"df['Şüpheli'] = df.index.isin(outliers.index)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3e8b922e-eed1-4c9d-bb7e-152f9b3ec6f7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>İl</th>\n",
" <th>İlçe</th>\n",
" <th>Sandık</th>\n",
" <th>RTE</th>\n",
" <th>KK</th>\n",
" <th>Mİ</th>\n",
" <th>SO</th>\n",
" <th>EnÇokOy</th>\n",
" <th>ToplamOy</th>\n",
" <th>Şüpheli</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>SAKARYA</td>\n",
" <td>ADAPAZARI</td>\n",
" <td>2043</td>\n",
" <td>0.363636</td>\n",
" <td>0.633229</td>\n",
" <td>0.003135</td>\n",
" <td>0.000000</td>\n",
" <td>KK</td>\n",
" <td>319</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>603</th>\n",
" <td>SAKARYA</td>\n",
" <td>AKYAZI</td>\n",
" <td>1019</td>\n",
" <td>0.473913</td>\n",
" <td>0.491304</td>\n",
" <td>0.000000</td>\n",
" <td>0.034783</td>\n",
" <td>KK</td>\n",
" <td>230</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>606</th>\n",
" <td>SAKARYA</td>\n",
" <td>AKYAZI</td>\n",
" <td>1022</td>\n",
" <td>0.310734</td>\n",
" <td>0.627119</td>\n",
" <td>0.011299</td>\n",
" <td>0.050847</td>\n",
" <td>KK</td>\n",
" <td>177</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1831</th>\n",
" <td>SAKARYA</td>\n",
" <td>KOCAALİ</td>\n",
" <td>1011</td>\n",
" <td>0.000000</td>\n",
" <td>0.315018</td>\n",
" <td>0.633700</td>\n",
" <td>0.051282</td>\n",
" <td>Mİ</td>\n",
" <td>273</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1888</th>\n",
" <td>SAKARYA</td>\n",
" <td>PAMUKOVA</td>\n",
" <td>1003</td>\n",
" <td>0.408163</td>\n",
" <td>0.530612</td>\n",
" <td>0.000000</td>\n",
" <td>0.061224</td>\n",
" <td>KK</td>\n",
" <td>49</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190856</th>\n",
" <td>TRABZON</td>\n",
" <td>ORTAHİSAR</td>\n",
" <td>2124</td>\n",
" <td>0.909722</td>\n",
" <td>0.083333</td>\n",
" <td>0.000000</td>\n",
" <td>0.006944</td>\n",
" <td>RTE</td>\n",
" <td>288</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190956</th>\n",
" <td>TRABZON</td>\n",
" <td>ORTAHİSAR</td>\n",
" <td>1074</td>\n",
" <td>0.906040</td>\n",
" <td>0.073826</td>\n",
" <td>0.000000</td>\n",
" <td>0.020134</td>\n",
" <td>RTE</td>\n",
" <td>149</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190958</th>\n",
" <td>TRABZON</td>\n",
" <td>ORTAHİSAR</td>\n",
" <td>1075</td>\n",
" <td>0.889831</td>\n",
" <td>0.067797</td>\n",
" <td>0.000000</td>\n",
" <td>0.042373</td>\n",
" <td>RTE</td>\n",
" <td>118</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>191048</th>\n",
" <td>TRABZON</td>\n",
" <td>ORTAHİSAR</td>\n",
" <td>2169</td>\n",
" <td>0.914179</td>\n",
" <td>0.048507</td>\n",
" <td>0.003731</td>\n",
" <td>0.033582</td>\n",
" <td>RTE</td>\n",
" <td>268</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>191718</th>\n",
" <td>TRABZON</td>\n",
" <td>YOMRA</td>\n",
" <td>1032</td>\n",
" <td>0.188925</td>\n",
" <td>0.775244</td>\n",
" <td>0.000000</td>\n",
" <td>0.035831</td>\n",
" <td>KK</td>\n",
" <td>307</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>878 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" İl İlçe Sandık RTE KK Mİ SO \\\n",
"47 SAKARYA ADAPAZARI 2043 0.363636 0.633229 0.003135 0.000000 \n",
"603 SAKARYA AKYAZI 1019 0.473913 0.491304 0.000000 0.034783 \n",
"606 SAKARYA AKYAZI 1022 0.310734 0.627119 0.011299 0.050847 \n",
"1831 SAKARYA KOCAALİ 1011 0.000000 0.315018 0.633700 0.051282 \n",
"1888 SAKARYA PAMUKOVA 1003 0.408163 0.530612 0.000000 0.061224 \n",
"... ... ... ... ... ... ... ... \n",
"190856 TRABZON ORTAHİSAR 2124 0.909722 0.083333 0.000000 0.006944 \n",
"190956 TRABZON ORTAHİSAR 1074 0.906040 0.073826 0.000000 0.020134 \n",
"190958 TRABZON ORTAHİSAR 1075 0.889831 0.067797 0.000000 0.042373 \n",
"191048 TRABZON ORTAHİSAR 2169 0.914179 0.048507 0.003731 0.033582 \n",
"191718 TRABZON YOMRA 1032 0.188925 0.775244 0.000000 0.035831 \n",
"\n",
" EnÇokOy ToplamOy Şüpheli \n",
"47 KK 319 True \n",
"603 KK 230 True \n",
"606 KK 177 True \n",
"1831 Mİ 273 True \n",
"1888 KK 49 True \n",
"... ... ... ... \n",
"190856 RTE 288 True \n",
"190956 RTE 149 True \n",
"190958 RTE 118 True \n",
"191048 RTE 268 True \n",
"191718 KK 307 True \n",
"\n",
"[878 rows x 10 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['Şüpheli']]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "rekognize-finance",
"language": "python",
"name": "rekognize-finance"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@ZeynepP
Copy link

ZeynepP commented May 24, 2023

Merhaba, öncelikle ellerinize sağlık. Neden ilçe bazlı kümeleme tercih edildi? Teşekkürler.

@onurmatik
Copy link
Author

Merhaba, teşekkürler. Daha küçük coğrafi dağılım yok verilerde. Sandık numaraları coğrafi olarak da yakınlık ima ediyor, dolayısıyla kümeleme hesabına katılabilirdi. Ancak hem daha zor olduğu için üşendim hem de sonuçları gözle değerlendirmek her durumda gerekeceği için pratik olarak çok katkısı olmayacağını düşündüm.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment