Created
January 28, 2023 03:47
-
-
Save NaimKabir/4e3076373804adf0755ba8a802d30fa1 to your computer and use it in GitHub Desktop.
Naive method used to create dinner matches for SF Commons
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "d1921601-79a2-46fd-b379-98950f63f77c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "c768d8e9-2ec4-4e62-a939-812d46b7b9c3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Total # members to match: 297\n" | |
] | |
} | |
], | |
"source": [ | |
"# Load the member export and keep only the columns needed for matching\n", | |
"df = pd.read_csv('sfcommons_likes_emails_full.csv')\n", | |
"df = df[['Name', 'Interests, Tags, Hobbies', 'Email']].rename(columns={'Name':'name', 'Interests, Tags, Hobbies': 'likes', 'Email': 'email'})\n", | |
"\n", | |
"# Normalize emails BEFORE filtering and de-duplicating: str.strip() never turns a\n", | |
"# blank string into NaN, so a whitespace-only email would otherwise pass notnull(),\n", | |
"# and the same address with stray spaces would survive drop_duplicates\n", | |
"df['email'] = df.email.str.strip()\n", | |
"has_likes = df.likes.notnull()\n", | |
"has_email = df.email.notnull() & (df.email != '')\n", | |
"df = df[has_likes & has_email]\n", | |
"\n", | |
"# If an email appears on several rows, keep only the last one\n", | |
"df = df.drop_duplicates(subset=['email'], keep='last')\n", | |
"\n", | |
"num_members = df.name.size\n", | |
"\n", | |
"print(f\"Total # members to match: {num_members}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "d8e4b926-e048-4b98-837e-8a32597b9e21", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Example interest vector:\n", | |
"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0\n", | |
" 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn.preprocessing import MultiLabelBinarizer\n", | |
"\n", | |
"# Make a binary (multi-hot) interest vector for each member\n", | |
"\n", | |
"# Trim whitespace around each comma-separated interest and drop empty entries,\n", | |
"# so that e.g. 'Art, Music' and 'Music,Art' map onto the same two labels\n", | |
"# instead of producing separate ' Music' and 'Music' columns\n", | |
"df['likes_array'] = df.likes.str.split(',').apply(\n", | |
"    lambda likes: [like.strip() for like in likes if like.strip()]\n", | |
")\n", | |
"mlb = MultiLabelBinarizer()\n", | |
"member_vectors = mlb.fit_transform(df.likes_array)\n", | |
"\n", | |
"# Positional row index into member_vectors, used only for the example printouts\n", | |
"example_member = 11\n", | |
"print(\"Example interest vector:\")\n", | |
"print(member_vectors[example_member,:])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "6ab6148c-f27b-4189-acee-394c611857c8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Utility\n", | |
"\n", | |
"from itertools import chain\n", | |
"from collections import Counter\n", | |
"\n", | |
"def flatten(list_of_list):\n", | |
"    \"\"\"Concatenate an iterable of iterables into one flat list.\"\"\"\n", | |
"    return list(chain.from_iterable(list_of_list))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "ff55f245-6230-4ce6-8380-09ca87d19d21", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Multipliers for interests:\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>5</th>\n", | |
" <th>6</th>\n", | |
" <th>7</th>\n", | |
" <th>8</th>\n", | |
" <th>9</th>\n", | |
" <th>...</th>\n", | |
" <th>47</th>\n", | |
" <th>48</th>\n", | |
" <th>49</th>\n", | |
" <th>50</th>\n", | |
" <th>51</th>\n", | |
" <th>52</th>\n", | |
" <th>53</th>\n", | |
" <th>54</th>\n", | |
" <th>55</th>\n", | |
" <th>56</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>interest</th>\n", | |
" <td>Informal Hang</td>\n", | |
" <td>Art</td>\n", | |
" <td>Book Clubs</td>\n", | |
" <td>Games</td>\n", | |
" <td>Music</td>\n", | |
" <td>Poetry</td>\n", | |
" <td>Writing</td>\n", | |
" <td>TfT</td>\n", | |
" <td>ML</td>\n", | |
" <td>Socials</td>\n", | |
" <td>...</td>\n", | |
" <td>Knitting</td>\n", | |
" <td>Economics</td>\n", | |
" <td>Health Tech</td>\n", | |
" <td>Politics</td>\n", | |
" <td>Mathematics</td>\n", | |
" <td>Psychology</td>\n", | |
" <td>EA</td>\n", | |
" <td>Climbing</td>\n", | |
" <td>Dance</td>\n", | |
" <td>tools for thoughts</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>multiplier</th>\n", | |
" <td>2.628319</td>\n", | |
" <td>3.3</td>\n", | |
" <td>3.061856</td>\n", | |
" <td>4.95</td>\n", | |
" <td>3.857143</td>\n", | |
" <td>7.815789</td>\n", | |
" <td>2.97</td>\n", | |
" <td>11.0</td>\n", | |
" <td>3.907895</td>\n", | |
" <td>3.061856</td>\n", | |
" <td>...</td>\n", | |
" <td>297.0</td>\n", | |
" <td>148.5</td>\n", | |
" <td>8.735294</td>\n", | |
" <td>12.913043</td>\n", | |
" <td>17.470588</td>\n", | |
" <td>10.241379</td>\n", | |
" <td>9.580645</td>\n", | |
" <td>18.5625</td>\n", | |
" <td>17.470588</td>\n", | |
" <td>74.25</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2 rows × 57 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3 4 5 \\\n", | |
"interest Informal Hang Art Book Clubs Games Music Poetry \n", | |
"multiplier 2.628319 3.3 3.061856 4.95 3.857143 7.815789 \n", | |
"\n", | |
" 6 7 8 9 ... 47 48 \\\n", | |
"interest Writing TfT ML Socials ... Knitting Economics \n", | |
"multiplier 2.97 11.0 3.907895 3.061856 ... 297.0 148.5 \n", | |
"\n", | |
" 49 50 51 52 53 \\\n", | |
"interest Health Tech Politics Mathematics Psychology EA \n", | |
"multiplier 8.735294 12.913043 17.470588 10.241379 9.580645 \n", | |
"\n", | |
" 54 55 56 \n", | |
"interest Climbing Dance tools for thoughts \n", | |
"multiplier 18.5625 17.470588 74.25 \n", | |
"\n", | |
"[2 rows x 57 columns]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# The binarized vectors treat every interest axis as equally important,\n", | |
"# yet rarer and more unique interests should be weighted a bit more highly.\n", | |
"# So distances along the rarer axes get exaggerated.\n", | |
"\n", | |
"all_likes = flatten(df.likes_array)\n", | |
"like_counts = Counter(all_likes)\n", | |
"\n", | |
"# Estimated probability of each interest: occurrences divided by member count\n", | |
"like_p = {\n", | |
"    interest: count / (num_members * 1.0)\n", | |
"    for interest, count in like_counts.items()\n", | |
"}\n", | |
"\n", | |
"# Each axis is scaled by the INVERSE of its probability:\n", | |
"# the rarer an interest, the more it counts in clustering.\n", | |
"# TODO: Maybe throw a sigmoid on these to bound them—but they don't stretch too far\n", | |
"# out to make unreasonable clusters today\n", | |
"like_multiplier = {\n", | |
"    interest: 1 / probability\n", | |
"    for interest, probability in like_p.items()\n", | |
"}\n", | |
"\n", | |
"print(\"Multipliers for interests:\")\n", | |
"pd.DataFrame.from_dict({'interest': like_multiplier.keys(), 'multiplier': like_multiplier.values()}).T" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "6b0d5e21-0d54-4ed4-b297-323ab5237582", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Example interest vector, with multipliers applied:\n", | |
"[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 5\n", | |
" 0 0 0 0 0 297 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Column index of a given interest label within the binarized matrix\n", | |
"idx = lambda like: np.where(mlb.classes_ == like)[0][0]\n", | |
"\n", | |
"# member_vectors is an integer 0/1 array; writing fractional multipliers into an\n", | |
"# integer copy silently truncates them (e.g. 3.3 -> 3), so work on a float copy\n", | |
"exaggerated_member_vectors = member_vectors.astype(float)\n", | |
"for like, multiplier in like_multiplier.items():\n", | |
"    # Scale the whole column for this interest by its rarity multiplier\n", | |
"    exaggerated_member_vectors[:, idx(like)] *= multiplier\n", | |
"\n", | |
"print(\"Example interest vector, with multipliers applied:\")\n", | |
"print(exaggerated_member_vectors[example_member,:])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "4755c769-d982-490d-b3ee-aa11572d67ea", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create clusters in this space we've created.\n", | |
"# There's a neat package that lets us add constraints like max and min cluster membership as well!\n", | |
"\n", | |
"from sklearn.cluster import KMeans\n", | |
"from k_means_constrained import KMeansConstrained\n", | |
"\n", | |
"group_size = 4  # target number of members per group\n", | |
"random_seed = 0  # fixed seed so that re-running the notebook yields the same groups\n", | |
"\n", | |
"# Aim for groups of `group_size`, allowing groups one smaller or one larger than the target\n", | |
"clusterer = KMeansConstrained(\n", | |
"    n_clusters=num_members // group_size,\n", | |
"    size_min=group_size - 1,\n", | |
"    size_max=group_size + 1,\n", | |
"    random_state=random_seed,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "782362f9-e369-42b6-826f-5d34d0a8ba8f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Fit the constrained clusterer on the weighted vectors and store each member's cluster label as their group\n", | |
"df['group_id'] = clusterer.fit_predict(exaggerated_member_vectors)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment