Skip to content

Instantly share code, notes, and snippets.

@NaimKabir
Created January 28, 2023 03:47
Show Gist options
  • Save NaimKabir/4e3076373804adf0755ba8a802d30fa1 to your computer and use it in GitHub Desktop.
Save NaimKabir/4e3076373804adf0755ba8a802d30fa1 to your computer and use it in GitHub Desktop.
Naive method used to create dinner matches for SF Commons
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d1921601-79a2-46fd-b379-98950f63f77c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c768d8e9-2ec4-4e62-a939-812d46b7b9c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total # members to match: 297\n"
]
}
],
"source": [
"# Load the member export and keep only the three columns used for matching.\n",
"members_raw = pd.read_csv('sfcommons_likes_emails_full.csv')\n",
"df = (\n",
"    members_raw[['Name', 'Interests, Tags, Hobbies', 'Email']]\n",
"    .rename(columns={'Name': 'name', 'Interests, Tags, Hobbies': 'likes', 'Email': 'email'})\n",
"    # Strip BEFORE filtering and de-duplicating: a whitespace-only email strips to ''\n",
"    # (not NaN), and a padded copy of an address must collapse with its twin.\n",
"    .assign(email=lambda members: members.email.str.strip())\n",
")\n",
"\n",
"# A usable row needs interests and a non-blank email.\n",
"has_email = df.email.notnull() & df.email.ne('')\n",
"df = df[df.likes.notnull() & has_email]\n",
"\n",
"# One row per person; when someone appears several times, keep the last entry.\n",
"df = df.drop_duplicates(subset=['email'], keep='last')\n",
"\n",
"num_members = len(df)\n",
"\n",
"print(f\"Total # members to match: {num_members}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d8e4b926-e048-4b98-837e-8a32597b9e21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Example interest vector:\n",
"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0\n",
" 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n"
]
}
],
"source": [
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n",
"# Turn each member's comma-separated interests into a multi-hot vector.\n",
"# Tags are stripped and blank entries dropped, so that 'Art' and ' Art'\n",
"# (as in 'Music, Art') share one axis and a trailing comma adds no empty class.\n",
"\n",
"df['likes_array'] = df.likes.str.split(',').apply(\n",
"    lambda tags: [tag.strip() for tag in tags if tag.strip()]\n",
")\n",
"mlb = MultiLabelBinarizer()\n",
"member_vectors = mlb.fit_transform(df.likes_array)\n",
"\n",
"# Row position (not index label) of the member used as a running example.\n",
"example_member = 11\n",
"print(\"Example interest vector:\")\n",
"print(member_vectors[example_member,:])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6ab6148c-f27b-4189-acee-394c611857c8",
"metadata": {},
"outputs": [],
"source": [
"# Utility\n",
"\n",
"from itertools import chain\n",
"from collections import Counter\n",
"\n",
"def flatten(list_of_list):\n",
"    \"\"\"Concatenate an iterable of iterables into a single flat list.\"\"\"\n",
"    return list(chain.from_iterable(list_of_list))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ff55f245-6230-4ce6-8380-09ca87d19d21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Multipliers for interests:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>47</th>\n",
" <th>48</th>\n",
" <th>49</th>\n",
" <th>50</th>\n",
" <th>51</th>\n",
" <th>52</th>\n",
" <th>53</th>\n",
" <th>54</th>\n",
" <th>55</th>\n",
" <th>56</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>interest</th>\n",
" <td>Informal Hang</td>\n",
" <td>Art</td>\n",
" <td>Book Clubs</td>\n",
" <td>Games</td>\n",
" <td>Music</td>\n",
" <td>Poetry</td>\n",
" <td>Writing</td>\n",
" <td>TfT</td>\n",
" <td>ML</td>\n",
" <td>Socials</td>\n",
" <td>...</td>\n",
" <td>Knitting</td>\n",
" <td>Economics</td>\n",
" <td>Health Tech</td>\n",
" <td>Politics</td>\n",
" <td>Mathematics</td>\n",
" <td>Psychology</td>\n",
" <td>EA</td>\n",
" <td>Climbing</td>\n",
" <td>Dance</td>\n",
" <td>tools for thoughts</td>\n",
" </tr>\n",
" <tr>\n",
" <th>multiplier</th>\n",
" <td>2.628319</td>\n",
" <td>3.3</td>\n",
" <td>3.061856</td>\n",
" <td>4.95</td>\n",
" <td>3.857143</td>\n",
" <td>7.815789</td>\n",
" <td>2.97</td>\n",
" <td>11.0</td>\n",
" <td>3.907895</td>\n",
" <td>3.061856</td>\n",
" <td>...</td>\n",
" <td>297.0</td>\n",
" <td>148.5</td>\n",
" <td>8.735294</td>\n",
" <td>12.913043</td>\n",
" <td>17.470588</td>\n",
" <td>10.241379</td>\n",
" <td>9.580645</td>\n",
" <td>18.5625</td>\n",
" <td>17.470588</td>\n",
" <td>74.25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 57 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"interest Informal Hang Art Book Clubs Games Music Poetry \n",
"multiplier 2.628319 3.3 3.061856 4.95 3.857143 7.815789 \n",
"\n",
" 6 7 8 9 ... 47 48 \\\n",
"interest Writing TfT ML Socials ... Knitting Economics \n",
"multiplier 2.97 11.0 3.907895 3.061856 ... 297.0 148.5 \n",
"\n",
" 49 50 51 52 53 \\\n",
"interest Health Tech Politics Mathematics Psychology EA \n",
"multiplier 8.735294 12.913043 17.470588 10.241379 9.580645 \n",
"\n",
" 54 55 56 \n",
"interest Climbing Dance tools for thoughts \n",
"multiplier 18.5625 17.470588 74.25 \n",
"\n",
"[2 rows x 57 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A plain multi-hot encoding treats every interest axis as equally important,\n",
"# but rarer interests should carry somewhat more weight when matching people,\n",
"# so distances along the rarer axes get stretched.\n",
"\n",
"all_likes = flatten(df.likes_array)\n",
"like_counts = Counter(all_likes)\n",
"\n",
"# Empirical probability of each interest: occurrences divided by member count\n",
"like_p = {interest: count / num_members for interest, count in like_counts.items()}\n",
"\n",
"# Each axis is scaled by its INVERSE probability:\n",
"# the rarer the interest, the more it counts in clustering.\n",
"# TODO: Maybe throw a sigmoid on these to bound them, although today they don't\n",
"# stretch far enough to produce unreasonable clusters\n",
"like_multiplier = {interest: 1 / probability for interest, probability in like_p.items()}\n",
"\n",
"print(\"Multipliers for interests:\")\n",
"multiplier_table = pd.DataFrame.from_dict(\n",
"    {'interest': like_multiplier.keys(), 'multiplier': like_multiplier.values()}\n",
")\n",
"multiplier_table.T"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6b0d5e21-0d54-4ed4-b297-323ab5237582",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Example interest vector, with multipliers applied:\n",
"[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 5\n",
" 0 0 0 0 0 297 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0]\n"
]
}
],
"source": [
"# Column position of an interest within the binarizer's class ordering.\n",
"idx = lambda like: np.where(mlb.classes_ == like)[0][0]\n",
"\n",
"# Cast to float BEFORE scaling: member_vectors is an integer array, and writing\n",
"# fractional multipliers into an integer array silently truncates them\n",
"# (8.74 becomes 8, 2.63 becomes 2), which distorts the intended weighting.\n",
"exaggerated_member_vectors = member_vectors.astype(float)\n",
"for like, multiplier in like_multiplier.items():\n",
"    exaggerated_member_vectors[:, idx(like)] *= multiplier\n",
"\n",
"print(\"Example interest vector, with multipliers applied:\")\n",
"print(exaggerated_member_vectors[example_member,:])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4755c769-d982-490d-b3ee-aa11572d67ea",
"metadata": {},
"outputs": [],
"source": [
"# Create clusters in this space we've created.\n",
"# There's a neat package that lets us add constraints like max and min cluster membership as well!\n",
"\n",
"from sklearn.cluster import KMeans\n",
"from k_means_constrained import KMeansConstrained\n",
"# Target number of people per dinner; groups may be one smaller or one larger.\n",
"group_size = 4\n",
"\n",
"# Fixed seed so that re-running the notebook reproduces the same dinner groups.\n",
"random_seed = 42\n",
"\n",
"clusterer = KMeansConstrained(\n",
"    n_clusters=num_members // group_size,\n",
"    size_min=group_size - 1,\n",
"    size_max=group_size + 1,\n",
"    random_state=random_seed,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "782362f9-e369-42b6-826f-5d34d0a8ba8f",
"metadata": {},
"outputs": [],
"source": [
"# Fit the size-constrained clusterer on the weighted interest vectors and\n",
"# store each member's cluster label as their dinner group.\n",
"group_labels = clusterer.fit_predict(exaggerated_member_vectors)\n",
"df['group_id'] = group_labels"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment