NaimKabir/sfcommons_dinner.ipynb

## sfcommons_dinner.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d1921601-79a2-46fd-b379-98950f63f77c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c768d8e9-2ec4-4e62-a939-812d46b7b9c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total # members to match: 297\n"
     ]
    }
   ],
   "source": [
    "# Load the member export and keep only the columns used for matching,\n",
    "# renamed to short lowercase names.\n",
    "df = pd.read_csv('sfcommons_likes_emails_full.csv')\n",
    "df = df[['Name', 'Interests, Tags, Hobbies', 'Email']].rename(columns={'Name':'name', 'Interests, Tags, Hobbies': 'likes', 'Email': 'email'})\n",
    "\n",
    "# Strip emails *before* filtering and de-duplicating: stripping never produces\n",
    "# nulls by itself, so blank / whitespace-only addresses are rejected explicitly,\n",
    "# and ' a@b.c' vs 'a@b.c' must count as the same member.\n",
    "df['email'] = df.email.str.strip()\n",
    "df = df[df.likes.notnull() & df.email.notnull() & (df.email != '')]\n",
    "\n",
    "# One row per email address; the last occurrence wins.\n",
    "df = df.drop_duplicates(subset=['email'], keep='last')\n",
    "\n",
    "num_members = df.name.size\n",
    "\n",
    "print(f\"Total # members to match: {num_members}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d8e4b926-e048-4b98-837e-8a32597b9e21",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Example interest vector:\n",
      "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0\n",
      " 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
    "# Turn each member's comma-separated interests into a list of clean tags.\n",
    "# Tags are stripped so that 'Art' and ' Art' map to the same class, and empty\n",
    "# fragments (e.g. from a trailing comma) are dropped.\n",
    "df['likes_array'] = df.likes.str.split(',').apply(\n",
    "    lambda tags: [tag.strip() for tag in tags if tag.strip()]\n",
    ")\n",
    "\n",
    "# One binary indicator column per distinct interest; columns follow mlb.classes_.\n",
    "mlb = MultiLabelBinarizer()\n",
    "member_vectors = mlb.fit_transform(df.likes_array)\n",
    "\n",
    "example_member = 11\n",
    "print(\"Example interest vector:\")\n",
    "print(member_vectors[example_member,:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6ab6148c-f27b-4189-acee-394c611857c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Utility helpers shared by later cells\n",
    "\n",
    "from itertools import chain\n",
    "from collections import Counter\n",
    "\n",
    "\n",
    "def flatten(list_of_list):\n",
    "    \"\"\"Concatenate an iterable of iterables into one flat list.\"\"\"\n",
    "    return list(chain.from_iterable(list_of_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ff55f245-6230-4ce6-8380-09ca87d19d21",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Multipliers for interests:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>47</th>\n",
       "      <th>48</th>\n",
       "      <th>49</th>\n",
       "      <th>50</th>\n",
       "      <th>51</th>\n",
       "      <th>52</th>\n",
       "      <th>53</th>\n",
       "      <th>54</th>\n",
       "      <th>55</th>\n",
       "      <th>56</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>interest</th>\n",
       "      <td>Informal Hang</td>\n",
       "      <td>Art</td>\n",
       "      <td>Book Clubs</td>\n",
       "      <td>Games</td>\n",
       "      <td>Music</td>\n",
       "      <td>Poetry</td>\n",
       "      <td>Writing</td>\n",
       "      <td>TfT</td>\n",
       "      <td>ML</td>\n",
       "      <td>Socials</td>\n",
       "      <td>...</td>\n",
       "      <td>Knitting</td>\n",
       "      <td>Economics</td>\n",
       "      <td>Health Tech</td>\n",
       "      <td>Politics</td>\n",
       "      <td>Mathematics</td>\n",
       "      <td>Psychology</td>\n",
       "      <td>EA</td>\n",
       "      <td>Climbing</td>\n",
       "      <td>Dance</td>\n",
       "      <td>tools for thoughts</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>multiplier</th>\n",
       "      <td>2.628319</td>\n",
       "      <td>3.3</td>\n",
       "      <td>3.061856</td>\n",
       "      <td>4.95</td>\n",
       "      <td>3.857143</td>\n",
       "      <td>7.815789</td>\n",
       "      <td>2.97</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.907895</td>\n",
       "      <td>3.061856</td>\n",
       "      <td>...</td>\n",
       "      <td>297.0</td>\n",
       "      <td>148.5</td>\n",
       "      <td>8.735294</td>\n",
       "      <td>12.913043</td>\n",
       "      <td>17.470588</td>\n",
       "      <td>10.241379</td>\n",
       "      <td>9.580645</td>\n",
       "      <td>18.5625</td>\n",
       "      <td>17.470588</td>\n",
       "      <td>74.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 57 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       0    1           2      3         4         5   \\\n",
       "interest    Informal Hang  Art  Book Clubs  Games     Music    Poetry   \n",
       "multiplier       2.628319  3.3    3.061856   4.95  3.857143  7.815789   \n",
       "\n",
       "                 6     7         8         9   ...        47         48  \\\n",
       "interest    Writing   TfT        ML   Socials  ...  Knitting  Economics   \n",
       "multiplier     2.97  11.0  3.907895  3.061856  ...     297.0      148.5   \n",
       "\n",
       "                     49         50           51          52        53  \\\n",
       "interest    Health Tech   Politics  Mathematics  Psychology        EA   \n",
       "multiplier     8.735294  12.913043    17.470588   10.241379  9.580645   \n",
       "\n",
       "                  54         55                  56  \n",
       "interest    Climbing      Dance  tools for thoughts  \n",
       "multiplier   18.5625  17.470588               74.25  \n",
       "\n",
       "[2 rows x 57 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A plain multi-label encoding treats every interest axis as equally important,\n",
    "# but sharing a rare interest says more than sharing a common one.\n",
    "# So distances along rarer axes get stretched.\n",
    "\n",
    "all_likes = flatten(df.likes_array)\n",
    "like_counts = Counter(all_likes)\n",
    "\n",
    "# Fraction of members that list each interest\n",
    "like_p = {interest: count / float(num_members) for interest, count in like_counts.items()}\n",
    "\n",
    "# Stretch each axis by its inverse frequency: the rarer, the heavier.\n",
    "# TODO: Maybe squash these (e.g. with a sigmoid) to bound them—so far they\n",
    "# don't stretch far enough to produce unreasonable clusters\n",
    "like_multiplier = {interest: 1 / p for interest, p in like_p.items()}\n",
    "\n",
    "print(\"Multipliers for interests:\")\n",
    "multiplier_table = {'interest': like_multiplier.keys(), 'multiplier': like_multiplier.values()}\n",
    "pd.DataFrame.from_dict(multiplier_table).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6b0d5e21-0d54-4ed4-b297-323ab5237582",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Example interest vector, with multipliers applied:\n",
      "[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0\n",
      "   0   0   0   0   0   0   0   0   0   0   0   0   0   8   0   0   0   5\n",
      "   0   0   0   0   0 297   0   0   0   0   0   0   0   0   0   0   0   0\n",
      "   0   0   0]\n"
     ]
    }
   ],
   "source": [
    "# Column of `member_vectors` that corresponds to a given interest label.\n",
    "idx = lambda like: np.where(mlb.classes_ == like)[0][0]\n",
    "\n",
    "# Work on a float copy: the binarizer output is an integer array, and writing\n",
    "# float multipliers back into it silently truncates them (8.735 -> 8).\n",
    "exaggerated_member_vectors = member_vectors.astype(float)\n",
    "for like, multiplier in like_multiplier.items():\n",
    "    # In-place scaling of one interest column; idx() is evaluated once per label.\n",
    "    exaggerated_member_vectors[:, idx(like)] *= multiplier\n",
    "\n",
    "print(\"Example interest vector, with multipliers applied:\")\n",
    "print(exaggerated_member_vectors[example_member,:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4755c769-d982-490d-b3ee-aa11572d67ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create clusters in the weighted interest space.\n",
    "# KMeansConstrained is k-means with lower/upper bounds on cluster membership,\n",
    "# which keeps every group within one seat of the target size.\n",
    "\n",
    "from sklearn.cluster import KMeans\n",
    "from k_means_constrained import KMeansConstrained\n",
    "\n",
    "group_size = 4\n",
    "# Fixed seed so that re-running the notebook reproduces the same groups.\n",
    "random_seed = 0\n",
    "clusterer = KMeansConstrained(\n",
    "    n_clusters=num_members // group_size,\n",
    "    size_min=group_size - 1,\n",
    "    size_max=group_size + 1,\n",
    "    random_state=random_seed,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "782362f9-e369-42b6-826f-5d34d0a8ba8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the constrained clusterer and record each member's cluster label.\n",
    "group_labels = clusterer.fit_predict(exaggerated_member_vectors)\n",
    "df['group_id'] = group_labels"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "d1921601-79a2-46fd-b379-98950f63f77c",
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "c768d8e9-2ec4-4e62-a939-812d46b7b9c3",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Total # members to match: 297\n"
	]
	}
	],
	"source": [
	"# Load the member export and keep only the columns used for matching,\n",
	"# renamed to short lowercase names.\n",
	"df = pd.read_csv('sfcommons_likes_emails_full.csv')\n",
	"df = df[['Name', 'Interests, Tags, Hobbies', 'Email']].rename(columns={'Name':'name', 'Interests, Tags, Hobbies': 'likes', 'Email': 'email'})\n",
	"\n",
	"# Strip emails *before* filtering and de-duplicating: stripping never produces\n",
	"# nulls by itself, so blank / whitespace-only addresses are rejected explicitly,\n",
	"# and ' a@b.c' vs 'a@b.c' must count as the same member.\n",
	"df['email'] = df.email.str.strip()\n",
	"df = df[df.likes.notnull() & df.email.notnull() & (df.email != '')]\n",
	"\n",
	"# One row per email address; the last occurrence wins.\n",
	"df = df.drop_duplicates(subset=['email'], keep='last')\n",
	"\n",
	"num_members = df.name.size\n",
	"\n",
	"print(f\"Total # members to match: {num_members}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "d8e4b926-e048-4b98-837e-8a32597b9e21",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Example interest vector:\n",
	"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0\n",
	" 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n"
	]
	}
	],
	"source": [
	"from sklearn.preprocessing import MultiLabelBinarizer\n",
	"\n",
	"# Turn each member's comma-separated interests into a list of clean tags.\n",
	"# Tags are stripped so that 'Art' and ' Art' map to the same class, and empty\n",
	"# fragments (e.g. from a trailing comma) are dropped.\n",
	"df['likes_array'] = df.likes.str.split(',').apply(\n",
	"    lambda tags: [tag.strip() for tag in tags if tag.strip()]\n",
	")\n",
	"\n",
	"# One binary indicator column per distinct interest; columns follow mlb.classes_.\n",
	"mlb = MultiLabelBinarizer()\n",
	"member_vectors = mlb.fit_transform(df.likes_array)\n",
	"\n",
	"example_member = 11\n",
	"print(\"Example interest vector:\")\n",
	"print(member_vectors[example_member,:])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "6ab6148c-f27b-4189-acee-394c611857c8",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Utility helpers shared by later cells\n",
	"\n",
	"from itertools import chain\n",
	"from collections import Counter\n",
	"\n",
	"\n",
	"def flatten(list_of_list):\n",
	"    \"\"\"Concatenate an iterable of iterables into one flat list.\"\"\"\n",
	"    return list(chain.from_iterable(list_of_list))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "ff55f245-6230-4ce6-8380-09ca87d19d21",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Multipliers for interests:\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" <th>3</th>\n",
	" <th>4</th>\n",
	" <th>5</th>\n",
	" <th>6</th>\n",
	" <th>7</th>\n",
	" <th>8</th>\n",
	" <th>9</th>\n",
	" <th>...</th>\n",
	" <th>47</th>\n",
	" <th>48</th>\n",
	" <th>49</th>\n",
	" <th>50</th>\n",
	" <th>51</th>\n",
	" <th>52</th>\n",
	" <th>53</th>\n",
	" <th>54</th>\n",
	" <th>55</th>\n",
	" <th>56</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>interest</th>\n",
	" <td>Informal Hang</td>\n",
	" <td>Art</td>\n",
	" <td>Book Clubs</td>\n",
	" <td>Games</td>\n",
	" <td>Music</td>\n",
	" <td>Poetry</td>\n",
	" <td>Writing</td>\n",
	" <td>TfT</td>\n",
	" <td>ML</td>\n",
	" <td>Socials</td>\n",
	" <td>...</td>\n",
	" <td>Knitting</td>\n",
	" <td>Economics</td>\n",
	" <td>Health Tech</td>\n",
	" <td>Politics</td>\n",
	" <td>Mathematics</td>\n",
	" <td>Psychology</td>\n",
	" <td>EA</td>\n",
	" <td>Climbing</td>\n",
	" <td>Dance</td>\n",
	" <td>tools for thoughts</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>multiplier</th>\n",
	" <td>2.628319</td>\n",
	" <td>3.3</td>\n",
	" <td>3.061856</td>\n",
	" <td>4.95</td>\n",
	" <td>3.857143</td>\n",
	" <td>7.815789</td>\n",
	" <td>2.97</td>\n",
	" <td>11.0</td>\n",
	" <td>3.907895</td>\n",
	" <td>3.061856</td>\n",
	" <td>...</td>\n",
	" <td>297.0</td>\n",
	" <td>148.5</td>\n",
	" <td>8.735294</td>\n",
	" <td>12.913043</td>\n",
	" <td>17.470588</td>\n",
	" <td>10.241379</td>\n",
	" <td>9.580645</td>\n",
	" <td>18.5625</td>\n",
	" <td>17.470588</td>\n",
	" <td>74.25</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>2 rows × 57 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1 2 3 4 5 \\\n",
	"interest Informal Hang Art Book Clubs Games Music Poetry \n",
	"multiplier 2.628319 3.3 3.061856 4.95 3.857143 7.815789 \n",
	"\n",
	" 6 7 8 9 ... 47 48 \\\n",
	"interest Writing TfT ML Socials ... Knitting Economics \n",
	"multiplier 2.97 11.0 3.907895 3.061856 ... 297.0 148.5 \n",
	"\n",
	" 49 50 51 52 53 \\\n",
	"interest Health Tech Politics Mathematics Psychology EA \n",
	"multiplier 8.735294 12.913043 17.470588 10.241379 9.580645 \n",
	"\n",
	" 54 55 56 \n",
	"interest Climbing Dance tools for thoughts \n",
	"multiplier 18.5625 17.470588 74.25 \n",
	"\n",
	"[2 rows x 57 columns]"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# A plain multi-label encoding treats every interest axis as equally important,\n",
	"# but sharing a rare interest says more than sharing a common one.\n",
	"# So distances along rarer axes get stretched.\n",
	"\n",
	"all_likes = flatten(df.likes_array)\n",
	"like_counts = Counter(all_likes)\n",
	"\n",
	"# Fraction of members that list each interest\n",
	"like_p = {interest: count / float(num_members) for interest, count in like_counts.items()}\n",
	"\n",
	"# Stretch each axis by its inverse frequency: the rarer, the heavier.\n",
	"# TODO: Maybe squash these (e.g. with a sigmoid) to bound them—so far they\n",
	"# don't stretch far enough to produce unreasonable clusters\n",
	"like_multiplier = {interest: 1 / p for interest, p in like_p.items()}\n",
	"\n",
	"print(\"Multipliers for interests:\")\n",
	"multiplier_table = {'interest': like_multiplier.keys(), 'multiplier': like_multiplier.values()}\n",
	"pd.DataFrame.from_dict(multiplier_table).T"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "6b0d5e21-0d54-4ed4-b297-323ab5237582",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Example interest vector, with multipliers applied:\n",
	"[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 5\n",
	" 0 0 0 0 0 297 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0]\n"
	]
	}
	],
	"source": [
	"# Column of `member_vectors` that corresponds to a given interest label.\n",
	"idx = lambda like: np.where(mlb.classes_ == like)[0][0]\n",
	"\n",
	"# Work on a float copy: the binarizer output is an integer array, and writing\n",
	"# float multipliers back into it silently truncates them (8.735 -> 8).\n",
	"exaggerated_member_vectors = member_vectors.astype(float)\n",
	"for like, multiplier in like_multiplier.items():\n",
	"    # In-place scaling of one interest column; idx() is evaluated once per label.\n",
	"    exaggerated_member_vectors[:, idx(like)] *= multiplier\n",
	"\n",
	"print(\"Example interest vector, with multipliers applied:\")\n",
	"print(exaggerated_member_vectors[example_member,:])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "4755c769-d982-490d-b3ee-aa11572d67ea",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Create clusters in the weighted interest space.\n",
	"# KMeansConstrained is k-means with lower/upper bounds on cluster membership,\n",
	"# which keeps every group within one seat of the target size.\n",
	"\n",
	"from sklearn.cluster import KMeans\n",
	"from k_means_constrained import KMeansConstrained\n",
	"\n",
	"group_size = 4\n",
	"# Fixed seed so that re-running the notebook reproduces the same groups.\n",
	"random_seed = 0\n",
	"clusterer = KMeansConstrained(\n",
	"    n_clusters=num_members // group_size,\n",
	"    size_min=group_size - 1,\n",
	"    size_max=group_size + 1,\n",
	"    random_state=random_seed,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "782362f9-e369-42b6-826f-5d34d0a8ba8f",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Fit the constrained clusterer and record each member's cluster label.\n",
	"group_labels = clusterer.fit_predict(exaggerated_member_vectors)\n",
	"df['group_id'] = group_labels"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}