koalaboy808/clustering_kmeans.ipynb

## clustering_kmeans.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Purpose of K-Means Clustering\n",
    "\n",
    "The purpose of the notebook is to use annotatoins of each emoji to intelligently cluster emojis\n",
    "* use 'emoji_webscraped_expanded.json' file to create categories"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "** Import Packages **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import packages work bitch please meow\n",
    "# meow meow\n",
    "# meow\n",
    "import nltk, re\n",
    "import numpy as np\n",
    "import gensim\n",
    "from gensim.models import Word2Vec\n",
    "from nltk.data import find\n",
    "import pandas as pd\n",
    "from collections import defaultdict\n",
    "from nltk.corpus import stopwords\n",
    "import json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "** Function to convert the text file into a list of 1) titles 2) descriptions 3) annotations **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = pd.read_json(\"data/emoji_webscraped_expanded.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Subset Dataframe to only annotations which either contain face or person**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def subset_annotations(_df):\n",
    "    list_titles = [list(item) for item in list(_df)]\n",
    "    index_face_person = [index for index,value in enumerate(list_titles) if 'face' in value] # or 'person' in value]\n",
    "    # print(len(index_face_person))\n",
    "    df_face_person = df.iloc[index_face_person]\n",
    "    # print(df_face_person.shape)\n",
    "    # df_face_person.head()\n",
    "    return df_face_person"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "** return list of all annotations and the fifty top annotations **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def word_list(annotations, _common=100):\n",
    "    stop_words = stopwords.words('english')\n",
    "    total_list = [word for item in list(annotations) for word in item if word not in stop_words]\n",
    "    top_list = nltk.FreqDist(total_list).most_common(_common)\n",
    "    return total_list, top_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "** return list of fifty top words without frequency rate **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def top_justwords(top_list):\n",
    "    top_words = [item[0] for item in top_list]\n",
    "    return top_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def face_by_topwords(top, df):\n",
    "    top_justwords_array = [(top) for i in range(df.shape[0])]\n",
    "    df['top_words'] = pd.Series(top_justwords_array, index=df.index)\n",
    "    \n",
    "    df['top_binary'] = [list(pd.Series(item).isin(list(df.annotations)[index])) for index, item in enumerate(df.top_words)]\n",
    "    df = df.reset_index(drop=True)\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "def Clustering(df_combined, n_clusters=5):\n",
    "    _X = np.array(df_combined.ix[:,7:(df_combined.shape)[1]])\n",
    "    k_means = KMeans(init='k-means++', n_clusters= n_clusters, n_init=20)\n",
    "    k_means.fit(_X)\n",
    "    k_means_labels = k_means.labels_\n",
    "    k_means_cluster_centers = k_means.cluster_centers_\n",
    "    k_means_labels_unique = np.unique(k_means_labels)\n",
    "    ft = (k_means_labels, k_means_cluster_centers, k_means_labels_unique)\n",
    "    labels = np.array(k_means_labels_unique)\n",
    "    location = np.array(k_means_cluster_centers)\n",
    "    labels_location = list(zip(labels, location))\n",
    "    # person_df['cluster_label'] = pd.DataFrame(k_means_labels)\n",
    "    print (\"labels:\\n %s, \\n cluster centers:\\n %s,\\n  unique labels:\\n %s\" % ft)\n",
    "    print(labels_location)\n",
    "    return k_means_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def create_dictionary(df, labels):\n",
    "    df_combined['k_means'] = list(k_means_labels)\n",
    "    unique_labels = list(k_means_labels)\n",
    "    \n",
    "    # subset df to two fields needed\n",
    "    subset = df_combined[['k_means','byteCode1']]\n",
    "    subset = subset.values.tolist()\n",
    "    \n",
    "    # create dictionary from subset df\n",
    "    dict_grouping = defaultdict(list)\n",
    "    for key, date in subset:\n",
    "        dict_grouping[key].append(date)\n",
    "    \n",
    "    return dict_grouping"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Call Functions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Output Emojis by Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "👼😇😈👿👹👺💀👻👽👾\n",
      "\n",
      "1\n",
      "😀😁😎⛑😍😘🗣😗😙😚☺👀🙂🤗😂🤔😐😑😶🙄😏😣😥😮🤐😃😯😪😫😴😌🤓😛😜😝☹😄🙁😒😓😔😕😖🙃😷🤒🤕😅🤑😲😞😟😤😢😭😦😧😨😆😩😬😰😱😳😵😡😠😉☠🤖😊🗿😋\n",
      "\n",
      "2\n",
      "🌚🌛🌜🌝🌞\n",
      "\n",
      "3\n",
      "🐵🐶🐺🐱🦁🐯🐴🦄🐮🐷🐽🐭🐹🐰🐻🐼🐸🐲🐳🌬💩😺😸😹😻😼😽🙀😿😾\n",
      "\n",
      "4\n",
      "🙈🙉🙊\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for key, value in dict_grouping.items():\n",
    "    print(key)\n",
    "    for _emoji in value:\n",
    "        print(bytes(\"{0}\".format(_emoji), 'ascii').decode('unicode-escape'), end=\"\")\n",
    "    print(\"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Purpose of K-Means Clustering\n",
	"\n",
	"The purpose of the notebook is to use annotatoins of each emoji to intelligently cluster emojis\n",
	"* use 'emoji_webscraped_expanded.json' file to create categories"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	" Import Packages "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# import packages work bitch please meow\n",
	"# meow meow\n",
	"# meow\n",
	"import nltk, re\n",
	"import numpy as np\n",
	"import gensim\n",
	"from gensim.models import Word2Vec\n",
	"from nltk.data import find\n",
	"import pandas as pd\n",
	"from collections import defaultdict\n",
	"from nltk.corpus import stopwords\n",
	"import json"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	" Function to convert the text file into a list of 1) titles 2) descriptions 3) annotations "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df = pd.read_json(\"data/emoji_webscraped_expanded.json\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Subset Dataframe to only annotations which either contain face or person"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def subset_annotations(_df):\n",
	" list_titles = [list(item) for item in list(_df)]\n",
	" index_face_person = [index for index,value in enumerate(list_titles) if 'face' in value] # or 'person' in value]\n",
	" # print(len(index_face_person))\n",
	" df_face_person = df.iloc[index_face_person]\n",
	" # print(df_face_person.shape)\n",
	" # df_face_person.head()\n",
	" return df_face_person"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	" return list of all annotations and the fifty top annotations "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def word_list(annotations, _common=100):\n",
	" stop_words = stopwords.words('english')\n",
	" total_list = [word for item in list(annotations) for word in item if word not in stop_words]\n",
	" top_list = nltk.FreqDist(total_list).most_common(_common)\n",
	" return total_list, top_list"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	" return list of fifty top words without frequency rate "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def top_justwords(top_list):\n",
	" top_words = [item[0] for item in top_list]\n",
	" return top_words"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def face_by_topwords(top, df):\n",
	" top_justwords_array = [(top) for i in range(df.shape[0])]\n",
	" df['top_words'] = pd.Series(top_justwords_array, index=df.index)\n",
	" \n",
	" df['top_binary'] = [list(pd.Series(item).isin(list(df.annotations)[index])) for index, item in enumerate(df.top_words)]\n",
	" df = df.reset_index(drop=True)\n",
	" \n",
	" return df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"from sklearn.cluster import KMeans\n",
	"\n",
	"def Clustering(df_combined, n_clusters=5):\n",
	" _X = np.array(df_combined.ix[:,7:(df_combined.shape)[1]])\n",
	" k_means = KMeans(init='k-means++', n_clusters= n_clusters, n_init=20)\n",
	" k_means.fit(_X)\n",
	" k_means_labels = k_means.labels_\n",
	" k_means_cluster_centers = k_means.cluster_centers_\n",
	" k_means_labels_unique = np.unique(k_means_labels)\n",
	" ft = (k_means_labels, k_means_cluster_centers, k_means_labels_unique)\n",
	" labels = np.array(k_means_labels_unique)\n",
	" location = np.array(k_means_cluster_centers)\n",
	" labels_location = list(zip(labels, location))\n",
	" # person_df['cluster_label'] = pd.DataFrame(k_means_labels)\n",
	" print (\"labels:\\n %s, \\n cluster centers:\\n %s,\\n unique labels:\\n %s\" % ft)\n",
	" print(labels_location)\n",
	" return k_means_labels"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def create_dictionary(df, labels):\n",
	" df_combined['k_means'] = list(k_means_labels)\n",
	" unique_labels = list(k_means_labels)\n",
	" \n",
	" # subset df to two fields needed\n",
	" subset = df_combined[['k_means','byteCode1']]\n",
	" subset = subset.values.tolist()\n",
	" \n",
	" # create dictionary from subset df\n",
	" dict_grouping = defaultdict(list)\n",
	" for key, date in subset:\n",
	" dict_grouping[key].append(date)\n",
	" \n",
	" return dict_grouping"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Call Functions"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Output Emojis by Category"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0\n",
	"👼😇😈👿👹👺💀👻👽👾\n",
	"\n",
	"1\n",
	"😀😁😎⛑😍😘🗣😗😙😚☺👀🙂🤗😂🤔😐😑😶🙄😏😣😥😮🤐😃😯😪😫😴😌🤓😛😜😝☹😄🙁😒😓😔😕😖🙃😷🤒🤕😅🤑😲😞😟😤😢😭😦😧😨😆😩😬😰😱😳😵😡😠😉☠🤖😊🗿😋\n",
	"\n",
	"2\n",
	"🌚🌛🌜🌝🌞\n",
	"\n",
	"3\n",
	"🐵🐶🐺🐱🦁🐯🐴🦄🐮🐷🐽🐭🐹🐰🐻🐼🐸🐲🐳🌬💩😺😸😹😻😼😽🙀😿😾\n",
	"\n",
	"4\n",
	"🙈🙉🙊\n",
	"\n"
	]
	}
	],
	"source": [
	"for key, value in dict_grouping.items():\n",
	" print(key)\n",
	" for _emoji in value:\n",
	" print(bytes(\"{0}\".format(_emoji), 'ascii').decode('unicode-escape'), end=\"\")\n",
	" print(\"\\n\")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}