Created
February 4, 2016 04:51
-
-
Save koalaboy808/270cc73b71a469484338 to your computer and use it in GitHub Desktop.
clustering of emojis by using kmeans clustering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Purpose of K-Means Clustering\n", | |
"\n", | |
"The purpose of this notebook is to use the annotations of each emoji to cluster the emojis intelligently.\n", | |
"* use 'emoji_webscraped_expanded.json' file to create categories" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Import Packages**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Third-party and standard-library packages used by the cells below\n", | |
"import nltk, re\n", | |
"import numpy as np\n", | |
"import gensim\n", | |
"from gensim.models import Word2Vec\n", | |
"from nltk.data import find\n", | |
"import pandas as pd\n", | |
"from collections import defaultdict\n", | |
"from nltk.corpus import stopwords\n", | |
"import json" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Load the scraped emoji data (titles, descriptions, annotations) from the JSON file into a DataFrame**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the scraped emoji records into the DataFrame `df`.\n", | |
"EMOJI_JSON_PATH = \"data/emoji_webscraped_expanded.json\"\n", | |
"df = pd.read_json(EMOJI_JSON_PATH)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Subset the DataFrame to the rows whose annotations contain 'face' (matching on 'person' is currently disabled)**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def subset_annotations(_df, keywords=('face',)):\n", | |
"    \"\"\"Return the rows whose annotations contain at least one of `keywords`.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    _df : pandas.DataFrame or iterable of annotation lists\n", | |
"        Preferably the emoji DataFrame itself: its 'annotations' column is\n", | |
"        searched and the matching rows of `_df` are returned. For backward\n", | |
"        compatibility an iterable (e.g. a Series) of annotation lists is\n", | |
"        also accepted, in which case the rows are taken from the\n", | |
"        notebook-level `df`, exactly as before.\n", | |
"    keywords : iterable of str, optional\n", | |
"        Annotation words to look for; defaults to ('face',).\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    pandas.DataFrame\n", | |
"        Positional subset of the frame; the original index labels are kept.\n", | |
"    \"\"\"\n", | |
"    if isinstance(_df, pd.DataFrame):\n", | |
"        # Iterating a DataFrame yields its column names, not its rows, so the\n", | |
"        # annotations column has to be selected explicitly.\n", | |
"        annotations, frame = _df['annotations'], _df\n", | |
"    else:\n", | |
"        # Legacy call style: only the annotations were passed in, so the rows\n", | |
"        # still have to come from the notebook-level DataFrame.\n", | |
"        annotations, frame = _df, df\n", | |
"\n", | |
"    matching_rows = [\n", | |
"        position\n", | |
"        for position, words in enumerate(annotations)\n", | |
"        if any(keyword in list(words) for keyword in keywords)\n", | |
"    ]\n", | |
"    return frame.iloc[matching_rows]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Return the list of all annotation words and the most common ones (top 100 by default)**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def word_list(annotations, _common=100):\n", | |
"    \"\"\"Flatten the annotations into one word list and rank the most frequent words.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    annotations : iterable of iterables of str\n", | |
"        One collection of annotation words per emoji.\n", | |
"    _common : int, optional\n", | |
"        How many of the most frequent words to return (default 100).\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    total_list : list of str\n", | |
"        Every annotation word that is not an English stop word, in input order.\n", | |
"    top_list : list of (str, int)\n", | |
"        The `_common` most frequent words together with their counts.\n", | |
"    \"\"\"\n", | |
"    # A set makes each stop-word lookup O(1) instead of a scan over NLTK's list.\n", | |
"    stop_words = set(stopwords.words('english'))\n", | |
"    total_list = [\n", | |
"        word\n", | |
"        for item in annotations\n", | |
"        for word in item\n", | |
"        if word not in stop_words\n", | |
"    ]\n", | |
"    top_list = nltk.FreqDist(total_list).most_common(_common)\n", | |
"    return total_list, top_list" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Return the top words only, without their frequency counts**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def top_justwords(top_list):\n", | |
"    \"\"\"Keep only the first element (the word) of every entry in `top_list`.\"\"\"\n", | |
"    top_words = []\n", | |
"    for entry in top_list:\n", | |
"        top_words.append(entry[0])\n", | |
"    return top_words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def face_by_topwords(top, df):\n", | |
"    \"\"\"Flag, for every row, which of the top words occur in its annotations.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    top : list of str\n", | |
"        The top annotation words (without counts).\n", | |
"    df : pandas.DataFrame\n", | |
"        Frame with an 'annotations' column holding a list of words per row.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    pandas.DataFrame\n", | |
"        A copy of `df`, re-indexed 0..n-1, with two extra columns:\n", | |
"        'top_words' (the `top` list repeated on every row) and 'top_binary'\n", | |
"        (one boolean per top word, True when the row's annotations contain it).\n", | |
"    \"\"\"\n", | |
"    # Work on a copy so the caller's frame (which may be a slice of another\n", | |
"    # frame) is left untouched and re-running the cell is idempotent.\n", | |
"    df = df.copy()\n", | |
"    df['top_words'] = pd.Series([top for _ in range(df.shape[0])], index=df.index)\n", | |
"\n", | |
"    # Build the Series of top words once instead of once per row.\n", | |
"    top_series = pd.Series(top)\n", | |
"    df['top_binary'] = pd.Series(\n", | |
"        [list(top_series.isin(words)) for words in df['annotations']],\n", | |
"        index=df.index,\n", | |
"    )\n", | |
"\n", | |
"    return df.reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.cluster import KMeans\n", | |
"\n", | |
"def Clustering(df_combined, n_clusters=5, first_feature_col=7, random_state=None):\n", | |
"    \"\"\"Run k-means on the feature columns of `df_combined` and return the labels.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df_combined : pandas.DataFrame\n", | |
"        Frame whose columns from position `first_feature_col` onwards are the\n", | |
"        features to cluster.\n", | |
"    n_clusters : int, optional\n", | |
"        Number of clusters (default 5).\n", | |
"    first_feature_col : int, optional\n", | |
"        Position of the first feature column (default 7, the value that was\n", | |
"        previously hard-coded).\n", | |
"    random_state : int or None, optional\n", | |
"        Seed forwarded to KMeans; pass an int for reproducible clusters.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    numpy.ndarray\n", | |
"        Cluster label of every row, in row order.\n", | |
"    \"\"\"\n", | |
"    # `.ix` was removed from pandas; `.iloc` does the same position-based slice.\n", | |
"    _X = np.array(df_combined.iloc[:, first_feature_col:])\n", | |
"    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=20,\n", | |
"                     random_state=random_state)\n", | |
"    k_means.fit(_X)\n", | |
"    k_means_labels = k_means.labels_\n", | |
"    k_means_cluster_centers = k_means.cluster_centers_\n", | |
"    k_means_labels_unique = np.unique(k_means_labels)\n", | |
"    ft = (k_means_labels, k_means_cluster_centers, k_means_labels_unique)\n", | |
"    # Pair each cluster id with the coordinates of its centre.\n", | |
"    labels_location = list(zip(k_means_labels_unique, k_means_cluster_centers))\n", | |
"    print (\"labels:\\n %s, \\n cluster centers:\\n %s,\\n unique labels:\\n %s\" % ft)\n", | |
"    print(labels_location)\n", | |
"    return k_means_labels" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def create_dictionary(df, labels):\n", | |
"    \"\"\"Group the emoji byte codes by their cluster label.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df : pandas.DataFrame\n", | |
"        Frame with a 'byteCode1' column, one row per clustered emoji.\n", | |
"    labels : sequence\n", | |
"        Cluster label for every row of `df`, in row order.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    collections.defaultdict\n", | |
"        Maps each cluster label to the list of 'byteCode1' values in it.\n", | |
"\n", | |
"    Notes\n", | |
"    -----\n", | |
"    As a side effect the labels are stored on `df` in a 'k_means' column.\n", | |
"    \"\"\"\n", | |
"    # Use the arguments; this function previously read the notebook globals\n", | |
"    # `df_combined` and `k_means_labels` instead of its own parameters.\n", | |
"    df['k_means'] = list(labels)\n", | |
"\n", | |
"    # Walk the (label, byte code) pairs and collect the byte codes per label.\n", | |
"    dict_grouping = defaultdict(list)\n", | |
"    for cluster, byte_code in df[['k_means', 'byteCode1']].values.tolist():\n", | |
"        dict_grouping[cluster].append(byte_code)\n", | |
"\n", | |
"    return dict_grouping" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Call Functions" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Output Emojis by Category" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"๐ผ๐๐๐ฟ๐น๐บ๐๐ป๐ฝ๐พ\n", | |
"\n", | |
"1\n", | |
"๐๐๐โ๐๐๐ฃ๐๐๐โบ๐๐๐ค๐๐ค๐๐๐ถ๐๐๐ฃ๐ฅ๐ฎ๐ค๐๐ฏ๐ช๐ซ๐ด๐๐ค๐๐๐โน๐๐๐๐๐๐๐๐๐ท๐ค๐ค๐ ๐ค๐ฒ๐๐๐ค๐ข๐ญ๐ฆ๐ง๐จ๐๐ฉ๐ฌ๐ฐ๐ฑ๐ณ๐ต๐ก๐ ๐โ ๐ค๐๐ฟ๐\n", | |
"\n", | |
"2\n", | |
"๐๐๐๐๐\n", | |
"\n", | |
"3\n", | |
"๐ต๐ถ๐บ๐ฑ๐ฆ๐ฏ๐ด๐ฆ๐ฎ๐ท๐ฝ๐ญ๐น๐ฐ๐ป๐ผ๐ธ๐ฒ๐ณ๐ฌ๐ฉ๐บ๐ธ๐น๐ป๐ผ๐ฝ๐๐ฟ๐พ\n", | |
"\n", | |
"4\n", | |
"๐๐๐\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print every cluster label followed by the emojis grouped under it.\n", | |
"for cluster_label, byte_codes in dict_grouping.items():\n", | |
"    print(cluster_label)\n", | |
"    # Each stored value is turned into text, encoded as ASCII and its escape\n", | |
"    # sequences are decoded into the actual characters.\n", | |
"    decoded = [str(code).encode('ascii').decode('unicode-escape') for code in byte_codes]\n", | |
"    print(''.join(decoded), end='\\n\\n')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment