Skip to content

Instantly share code, notes, and snippets.

@moonmilk
Last active July 5, 2016 00:14
Show Gist options
  • Save moonmilk/691cb5c4d824f65d5e9b0eb77c5d0dca to your computer and use it in GitHub Desktop.
Save moonmilk/691cb5c4d824f65d5e9b0eb77c5d0dca to your computer and use it in GitHub Desktop.
messing around with ofxMSAWord2Vec
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# messing with ofxMSAWord2Vec\n",
"from https://github.com/memo/ofxMSAWord2Vec\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"load_word_vectors_bin : /Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin ... \n",
"num_words: 53084/53084\n",
"num_dims: 300\n",
"done in 7.57633709908 seconds.\n",
"------------------------------------------------------------\n",
"normalize_word_vectors ... done in 0.50585103035 seconds.\n"
]
}
],
"source": [
"# Import only the helpers this notebook uses instead of `import *`, so it is\n",
"# clear which names come from word2vec_utils.\n",
"from word2vec_utils import (\n",
"    do_word_maths,\n",
"    find_closest_words,\n",
"    load_word_vectors_bin,\n",
"    normalize_word_vectors,\n",
")\n",
"\n",
"# NOTE(review): machine-specific absolute path; point this at your local copy\n",
"# of the vectors file before running.\n",
"VECTORS_PATH = '/Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin'\n",
"\n",
"# vecs is indexed by word further down; vecs_n is the result of\n",
"# normalize_word_vectors(vecs) and is what find_closest_words is given.\n",
"vecs = load_word_vectors_bin(VECTORS_PATH)\n",
"vecs_n = normalize_word_vectors(vecs)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('dogs', 0.86804897),\n",
" ('puppy', 0.81064284),\n",
" ('cat', 0.7609458),\n",
" ('beagle', 0.74186218),\n",
" ('pup', 0.74069107),\n",
" ('chihuahua', 0.71739173),\n",
" ('pet', 0.71647859),\n",
" ('canine', 0.69182897),\n",
" ('collie', 0.67144096),\n",
" ('kitten', 0.66598809)]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"find_closest_words(vecs_n, \"dog\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('papillon', 0.69635022),\n",
" ('chihuahua', 0.6762343),\n",
" ('dalmatian', 0.65920705),\n",
" ('pug', 0.64561403),\n",
" ('puppy', 0.64243448),\n",
" ('labrador', 0.63804096),\n",
" ('mastiff', 0.62263489),\n",
" ('poodle', 0.62242281),\n",
" ('beagle', 0.62123823),\n",
" ('alsatian', 0.6157546)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"find_closest_words(vecs_n, \"pomeranian\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('pomeranian', 1.0),\n",
" ('papillon', 0.69635022),\n",
" ('chihuahua', 0.6762343),\n",
" ('dalmatian', 0.65920705),\n",
" ('pug', 0.64561403),\n",
" ('puppy', 0.64243448),\n",
" ('labrador', 0.63804096),\n",
" ('mastiff', 0.62263489),\n",
" ('poodle', 0.62242281),\n",
" ('beagle', 0.62123823)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pom_n = vecs_n[\"pomeranian\"]\n",
"find_closest_words(vecs_n, pom_n)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"metal_words = \"burn cries veins eternity breathe beast gonna demons ashes soul\".split(\" \")\n",
"unmetal_words = \"particularly indicated secretary committee university relatively noted approximately chairman employees\".split(\" \")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Look up every list word in both the raw (vecs) and normalized (vecs_n) tables.\n",
"metal_vecs = [vecs[word] for word in metal_words]\n",
"metal_vecs_n = [vecs_n[word] for word in metal_words]\n",
"unmetal_vecs = [vecs[word] for word in unmetal_words]\n",
"unmetal_vecs_n = [vecs_n[word] for word in unmetal_words]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"([('hell', 0.58362615),\n",
" ('eateth', 0.54645419),\n",
" ('souls', 0.54323936),\n",
" ('looketh', 0.53242099),\n",
" ('god', 0.52718186)],\n",
" [('said', 0.53903073),\n",
" ('acknowledged', 0.49876258),\n",
" ('stressed', 0.48518729),\n",
" ('emphasized', 0.46537137),\n",
" ('committees', 0.46324104)])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# find closest words to the average word: each of the 10 words in a list is\n",
"# given weight 0.1; results are shown for the metal list, then the unmetal list\n",
"metal_terms = [(0.1, word) for word in metal_words]\n",
"unmetal_terms = [(0.1, word) for word in unmetal_words]\n",
"(do_word_maths(vecs, vecs_n, metal_terms), do_word_maths(vecs, vecs_n, unmetal_terms))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"normalize_word_vectors ... done in 5.19752502441e-05 seconds.\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Element-wise mean of the metal word vectors (axis 0 runs over the words).\n",
"metal_mean = np.mean(metal_vecs, axis=0)\n",
"# normalize_word_vectors is called with a dict here, so the single mean vector\n",
"# is wrapped under a throwaway key and read back out after normalizing.\n",
"metal_mean_wrapped = normalize_word_vectors({'metal_af': metal_mean})\n",
"metal_mean_n = metal_mean_wrapped['metal_af']\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('soul', 0.63606858),\n",
" ('demons', 0.60098231),\n",
" ('hell', 0.58362621),\n",
" ('eateth', 0.54645419),\n",
" ('souls', 0.54323936),\n",
" ('beast', 0.53437954),\n",
" ('looketh', 0.53242099),\n",
" ('eternity', 0.52865142),\n",
" ('breathe', 0.52801883),\n",
" ('god', 0.52718186)]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"find_closest_words(vecs_n, metal_mean_n)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"normalize_word_vectors ... done in 3.79085540771e-05 seconds.\n"
]
}
],
"source": [
"# Element-wise mean of the unmetal word vectors (axis 0 runs over the words).\n",
"unmetal_mean = np.mean(unmetal_vecs, axis=0)\n",
"# Wrap the single mean vector in a dict for normalize_word_vectors, then read\n",
"# the normalized vector back out under the same throwaway key.\n",
"unmetal_mean_wrapped = normalize_word_vectors({'unmetal_af': unmetal_mean})\n",
"unmetal_mean_n = unmetal_mean_wrapped['unmetal_af']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('chairman', 0.5799759),\n",
" ('noted', 0.55365336),\n",
" ('committee', 0.54178369),\n",
" ('said', 0.53903079),\n",
" ('secretary', 0.52612215),\n",
" ('indicated', 0.50970483),\n",
" ('acknowledged', 0.49876261),\n",
" ('stressed', 0.48518729),\n",
" ('emphasized', 0.46537143),\n",
" ('committees', 0.46324104)]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"find_closest_words(vecs_n, unmetal_mean_n)"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" --- less metal --- \n",
"concerned\treceptive\tinvolved\tsaid\tstressed\n",
"concerned\treceptive\tinvolved\tsaid\tamenable\n",
"concerned\treceptive\tinvolved\tamenable\tconsidering\tactively\n",
"concerned\treceptive\tinvolved\tamenable\taverse\tactively\tconsider\n",
"receptive\tconcerned\tinvolved\taverse\tamenable\tintrigued\tactively\n",
"receptive\tinvolved\tconcerned\taverse\tamenable\tintrigued\tconsider\n",
"involved\treceptive\tconcerned\taverse\tintrigued\tamenable\tuninterested\n",
"involved\treceptive\taverse\tconcerned\tintrigued\tamenable\tuninterested\n",
"involved\treceptive\taverse\tintrigued\tconcerned\tamenable\tuninterested\n",
"involved\taverse\tintrigued\treceptive\tuninterested\tamenable\tconcerned\n",
"interested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\n",
"averse\tinvolved\tintrigued\treceptive\tuninterested\tenamored\tamenable\n",
"intrigued\taverse\tinvolved\tenamored\tuninterested\treceptive\tdesirous\n",
"intrigued\taverse\tenamored\tdesirous\tuninterested\tinvolved\treceptive\n",
"intrigued\taverse\tenamored\tdesirous\tuninterested\tfond\tinvolved\n",
"intrigued\tenamored\taverse\tdesirous\tfond\tlove\tdreaming\n",
"love\tintrigued\tenamored\tdreaming\tloves\tdesirous\taverse\n",
"love\tmad\tdreaming\tloves\tintrigued\thell\n",
"love\thell\tmad\tloves\tdreaming\n",
"hell\tlove\tmad\tloves\n",
"hell\tlove\tmad\twarn't\n",
" --- more metal ---\n"
]
}
],
"source": [
"# Print rows of do_word_maths results for `word` combined with the unmetal\n",
"# list (rows under the \"less metal\" banner) and with the metal list (rows above\n",
"# the \"more metal\" banner), changing the list weight by `inc` per row.\n",
"# Every print(...) takes a single argument, so the cell prints the same text on\n",
"# the Python 2 kernel and is also valid Python 3.\n",
"word = 'interested'\n",
"inc = 0.03  # weight added per step to every word of the list\n",
"\n",
"def wordtable(w):\n",
"    \"\"\"Return the first item of each tuple in `w`, joined by tabs.\"\"\"\n",
"    return \"\\t\".join([a[0] for a in w])\n",
"\n",
"def nudged_row(step, word_list):\n",
"    \"\"\"Tab-joined do_word_maths result for (1, word) plus (inc*step, w) for each w in word_list.\"\"\"\n",
"    terms = [(1, word)] + [(inc * step, w) for w in word_list]\n",
"    return wordtable(do_word_maths(vecs, vecs_n, terms, top_k=8))\n",
"\n",
"print(\" --- less metal --- \")\n",
"for f in range(10, 0, -1):\n",
"    print(nudged_row(f, unmetal_words))\n",
"# Centre row: the unmodified word repeated 10 times.\n",
"print(wordtable([(word,)] * 10))\n",
"# NOTE(review): this side runs steps 0..9 while the side above runs 10..1;\n",
"# kept as in the original so the saved output still matches.\n",
"for f in range(0, 10):\n",
"    print(nudged_row(f, metal_words))\n",
"print(\" --- more metal ---\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment