moonmilk/ofxMSAWord2Vec playground.ipynb

## ofxMSAWord2Vec playground.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# messing with ofxMSAWord2Vec\n",
    "from https://github.com/memo/ofxMSAWord2Vec\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load_word_vectors_bin : /Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin  ... \n",
      "num_words: 53084/53084\n",
      "num_dims: 300\n",
      "done in 7.57633709908 seconds.\n",
      "------------------------------------------------------------\n",
      "normalize_word_vectors ... done in 0.50585103035 seconds.\n"
     ]
    }
   ],
   "source": [
    "from word2vec_utils import *\n",
    "vecs = load_word_vectors_bin('/Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin')\n",
    "vecs_n = normalize_word_vectors(vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('dogs', 0.86804897),\n",
       " ('puppy', 0.81064284),\n",
       " ('cat', 0.7609458),\n",
       " ('beagle', 0.74186218),\n",
       " ('pup', 0.74069107),\n",
       " ('chihuahua', 0.71739173),\n",
       " ('pet', 0.71647859),\n",
       " ('canine', 0.69182897),\n",
       " ('collie', 0.67144096),\n",
       " ('kitten', 0.66598809)]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_closest_words(vecs_n, \"dog\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('papillon', 0.69635022),\n",
       " ('chihuahua', 0.6762343),\n",
       " ('dalmatian', 0.65920705),\n",
       " ('pug', 0.64561403),\n",
       " ('puppy', 0.64243448),\n",
       " ('labrador', 0.63804096),\n",
       " ('mastiff', 0.62263489),\n",
       " ('poodle', 0.62242281),\n",
       " ('beagle', 0.62123823),\n",
       " ('alsatian', 0.6157546)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_closest_words(vecs_n, \"pomeranian\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('pomeranian', 1.0),\n",
       " ('papillon', 0.69635022),\n",
       " ('chihuahua', 0.6762343),\n",
       " ('dalmatian', 0.65920705),\n",
       " ('pug', 0.64561403),\n",
       " ('puppy', 0.64243448),\n",
       " ('labrador', 0.63804096),\n",
       " ('mastiff', 0.62263489),\n",
       " ('poodle', 0.62242281),\n",
       " ('beagle', 0.62123823)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pom_n = vecs_n[\"pomeranian\"]\n",
    "find_closest_words(vecs_n, pom_n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "metal_words = \"burn cries veins eternity breathe beast gonna demons ashes soul\".split(\" \")\n",
    "unmetal_words = \"particularly indicated secretary committee university relatively noted approximately chairman employees\".split(\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "metal_vecs, metal_vecs_n = [vecs[word] for word in metal_words], [vecs_n[word] for word in metal_words]\n",
    "unmetal_vecs, unmetal_vecs_n = [vecs[word] for word in unmetal_words], [vecs_n[word] for word in unmetal_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([('hell', 0.58362615),\n",
       "  ('eateth', 0.54645419),\n",
       "  ('souls', 0.54323936),\n",
       "  ('looketh', 0.53242099),\n",
       "  ('god', 0.52718186)],\n",
       " [('said', 0.53903073),\n",
       "  ('acknowledged', 0.49876258),\n",
       "  ('stressed', 0.48518729),\n",
       "  ('emphasized', 0.46537137),\n",
       "  ('committees', 0.46324104)])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find the words closest to the average of each word list (each of the 10 words weighted 0.1)\n",
    "do_word_maths(vecs, vecs_n, [(0.1, word) for word in metal_words]),do_word_maths(vecs, vecs_n, [(0.1, word) for word in unmetal_words])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalize_word_vectors ... done in 5.19752502441e-05 seconds.\n"
     ]
    }
   ],
   "source": [
    "# compute the mean of the raw (pre-normalization) metal word vectors, then normalize it\n",
    "import numpy as np\n",
    "metal_mean = np.mean(metal_vecs, axis=0)\n",
    "metal_mean_n = normalize_word_vectors({'metal_af':metal_mean})['metal_af']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('soul', 0.63606858),\n",
       " ('demons', 0.60098231),\n",
       " ('hell', 0.58362621),\n",
       " ('eateth', 0.54645419),\n",
       " ('souls', 0.54323936),\n",
       " ('beast', 0.53437954),\n",
       " ('looketh', 0.53242099),\n",
       " ('eternity', 0.52865142),\n",
       " ('breathe', 0.52801883),\n",
       " ('god', 0.52718186)]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_closest_words(vecs_n, metal_mean_n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalize_word_vectors ... done in 3.79085540771e-05 seconds.\n"
     ]
    }
   ],
   "source": [
    "unmetal_mean = np.mean(unmetal_vecs, axis=0)\n",
    "unmetal_mean_n = normalize_word_vectors({'unmetal_af':unmetal_mean})['unmetal_af']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('chairman', 0.5799759),\n",
       " ('noted', 0.55365336),\n",
       " ('committee', 0.54178369),\n",
       " ('said', 0.53903079),\n",
       " ('secretary', 0.52612215),\n",
       " ('indicated', 0.50970483),\n",
       " ('acknowledged', 0.49876261),\n",
       " ('stressed', 0.48518729),\n",
       " ('emphasized', 0.46537143),\n",
       " ('committees', 0.46324104)]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_closest_words(vecs_n, unmetal_mean_n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " --- less metal --- \n",
      "concerned\treceptive\tinvolved\tsaid\tstressed\n",
      "concerned\treceptive\tinvolved\tsaid\tamenable\n",
      "concerned\treceptive\tinvolved\tamenable\tconsidering\tactively\n",
      "concerned\treceptive\tinvolved\tamenable\taverse\tactively\tconsider\n",
      "receptive\tconcerned\tinvolved\taverse\tamenable\tintrigued\tactively\n",
      "receptive\tinvolved\tconcerned\taverse\tamenable\tintrigued\tconsider\n",
      "involved\treceptive\tconcerned\taverse\tintrigued\tamenable\tuninterested\n",
      "involved\treceptive\taverse\tconcerned\tintrigued\tamenable\tuninterested\n",
      "involved\treceptive\taverse\tintrigued\tconcerned\tamenable\tuninterested\n",
      "involved\taverse\tintrigued\treceptive\tuninterested\tamenable\tconcerned\n",
      "interested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\n",
      "averse\tinvolved\tintrigued\treceptive\tuninterested\tenamored\tamenable\n",
      "intrigued\taverse\tinvolved\tenamored\tuninterested\treceptive\tdesirous\n",
      "intrigued\taverse\tenamored\tdesirous\tuninterested\tinvolved\treceptive\n",
      "intrigued\taverse\tenamored\tdesirous\tuninterested\tfond\tinvolved\n",
      "intrigued\tenamored\taverse\tdesirous\tfond\tlove\tdreaming\n",
      "love\tintrigued\tenamored\tdreaming\tloves\tdesirous\taverse\n",
      "love\tmad\tdreaming\tloves\tintrigued\thell\n",
      "love\thell\tmad\tloves\tdreaming\n",
      "hell\tlove\tmad\tloves\n",
      "hell\tlove\tmad\twarn't\n",
      " --- more metal ---\n"
     ]
    }
   ],
   "source": [
    "word = 'interested'\n",
    "inc = 0.03\n",
    "print \" --- less metal --- \"\n",
    "def wordtable(w):\n",
    "    return \"\\t\".join([a[0] for a in w])\n",
    "for f in range(10,0,-1):\n",
    "    print wordtable(do_word_maths(vecs, vecs_n, [(1, word)] +[(inc*f, w) for w in unmetal_words], top_k=8))\n",
    "print wordtable([(word,) for i in range(0,10)])\n",
    "for f in range(0,10):\n",
    "    print wordtable(do_word_maths(vecs, vecs_n, [(1, word)] +[(inc*f, w) for w in metal_words], top_k=8))\n",
    "print \" --- more metal ---\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [

   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# messing with ofxMSAWord2Vec\n",
	"from https://github.com/memo/ofxMSAWord2Vec\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"load_word_vectors_bin : /Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin ... \n",
	"num_words: 53084/53084\n",
	"num_dims: 300\n",
	"done in 7.57633709908 seconds.\n",
	"------------------------------------------------------------\n",
	"normalize_word_vectors ... done in 0.50585103035 seconds.\n"
	]
	}
	],
	"source": [
	"from word2vec_utils import *\n",
	"vecs = load_word_vectors_bin('/Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin')\n",
	"vecs_n = normalize_word_vectors(vecs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('dogs', 0.86804897),\n",
	" ('puppy', 0.81064284),\n",
	" ('cat', 0.7609458),\n",
	" ('beagle', 0.74186218),\n",
	" ('pup', 0.74069107),\n",
	" ('chihuahua', 0.71739173),\n",
	" ('pet', 0.71647859),\n",
	" ('canine', 0.69182897),\n",
	" ('collie', 0.67144096),\n",
	" ('kitten', 0.66598809)]"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"find_closest_words(vecs_n, \"dog\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('papillon', 0.69635022),\n",
	" ('chihuahua', 0.6762343),\n",
	" ('dalmatian', 0.65920705),\n",
	" ('pug', 0.64561403),\n",
	" ('puppy', 0.64243448),\n",
	" ('labrador', 0.63804096),\n",
	" ('mastiff', 0.62263489),\n",
	" ('poodle', 0.62242281),\n",
	" ('beagle', 0.62123823),\n",
	" ('alsatian', 0.6157546)]"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"find_closest_words(vecs_n, \"pomeranian\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('pomeranian', 1.0),\n",
	" ('papillon', 0.69635022),\n",
	" ('chihuahua', 0.6762343),\n",
	" ('dalmatian', 0.65920705),\n",
	" ('pug', 0.64561403),\n",
	" ('puppy', 0.64243448),\n",
	" ('labrador', 0.63804096),\n",
	" ('mastiff', 0.62263489),\n",
	" ('poodle', 0.62242281),\n",
	" ('beagle', 0.62123823)]"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pom_n = vecs_n[\"pomeranian\"]\n",
	"find_closest_words(vecs_n, pom_n)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"metal_words = \"burn cries veins eternity breathe beast gonna demons ashes soul\".split(\" \")\n",
	"unmetal_words = \"particularly indicated secretary committee university relatively noted approximately chairman employees\".split(\" \")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"metal_vecs, metal_vecs_n = [vecs[word] for word in metal_words], [vecs_n[word] for word in metal_words]\n",
	"unmetal_vecs, unmetal_vecs_n = [vecs[word] for word in unmetal_words], [vecs_n[word] for word in unmetal_words]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"([('hell', 0.58362615),\n",
	" ('eateth', 0.54645419),\n",
	" ('souls', 0.54323936),\n",
	" ('looketh', 0.53242099),\n",
	" ('god', 0.52718186)],\n",
	" [('said', 0.53903073),\n",
	" ('acknowledged', 0.49876258),\n",
	" ('stressed', 0.48518729),\n",
	" ('emphasized', 0.46537137),\n",
	" ('committees', 0.46324104)])"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# find the words closest to the average of each word list (each of the 10 words weighted 0.1)\n",
	"do_word_maths(vecs, vecs_n, [(0.1, word) for word in metal_words]),do_word_maths(vecs, vecs_n, [(0.1, word) for word in unmetal_words])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"normalize_word_vectors ... done in 5.19752502441e-05 seconds.\n"
	]
	}
	],
	"source": [
	"# compute the mean of the raw (pre-normalization) metal word vectors, then normalize it\n",
	"import numpy as np\n",
	"metal_mean = np.mean(metal_vecs, axis=0)\n",
	"metal_mean_n = normalize_word_vectors({'metal_af':metal_mean})['metal_af']\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('soul', 0.63606858),\n",
	" ('demons', 0.60098231),\n",
	" ('hell', 0.58362621),\n",
	" ('eateth', 0.54645419),\n",
	" ('souls', 0.54323936),\n",
	" ('beast', 0.53437954),\n",
	" ('looketh', 0.53242099),\n",
	" ('eternity', 0.52865142),\n",
	" ('breathe', 0.52801883),\n",
	" ('god', 0.52718186)]"
	]
	},
	"execution_count": 43,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"find_closest_words(vecs_n, metal_mean_n)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"normalize_word_vectors ... done in 3.79085540771e-05 seconds.\n"
	]
	}
	],
	"source": [
	"unmetal_mean = np.mean(unmetal_vecs, axis=0)\n",
	"unmetal_mean_n = normalize_word_vectors({'unmetal_af':unmetal_mean})['unmetal_af']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('chairman', 0.5799759),\n",
	" ('noted', 0.55365336),\n",
	" ('committee', 0.54178369),\n",
	" ('said', 0.53903079),\n",
	" ('secretary', 0.52612215),\n",
	" ('indicated', 0.50970483),\n",
	" ('acknowledged', 0.49876261),\n",
	" ('stressed', 0.48518729),\n",
	" ('emphasized', 0.46537143),\n",
	" ('committees', 0.46324104)]"
	]
	},
	"execution_count": 45,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"find_closest_words(vecs_n, unmetal_mean_n)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 198,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" --- less metal --- \n",
	"concerned\treceptive\tinvolved\tsaid\tstressed\n",
	"concerned\treceptive\tinvolved\tsaid\tamenable\n",
	"concerned\treceptive\tinvolved\tamenable\tconsidering\tactively\n",
	"concerned\treceptive\tinvolved\tamenable\taverse\tactively\tconsider\n",
	"receptive\tconcerned\tinvolved\taverse\tamenable\tintrigued\tactively\n",
	"receptive\tinvolved\tconcerned\taverse\tamenable\tintrigued\tconsider\n",
	"involved\treceptive\tconcerned\taverse\tintrigued\tamenable\tuninterested\n",
	"involved\treceptive\taverse\tconcerned\tintrigued\tamenable\tuninterested\n",
	"involved\treceptive\taverse\tintrigued\tconcerned\tamenable\tuninterested\n",
	"involved\taverse\tintrigued\treceptive\tuninterested\tamenable\tconcerned\n",
	"interested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\n",
	"averse\tinvolved\tintrigued\treceptive\tuninterested\tenamored\tamenable\n",
	"intrigued\taverse\tinvolved\tenamored\tuninterested\treceptive\tdesirous\n",
	"intrigued\taverse\tenamored\tdesirous\tuninterested\tinvolved\treceptive\n",
	"intrigued\taverse\tenamored\tdesirous\tuninterested\tfond\tinvolved\n",
	"intrigued\tenamored\taverse\tdesirous\tfond\tlove\tdreaming\n",
	"love\tintrigued\tenamored\tdreaming\tloves\tdesirous\taverse\n",
	"love\tmad\tdreaming\tloves\tintrigued\thell\n",
	"love\thell\tmad\tloves\tdreaming\n",
	"hell\tlove\tmad\tloves\n",
	"hell\tlove\tmad\twarn't\n",
	" --- more metal ---\n"
	]
	}
	],
	"source": [
	"word = 'interested'\n",
	"inc = 0.03\n",
	"print \" --- less metal --- \"\n",
	"def wordtable(w):\n",
	" return \"\\t\".join([a[0] for a in w])\n",
	"for f in range(10,0,-1):\n",
	" print wordtable(do_word_maths(vecs, vecs_n, [(1, word)] +[(inc*f, w) for w in unmetal_words], top_k=8))\n",
	"print wordtable([(word,) for i in range(0,10)])\n",
	"for f in range(0,10):\n",
	" print wordtable(do_word_maths(vecs, vecs_n, [(1, word)] +[(inc*f, w) for w in metal_words], top_k=8))\n",
	"print \" --- more metal ---\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [

	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}