-
-
Save kudkudak/bd486180eba247a0bf1bad1621b0f260 to your computer and use it in GitHub Desktop.
10_08_feasibility.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Feasibility study\n", | |
"\n", | |
"* Fetch related sentences\n", | |
"* Use conceptnet (gzipped assertions)\n", | |
"* Use causal relations\n", | |
"* Use booktest " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import tqdm\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pylab as plt\n", | |
"import seaborn as sns\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db = \"/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv\"\n", | |
"db_en = \"/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"### Filter out non-eng relations (creates db_en file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"! grep /c/en/ /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv > /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"809\t/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! du -m /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"323\t/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! du -m /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"### Read concept net filtered down to en relations\n", | |
"C_en = pd.read_csv(db_en, sep=\"\\t\", header=None)\n", | |
"C_en.columns = ['whatever', 'rel', 'head', 'tail', 'props']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"C_en = C_en.sample(frac=1).reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 145, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter\n", | |
"C_en_counts = Counter(C_en.rel.head(10000))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 146, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['/r/ExternalURL', '/r/RelatedTo', '/r/Synonym', '/r/HasContext', '/r/SimilarTo', '/r/dbpedia/field', '/r/dbpedia/genus', '/r/dbpedia/occupation', '/r/dbpedia/knownFor', '/r/dbpedia/capital', '/r/dbpedia/genre']\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 10183595/10183595 [00:08<00:00, 1251159.04it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
## Filter out common and useless relations for us, which simplifies a lot of code later -
## dataset is just smaller
filter_rels = ['/r/ExternalURL', '/r/RelatedTo', '/r/Synonym', '/r/HasContext', '/r/SimilarTo']
filter_rels += [r for r in C_en_counts if 'dbpedia' in r]
print(filter_rels)  # parenthesized single-arg form prints identically under Python 2
C_en_rel = C_en.rel.values  # kept: later cells may still reference this array
# Vectorized filter (Series.isin) instead of a tqdm Python loop doing list-membership
# tests over ~10M rows.  Equivalent to the original label-based .loc selection because
# C_en's index is a RangeIndex after the earlier reset_index(drop=True).
C_en_filtered = C_en.loc[~C_en.rel.isin(filter_rels)]
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Analysis of conceptnet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 149, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Ended up with 1077940 relations\n" | |
] | |
} | |
], | |
"source": [ | |
"print \"Ended up with\", len(C_en_filtered), \"relations\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 167, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Counter({'/r/Antonym': 21456,\n", | |
" '/r/AtLocation': 28960,\n", | |
" '/r/CapableOf': 26354,\n", | |
" '/r/Causes': 17088,\n", | |
" '/r/CausesDesire': 4782,\n", | |
" '/r/CreatedBy': 267,\n", | |
" '/r/DefinedAs': 2710,\n", | |
" '/r/DerivedFrom': 126928,\n", | |
" '/r/Desires': 3728,\n", | |
" '/r/DistinctFrom': 3571,\n", | |
" '/r/Entails': 405,\n", | |
" '/r/EtymologicallyRelatedTo': 158706,\n", | |
" '/r/FormOf': 273813,\n", | |
" '/r/HasA': 7735,\n", | |
" '/r/HasFirstSubevent': 3381,\n", | |
" '/r/HasLastSubevent': 2890,\n", | |
" '/r/HasPrerequisite': 23493,\n", | |
" '/r/HasProperty': 13553,\n", | |
" '/r/HasSubevent': 25896,\n", | |
" '/r/InstanceOf': 1366,\n", | |
" '/r/IsA': 249839,\n", | |
" '/r/LocatedNear': 49,\n", | |
" '/r/MadeOf': 604,\n", | |
" '/r/MannerOf': 48,\n", | |
" '/r/MotivatedByGoal': 9807,\n", | |
" '/r/NotCapableOf': 517,\n", | |
" '/r/NotDesires': 3095,\n", | |
" '/r/NotHasProperty': 509,\n", | |
" '/r/PartOf': 12872,\n", | |
" '/r/ReceivesAction': 8383,\n", | |
" '/r/SymbolOf': 4,\n", | |
" '/r/UsedFor': 42443,\n", | |
" '/r/dbpedia/influencedBy': 1261,\n", | |
" '/r/dbpedia/language': 885,\n", | |
" '/r/dbpedia/leader': 80,\n", | |
" '/r/dbpedia/product': 462})" | |
] | |
}, | |
"execution_count": 167, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from collections import Counter\n", | |
"C_en_filtered_counts = Counter(C_en_filtered.rel)\n", | |
"C_en_filtered_counts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Get data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"corpus_f_name = \"/data/lisa/exp/jastrzes/l2lwe/data/booktest-gut/train.14M+.txt\"\n", | |
"# corpus_f_name = \"/data/lisa/exp/jastrzes/l2lwe/data/CBTest/data/cbt_train.txt\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"### Open question: how to build a better index. Probably a search engine makes most sense\n", | |
"N_CORPUS = int(1.5*10**8) # 1 B words is around 125MB " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"from collections import defaultdict\n", | |
"word_index = defaultdict(list) # Build an index word -> ids in corpus\n", | |
"word_count = defaultdict(int)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"THRESHOLD_MAX_WORD_COUNT_INDEX = N_CORPUS * 10**-3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"THRESHOLD_WORD_COUNT_INDEX = N_CORPUS * 4 * 10**-7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
def read_word_by_word(input_file):
    """Lazily yield space-separated tokens from ``input_file``.

    The file is consumed in 10 KB chunks so arbitrarily large corpora can be
    streamed without loading them whole.  Tokens are split on single spaces
    only: embedded newlines stay inside tokens, except that the very last
    token has trailing newlines stripped before being yielded.
    """
    buffered = ''
    while True:
        token, sep, buffered = buffered.partition(' ')
        if sep:
            # A complete token was sitting in the buffer.
            yield token
            continue
        # Buffer holds at most a partial token -- pull in more data.
        chunk = input_file.read(10000)
        if not chunk:
            # End of file: emit whatever is left and stop.
            yield token.rstrip('\n')
            return
        buffered = token + chunk
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from itertools import islice\n", | |
"corpus = list(islice(read_word_by_word(open(corpus_f_name)), N_CORPUS)) # ~2m for 100M" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"word_count = Counter(corpus) # Quite fast!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 150000000/150000000 [05:00<00:00, 499554.73it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"## TODO: Use set, not list\n", | |
"word_index = defaultdict(list)\n", | |
"for word_id, word in tqdm.tqdm(enumerate(corpus), total=len(corpus)):\n", | |
" if word_count[word] > THRESHOLD_WORD_COUNT_INDEX: # We care about common words really\n", | |
" word_index[word].append(word_id)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 150000000/150000000 [05:05<00:00, 491025.18it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"id_to_sid = [0]*N_CORPUS\n", | |
"cur_sid = 0\n", | |
"sentences = []\n", | |
"cur_sentence = []\n", | |
"for w_id, w in tqdm.tqdm(enumerate(corpus), total=len(corpus)):\n", | |
" id_to_sid[w_id] = cur_sid\n", | |
" cur_sentence.append(w)\n", | |
" if w.startswith(\".\"):\n", | |
" cur_sid += 1\n", | |
" sentences.append(\" \".join(cur_sentence))\n", | |
" cur_sentence = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"id_to_sid = np.array(id_to_sid)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"vocab_corpus = set(corpus)\n", | |
"print len(vocab_corpus)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Useful functions " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
def find_indexes(word_list, w):
    """Return the corpus positions at which word ``w`` occurs.

    NOTE: ``word_list`` is accepted for interface compatibility but ignored;
    lookups go through the prebuilt global ``word_index``, which only covers
    sufficiently frequent words -- rare/unindexed words yield an empty list.
    """
    positions = word_index.get(w, [])
    return positions
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
def print_random_joint_sentences(w1, w2, N=5, window_size=8):
    """Return report lines with up to ``N`` corpus sentences containing both words.

    Relies on the notebook globals ``id_to_sid``, ``corpus`` and ``sentences``
    plus the ``find_indexes`` helper.  ``window_size`` is accepted for interface
    symmetry with ``print_random_context`` but is not used here.
    Returns a single apology line when the words never co-occur in a sentence.
    """
    occurrences1 = set(id_to_sid[find_indexes(corpus, w1)])
    occurrences2 = set(id_to_sid[find_indexes(corpus, w2)])
    shared_sids = list(occurrences1 & occurrences2)
    joint_sentences = [sentences[sid] for sid in shared_sids]

    if len(joint_sentences) == 0:
        return ["Sorry - no contexts for " + w1 + " " + w2]

    # BUG FIX: ``out`` was never initialized here; the cell only ran because a
    # stale ``out`` global leaked from another cell.  On a fresh kernel
    # (Restart & Run All) the original raised NameError.
    out = []
    out.append("w1={}, w2={}".format(w1, w2))
    chosen = np.random.choice(len(joint_sentences), min(N, len(joint_sentences)), replace=False)
    out.append("\n".join("* " + joint_sentences[s].replace("\n", "") for s in chosen))
    return out
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
def print_random_context(w1, w2, N=5, window_size=8):
    """Return report lines showing up to ``N`` random corpus windows per word.

    Each window spans ``window_size`` tokens on either side of an occurrence.
    If either word has no indexed occurrence, a single apology line is
    returned instead.  Uses the notebook globals ``corpus`` (via
    ``find_indexes``) and numpy's global RNG.
    """
    contexts = {}
    # Gather windows for both words first (keeps the RNG call order stable).
    for word in (w1, w2):
        positions = find_indexes(corpus, word)
        if len(positions) == 0:
            return ['Sorry no occurences for ' + word]
        chosen = np.random.choice(len(positions), min(N, len(positions)), replace=False)
        contexts[word] = [
            corpus[max(0, positions[i] - window_size):positions[i] + window_size]
            for i in chosen
        ]

    # Format: header, then "<word> contexts:" followed by its bulleted windows.
    out = ["w1={}, w2={}".format(w1, w2)]
    for word in (w1, w2):
        out.append("{} contexts:".format(word))
        snippets = ["* " + " ".join(s).replace("\n", "") for s in contexts[word]]
        out.append("\n".join(snippets))
    return out
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 163, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
## TODO : Nicer to filter out assertions
def random_pair_words(corpus, vocab_corpus, corpus_count, assertions, rng, rel=None,
                      max_count=None):
    """Sample one ConceptNet edge as a ``(head_word, relation, tail_word)`` triple.

    Draws random rows from ``assertions`` (a DataFrame with 'head'/'rel'/'tail'
    columns) until one survives all filters, giving up after 1000 attempts.

    :param corpus: unused; kept for interface compatibility with callers.
    :param vocab_corpus: set of words allowed as head/tail.
    :param corpus_count: word -> count mapping used to reject overly common words.
    :param assertions: DataFrame of ConceptNet assertions.
    :param rng: numpy RandomState used for row sampling.
    :param rel: optional relation URI; restricts sampling to that relation.
    :param max_count: frequency cap; defaults to the notebook-global
        THRESHOLD_MAX_WORD_COUNT_INDEX (new parameter, backward compatible).
    :return: (head_word, relation, tail_word) tuple, or None if nothing found.
    """
    if max_count is None:
        max_count = THRESHOLD_MAX_WORD_COUNT_INDEX
    if rel:
        assertions = assertions.query("rel=='{}'".format(rel))
    for _ in range(1000):
        edge = assertions.iloc[rng.randint(len(assertions))]
        head, edge_rel, tail = edge['head'], edge['rel'], edge['tail']

        # Strip part-of-speech suffixes like /c/en/run/v -> /c/en/run.
        if head.endswith("/v") or head.endswith("/n"):
            head = head[0:-2]
        if tail.endswith("/v") or tail.endswith("/n"):
            tail = tail[0:-2]

        # 1. Filter out grammatical variants (one URI a prefix of the other)
        if head.startswith(tail) or tail.startswith(head):
            continue

        # 2. Filter out multi-word concepts
        if "_" in head or "_" in tail:
            continue

        # Reject words that are too common to be interesting.
        if corpus_count[head.split("/")[-1]] > max_count \
                or corpus_count[tail.split("/")[-1]] > max_count:
            continue

        # 3. Only English.
        # BUG FIX: the head check previously used "/c/en" (no trailing slash),
        # which also matched other language codes starting with "en",
        # e.g. /c/enm/ (Middle English).  Now consistent with the tail check.
        if not head.startswith("/c/en/") or not tail.startswith("/c/en/"):
            continue

        # 4. Only words occurring in the corpus vocabulary
        if head.split("/")[-1] not in vocab_corpus or tail.split("/")[-1] not in vocab_corpus:
            continue

        # 5. Only non-recursive edges
        if head == tail:
            continue

        return head.split("/")[-1], edge_rel, tail.split("/")[-1]
    return None
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# \"Random\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"assertions = C_en_filtered" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 200/200 [00:00<00:00, 249.04it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"words = []\n", | |
"import tqdm\n", | |
"N = 200\n", | |
"rng = np.random.RandomState(777)\n", | |
"for i in tqdm.tqdm(range(N), total=N):\n", | |
" words.append(random_pair_words(corpus=corpus, corpus_count=word_count,\n", | |
" assertions=C_en_filtered, vocab_corpus=vocab_corpus, rng=rng))\n", | |
"\n", | |
"with open(\"10_08_feasibility_study.md\", \"w\") as f:\n", | |
" for w in tqdm.tqdm(words, total=len(words)):\n", | |
" out = []\n", | |
" out.append(\"## \" + str(w))\n", | |
" out.append(\"------\\n\")\n", | |
" out.append(\"\\n ### joint sentences contexts \\n\")\n", | |
" out += print_random_joint_sentences(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" out.append( \"\\n ### random contexts \\n\" )\n", | |
" out += print_random_context(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" \n", | |
" f.write(\"\\n\".join(out))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Each relation 5 examples" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 164, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 33/33 [00:25<00:00, 1.96it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"words = []\n", | |
"import tqdm\n", | |
"N = 5\n", | |
"rng = np.random.RandomState(777)\n", | |
"for r in tqdm.tqdm(C_en_filtered_counts, total=len(C_en_filtered_counts)):\n", | |
" for i in range(N):\n", | |
" words.append(random_pair_words(corpus=corpus, corpus_count=word_count,\n", | |
" assertions=C_en_filtered, vocab_corpus=vocab_corpus, rng=rng, rel=r))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 165, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 165/165 [00:01<00:00, 102.66it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"with open(\"13_08_feasibility_study.md\", \"w\") as f:\n", | |
" for w in tqdm.tqdm(words, total=len(words)):\n", | |
" \n", | |
" if not w:\n", | |
" continue\n", | |
" \n", | |
" out = []\n", | |
" out.append(\"## \" + str(w))\n", | |
" out.append(\"------\\n\")\n", | |
" out.append(\"\\n ### joint sentences contexts \\n\")\n", | |
" out += print_random_joint_sentences(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" out.append( \"\\n ### random contexts \\n\" )\n", | |
" out += print_random_context(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" \n", | |
" f.write(\"\\n\".join(out))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1\t10_08_feasibility_study.md\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! du -m 10_08_feasibility_study.md" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Hand picked" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"words = [\n", | |
" ('lemon', 'sour'), # has property\n", | |
" ('door', 'open'), # is used\n", | |
" ('hot', 'cold'), # antonym\n", | |
" ('pencil', 'write'), #\n", | |
" ('money', 'saving'), # is used, # TODO: what to do with 2 words?\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"## ('lemon', 'sour')\n", | |
"-----\n", | |
"\n", | |
"Sorry - no contexts for lemon sour\n", | |
"## ('door', 'open')\n", | |
"-----\n", | |
"\n", | |
"w1=door, w2=open\n", | |
"* Long before he could open the door he who was descending would be with him at the bottom of the stairs .9\n", | |
"* Through queer short cuts that terribly bewildered the way , the porter led him to the house , and pushing the door open , went up two flights of stone stairs and knocked at a door on the landing .21\n", | |
"* But Tibbie resumed : `` Ye maunna think , hooever , 'cause sic longin ' thouchts come ower me , that I gang aboot the hoose girnin ' and compleenin ' that I canna open the door and win oot .11\n", | |
"* A big car was standing by the kerb and one of the attendants was holding open the door for a girl dressed in black .11\n", | |
"* ''20 `` There 's mair vertues i ' the Bible nor courage , Thamas , '' retorted James , holding the outer door open to throw the sentence in , and shutting it instantly to escape with the last word .21\n", | |
"## ('hot', 'cold')\n", | |
"-----\n", | |
"\n", | |
"w1=hot, w2=cold\n", | |
"* On the other hand , taking Hyngham -LRB- or Ingham -RRB- as a place-name , we go , as the kiddies say , from `` cold '' to `` hot '' at once .17\n", | |
"* '14 As he read the detailed accounts Hugo knew , perhaps for the first time in his life , what it was ` to go hot and cold all over .\n", | |
"* There are both cold and hot waters and these are sweet and agreeable .18\n", | |
"* On the other hand , taking Hyngham -LRB- or Ingham -RRB- as a place-name , we go , as the kiddies say , from `` cold '' to `` hot '' at once .19\n", | |
"* '14 As he read the detailed accounts Hugo knew , perhaps for the first time in his life , what it was ` to go hot and cold all over .\n", | |
"## ('pencil', 'write')\n", | |
"-----\n", | |
"\n", | |
"w1=pencil, w2=write\n", | |
"* '6 ` For candles ? '7 said Fisher , confused ; ` how many ?8 -- what sort ? '9 ` Stupidity ! '10 exclaimed Archer , ` you are a pretty fellow at a dead lift !11 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !12 Well , what are you fumbling for ? '13 ` For money ! '14 said Fisher , colouring .15\n", | |
"* '13 ` For candles ? '14 said Fisher , confused ; ` how many ?15 -- what sort ? '16 ` Stupidity ! '17 exclaimed Archer , ` you are a pretty fellow at a dead lift !18 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !19 Well , what are you fumbling for ? '20 ` For money ! '21 said XXXXX , colouring .\tFisher\t\tbuns|design|ambassador|dark|pray|Archer|Fisher|expense|candles|schoolroom1\n", | |
"* exclaimed Archer , ` you are a pretty fellow at a dead lift !2 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !3 Well , what are you fumbling for ? '4 ` For money ! '5 said Fisher , colouring .6\n", | |
"* `` A horse , a dog , a fire , a man -- a St. Bernard dog saving a boy -- a soldier -- I think a soldier would suit Cyril ! ''3 She stared through the bush to the red road consideringly , holding her pencil ready to write .4\n", | |
"* '6 ` For candles ? '7 said Fisher , confused ; ` how many ?8 -- what sort ? '9 ` Stupidity ! '10 exclaimed Archer , ` you are a pretty fellow at a dead lift !11 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !12 Well , what are you fumbling for ? '13 ` For money ! '14 said Fisher , colouring .15\n", | |
"## ('money', 'saving')\n", | |
"-----\n", | |
"\n", | |
"w1=money, w2=saving\n", | |
"* You will find that their prices are reasonable , considering the difference in cost of transportation at any point you might decide to purchase from in the United States ; in fact it is the saving of money to buy in Juneau .5\n", | |
"* He also carried a more questionable scheme for the payment of military , naval , and civil pensions , which then amounted to L4 ,900,000 a year , but were falling in rapidly ; the money required for this purpose was to be borrowed by trustees , and was to be repaid in the course of forty-five years at the rate of L2 ,800,000 a year ; in this way an immediate saving of about L2 ,000,000 annually was effected at the cost , however , of the next generation .5\n", | |
"* I know what saving of money its use has meant to me .21\n", | |
"* I know what saving of money its use has meant to me .19\n", | |
"* He also carried a more questionable scheme for the payment of military , naval , and civil pensions , which then amounted to L4 ,900,000 a year , but were falling in rapidly ; the money required for this purpose was to be borrowed by trustees , and was to be repaid in the course of forty-five years at the rate of L2 ,800,000 a year ; in this way an immediate saving of about L2 ,000,000 annually was effected at the cost , however , of the next generation .4\n" | |
] | |
} | |
], | |
"source": [ | |
"for w in words:\n", | |
" print \"## \" + str(w)\n", | |
" print \"-----\\n\"\n", | |
" print_random_joint_sentences(*w)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"## ('lemon', 'sour')\n", | |
"-----\n", | |
"\n", | |
"w1=lemon, w2=sour\n", | |
"lemon contexts:\n", | |
"* the tent and Jakie 's breath reeked of lemon and vanilla .\tJakie\t\t_|woes|sleep|gratitude|charge|way|Jakie|pans|coffee|Jack1 Jakie wept , this\n", | |
"* -LRB- Hawaiian hau -RRB- , or `` the lemon hibiscus '' -- the `` argenta ,\n", | |
"* oranges have ceased to be gathered and the lemon has been squeezed .5 Occasionally there is\n", | |
"* so they try macaroon and pistachio instead of lemon and vanilla .14 Fresh people are better\n", | |
"* oranges have ceased to be gathered and the lemon has been squeezed .17 Occasionally there is\n", | |
"sour contexts:\n", | |
"* I doubt pastry from Languedoc would turn me sour ; and liking monks little enough as\n", | |
"* . ''8 Sejanus answered this reminder with a sour and peculiar smile .9 `` Good ,\n", | |
"* have been the glasses that had changed the sour old woman into a smiling fairy ;\n", | |
"* 's head ! ''15 `` A set of sour knaves , '' he cried , ``\n", | |
"* hee-hawed '' , and said something about `` sour grapes '' .15 She was jolly smartly\n", | |
"## ('door', 'open')\n", | |
"-----\n", | |
"\n", | |
"w1=door, w2=open\n", | |
"door contexts:\n", | |
"* to the door .12 Mr. Carson opened the door and mildly asked to know the object\n", | |
"* room level with the ground .12 The main door is in the left wall .13 Along\n", | |
"* cheers and shrieks , someone fell against the door with a soft noise , and there\n", | |
"* rang the bell , where , after the door was opened , she was shown into\n", | |
"* under the misletoe , which was over the door , and Dick shook hands with Mrs.\n", | |
"open contexts:\n", | |
"* his efforts to holding Lisbon , and keeping open his line of communication with Spain .11\n", | |
"* .17 It was plain that Canada was an open book to him .18 `` The long\n", | |
"* there . ''5 Loughs Conn and Cullin are open free fishing , and on the preserves\n", | |
"* left the sentence unfinished .9 `` Let us open it , '' said Cleggett .10 ``\n", | |
"* valley .13 Hals , whose veracity is much open to doubt , states that Militon had\n", | |
"## ('hot', 'cold')\n", | |
"-----\n", | |
"\n", | |
"w1=hot, w2=cold\n", | |
"hot contexts:\n", | |
"* But I , Eustace , love her so hot that I have fear of myself .16\n", | |
"* -- '' Salis was too late , for hot , excited , and strung up hard\n", | |
"* bofe sides , an ' dey had it hot an ' heavy , nip an '\n", | |
"* .8 For hills and mountains are created in hot countries , whether they are situated by\n", | |
"* delving , building , toiling through the long hot summer 's day , in rivalry of\n", | |
"cold contexts:\n", | |
"* days of old , when the spring with cold Had , brightened his branches gray ,\n", | |
"* I am quite well .10 It is a cold morning , and I shivered a little\n", | |
"* laughter came from the playground -- but a cold silence had come by the fiftieth .12\n", | |
"* is not enough that I Am sacrificed to cold state policy , A snare is laid\n", | |
"* his heart beat quickly and his flesh grow cold with a nervous trepidation -- just such\n", | |
"## ('pencil', 'write')\n", | |
"-----\n", | |
"\n", | |
"w1=pencil, w2=write\n", | |
"pencil contexts:\n", | |
"* I will . ''21 And Olive produced a pencil and paper with alacrity , and by\n", | |
"* the most finished that the bright and effective pencil of Edith could achieve .\tLady\t\tabstraction|Coningsby|Hellingsley|sky|Edith|aunt|Joseph|Lady|pencils|Wallinger1 ` We\n", | |
"* describe it , nor can the painter 's pencil .12 It continued for nearly half-an-hour ,\n", | |
"* know all about it . ''13 The paralytic pencil wavered and came to a full stop\n", | |
"* the most finished that the bright and effective pencil of Edith could achieve .11 If it\n", | |
"write contexts:\n", | |
"* Mountjoy , King-at-arms , who was there to write down the names , began to reason\n", | |
"* suit myself .18 One or two only will write with very little change from me .19\n", | |
"* been a hard tussle to get her to write the apology , and , but for\n", | |
"* bear them , or could she ?3 Better write without his knowledge .4 Then , on\n", | |
"* which Mr. Grey had agreed that she should write to him , he hesitated to open\n", | |
"## ('money', 'saving')\n", | |
"-----\n", | |
"\n", | |
"w1=money, w2=saving\n", | |
"money contexts:\n", | |
"* out of his pocket and began counting his money .6 There was a great deal of\n", | |
"* to the screen .\tDelamere\t\ttime|places|Delamere|Tom|Gus|Augustus|lem|Sandy|ha|Davidson1 `` It 's Confederate money . ''2 `` So it is ,\n", | |
"* that ship-money was not a tax , but money paid in commutation of the duty of\n", | |
"* .13 There is nobody I want to leave money to except you and Mr. Glover .\n", | |
"* whole of life .10 Wasted plans , wasted money , wasted love , and she had\n", | |
"saving contexts:\n", | |
"* shipwrecked and , according to tradition , only saving his poem which he held in one\n", | |
"* rooms that night , and worked hardest , saving her library and her pictures and her\n", | |
"* Everard , `` I had the satisfaction of saving the life of a French officer in\n", | |
"* gotten any manner of strength ; and the saving of the friar 's life , which\n", | |
"* used to take Ralph to task for not saving Ham from his iniquities , and Ralph\n" | |
] | |
} | |
], | |
"source": [ | |
"for w in words:\n", | |
" print \"## \" + str(w)\n", | |
" print \"-----\\n\"\n", | |
" print_random_context(*w)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Old" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 1. Gather occurences\n", | |
"\n", | |
"\n", | |
"for w1, w2 in words:\n", | |
" for w in [w1, w2]:\n", | |
" if w in contexts:\n", | |
" continue\n", | |
" occurences = (np.where(text8_corpus == w)[0])\n", | |
" print(\"{} occurences of {}\".format(len(occurences), w))\n", | |
" contexts[w] = [text8_corpus[max(0, occ-window_size):occ+window_size] for occ in occurences]\n", | |
" \n", | |
" \n", | |
"\n", | |
"# 2. Print some\n", | |
"\n", | |
"N = 5\n", | |
"\n", | |
"for w1, w2 in words:\n", | |
" print \"w1={}, w2={}\".format(w1, w2)\n", | |
" print \"{} contexts:\".format(w1)\n", | |
" contexts_w1 = np.random.choice(len(contexts[w1]), N, replace=False)\n", | |
" contexts_w1 = [contexts[w1][id] for id in contexts_w1]\n", | |
" contexts_w1 = [\"\\t\" + \" \".join(s) for s in contexts_w1]\n", | |
" print \"\\n\".join(contexts_w1)\n", | |
" print \"{} contexts:\".format(w2)\n", | |
" contexts_w2 = np.random.choice(len(contexts[w2]), N, replace=False)\n", | |
" contexts_w2 = [contexts[w2][id] for id in contexts_w2]\n", | |
" contexts_w2 = [\"\\t\" + \" \".join(s) for s in contexts_w2]\n", | |
" print \"\\n\".join(contexts_w2) " | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment