Skip to content

Instantly share code, notes, and snippets.

@kudkudak
Created September 27, 2017 21:24
Show Gist options
  • Save kudkudak/bd486180eba247a0bf1bad1621b0f260 to your computer and use it in GitHub Desktop.
10_08_feasibility.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feasibility study\n",
"\n",
"* Fetch related sentences\n",
"* Use conceptnet (gzipped assertions)\n",
"* Use causal relations\n",
"* Use booktest "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"import tqdm\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pylab as plt\n",
"import seaborn as sns\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"db = \"/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv\"\n",
"db_en = \"/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### Filter out non-eng relations (creates db_en file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"! grep /c/en/ /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv > /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"809\t/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv\r\n"
]
}
],
"source": [
"! du -m /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"323\t/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv\r\n"
]
}
],
"source": [
"! du -m /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"### Read concept net filtered down to en relations\n",
"C_en = pd.read_csv(db_en, sep=\"\\t\", header=None)\n",
"C_en.columns = ['whatever', 'rel', 'head', 'tail', 'props']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"C_en = C_en.sample(frac=1).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from collections import Counter\n",
"C_en_counts = Counter(C_en.rel.head(10000))"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['/r/ExternalURL', '/r/RelatedTo', '/r/Synonym', '/r/HasContext', '/r/SimilarTo', '/r/dbpedia/field', '/r/dbpedia/genus', '/r/dbpedia/occupation', '/r/dbpedia/knownFor', '/r/dbpedia/capital', '/r/dbpedia/genre']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10183595/10183595 [00:08<00:00, 1251159.04it/s]\n"
]
}
],
"source": [
"## Filter out common and useless relations for us, which simplifies a lot of code later - \n",
"## dataset is just smaller\n",
"filter_rels = ['/r/ExternalURL', '/r/RelatedTo', '/r/Synonym', '/r/HasContext', '/r/SimilarTo']\n",
"filter_rels += [r for r in C_en_counts if 'dbpedia' in r]\n",
"print filter_rels\n",
"C_en_rel = C_en.rel.values \n",
"C_en_filtered = C_en.loc[[i for i in tqdm.tqdm(range(len(C_en)), total=len(C_en)) if C_en_rel[i] not in filter_rels]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analysis of conceptnet"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ended up with 1077940 relations\n"
]
}
],
"source": [
"print \"Ended up with\", len(C_en_filtered), \"relations\""
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'/r/Antonym': 21456,\n",
" '/r/AtLocation': 28960,\n",
" '/r/CapableOf': 26354,\n",
" '/r/Causes': 17088,\n",
" '/r/CausesDesire': 4782,\n",
" '/r/CreatedBy': 267,\n",
" '/r/DefinedAs': 2710,\n",
" '/r/DerivedFrom': 126928,\n",
" '/r/Desires': 3728,\n",
" '/r/DistinctFrom': 3571,\n",
" '/r/Entails': 405,\n",
" '/r/EtymologicallyRelatedTo': 158706,\n",
" '/r/FormOf': 273813,\n",
" '/r/HasA': 7735,\n",
" '/r/HasFirstSubevent': 3381,\n",
" '/r/HasLastSubevent': 2890,\n",
" '/r/HasPrerequisite': 23493,\n",
" '/r/HasProperty': 13553,\n",
" '/r/HasSubevent': 25896,\n",
" '/r/InstanceOf': 1366,\n",
" '/r/IsA': 249839,\n",
" '/r/LocatedNear': 49,\n",
" '/r/MadeOf': 604,\n",
" '/r/MannerOf': 48,\n",
" '/r/MotivatedByGoal': 9807,\n",
" '/r/NotCapableOf': 517,\n",
" '/r/NotDesires': 3095,\n",
" '/r/NotHasProperty': 509,\n",
" '/r/PartOf': 12872,\n",
" '/r/ReceivesAction': 8383,\n",
" '/r/SymbolOf': 4,\n",
" '/r/UsedFor': 42443,\n",
" '/r/dbpedia/influencedBy': 1261,\n",
" '/r/dbpedia/language': 885,\n",
" '/r/dbpedia/leader': 80,\n",
" '/r/dbpedia/product': 462})"
]
},
"execution_count": 167,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import Counter\n",
"C_en_filtered_counts = Counter(C_en_filtered.rel)\n",
"C_en_filtered_counts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"corpus_f_name = \"/data/lisa/exp/jastrzes/l2lwe/data/booktest-gut/train.14M+.txt\"\n",
"# corpus_f_name = \"/data/lisa/exp/jastrzes/l2lwe/data/CBTest/data/cbt_train.txt\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### Open question: how to build a better index. Probably a search engine makes most sense\n",
"N_CORPUS = int(1.5*10**8) # 1 B words is around 125MB "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\n",
"from collections import defaultdict\n",
"word_index = defaultdict(list) # Build an index word -> ids in corpus\n",
"word_count = defaultdict(int)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"THRESHOLD_MAX_WORD_COUNT_INDEX = N_CORPUS * 10**-3"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"THRESHOLD_WORD_COUNT_INDEX = N_CORPUS * 4 * 10**-7"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def read_word_by_word(input_file):\n",
"    \"\"\"Lazily yield space-separated tokens from `input_file`.\n",
"\n",
"    Reads the file in 10000-character chunks.  Tokens are split on ' ' only,\n",
"    so embedded newlines stay attached to tokens; only the very last token\n",
"    has a trailing newline stripped.\n",
"    \"\"\"\n",
"    buffered = ''\n",
"    while True:\n",
"        token, sep, buffered = buffered.partition(' ')\n",
"        if sep:\n",
"            # A complete token was found in the buffer.\n",
"            yield token\n",
"        else:\n",
"            # Buffer exhausted; pull more data from the file.\n",
"            chunk = input_file.read(10000)\n",
"            if not chunk:\n",
"                # End of file: emit whatever is left and stop.\n",
"                yield token.rstrip('\\n')\n",
"                return\n",
"            buffered = token + chunk"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from itertools import islice\n",
"corpus = list(islice(read_word_by_word(open(corpus_f_name)), N_CORPUS)) # ~2m for 100M"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"word_count = Counter(corpus) # Quite fast!"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 150000000/150000000 [05:00<00:00, 499554.73it/s]\n"
]
}
],
"source": [
"## TODO: Use set, not list\n",
"word_index = defaultdict(list)\n",
"for word_id, word in tqdm.tqdm(enumerate(corpus), total=len(corpus)):\n",
" if word_count[word] > THRESHOLD_WORD_COUNT_INDEX: # We care about common words really\n",
" word_index[word].append(word_id)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 150000000/150000000 [05:05<00:00, 491025.18it/s]\n"
]
}
],
"source": [
"id_to_sid = [0]*N_CORPUS\n",
"cur_sid = 0\n",
"sentences = []\n",
"cur_sentence = []\n",
"for w_id, w in tqdm.tqdm(enumerate(corpus), total=len(corpus)):\n",
" id_to_sid[w_id] = cur_sid\n",
" cur_sentence.append(w)\n",
" if w.startswith(\".\"):\n",
" cur_sid += 1\n",
" sentences.append(\" \".join(cur_sentence))\n",
" cur_sentence = []"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"id_to_sid = np.array(id_to_sid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"vocab_corpus = set(corpus)\n",
"print len(vocab_corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Useful functions "
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def find_indexes(word_list, w):\n",
"    \"\"\"Positions of word `w` in the corpus, via the prebuilt `word_index`.\n",
"\n",
"    NOTE(review): `word_list` is unused -- the lookup goes through the global\n",
"    `word_index`; the parameter is kept for call-site compatibility.\n",
"    \"\"\"\n",
"    if w in word_index:\n",
"        return word_index[w]\n",
"    return []"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def print_random_joint_sentences(w1, w2, N=5, window_size=8):\n",
"    \"\"\"Return markdown-style lines with up to N sentences containing both words.\n",
"\n",
"    Uses the globals `corpus`, `word_index` (via find_indexes), `id_to_sid`\n",
"    and `sentences`.  `window_size` is accepted for interface symmetry with\n",
"    print_random_context but is not used here.\n",
"    \"\"\"\n",
"    occurrences1 = set(id_to_sid[find_indexes(corpus, w1)])\n",
"    occurrences2 = set(id_to_sid[find_indexes(corpus, w2)])\n",
"    # Sentence ids where BOTH words occur.\n",
"    occurrences = list(occurrences1 & occurrences2)\n",
"    joint_sentences = [sentences[sid] for sid in occurrences]\n",
"\n",
"    if len(joint_sentences) == 0:\n",
"        return [\"Sorry - no contexts for \" + w1 + \" \" + w2]\n",
"\n",
"    out = []  # BUG FIX: `out` was appended to without ever being initialized (NameError)\n",
"    out.append(\"w1={}, w2={}\".format(w1, w2))\n",
"    picked = np.random.choice(len(joint_sentences), min(N, len(joint_sentences)), replace=False)\n",
"    picked = [\"* \" + joint_sentences[s].replace(\"\\n\", \"\") for s in picked]\n",
"    out.append(\"\\n\".join(picked))\n",
"    return out"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def print_random_context(w1, w2, N=5, window_size=8):\n",
"    \"\"\"Return lines showing up to N random corpus windows around w1 and around w2.\n",
"\n",
"    Each window spans `window_size` tokens on either side of an occurrence.\n",
"    Uses the globals `corpus` and `word_index` (via find_indexes).\n",
"    \"\"\"\n",
"    samples = {}\n",
"    for word in (w1, w2):\n",
"        positions = find_indexes(corpus, word)\n",
"        if len(positions) == 0:\n",
"            return ['Sorry no occurences for ' + word]\n",
"        chosen = np.random.choice(len(positions), min(N, len(positions)), replace=False)\n",
"        samples[word] = [corpus[max(0, positions[k] - window_size):positions[k] + window_size]\n",
"                         for k in chosen]\n",
"\n",
"    lines = [\"w1={}, w2={}\".format(w1, w2)]\n",
"    for word in (w1, w2):\n",
"        lines.append(\"{} contexts:\".format(word))\n",
"        rendered = [\"* \" + \" \".join(window).replace(\"\\n\", \"\") for window in samples[word]]\n",
"        lines.append(\"\\n\".join(rendered))\n",
"    return lines"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## TODO : Nicer to filter out assertions\n",
"def random_pair_words(corpus, vocab_corpus, corpus_count, assertions, rng, rel=None):\n",
"    \"\"\"Sample one ConceptNet edge usable as a word pair.\n",
"\n",
"    Tries up to 1000 random edges from the `assertions` DataFrame and returns\n",
"    the first one passing all filters as (head_word, rel, tail_word);\n",
"    returns None if no edge qualifies.  If `rel` is given, sampling is\n",
"    restricted to that relation.\n",
"    \"\"\"\n",
"    if rel:\n",
"        assertions = assertions.query(\"rel=='{}'\".format(rel))\n",
"    for i in range(1000):\n",
"        edge_id = rng.randint(len(assertions))\n",
"        edge = assertions.iloc[edge_id]\n",
"        head, rel, tail = edge['head'], edge['rel'], edge['tail']\n",
"\n",
"        # Strip part-of-speech suffixes like /v or /n from the concept URIs.\n",
"        if head.endswith(\"/v\") or head.endswith(\"/n\"):\n",
"            head = head[0:-2]\n",
"        if tail.endswith(\"/v\") or tail.endswith(\"/n\"):\n",
"            tail = tail[0:-2]\n",
"\n",
"        # 1. Filter out grammatical variants (one term is a prefix of the other)\n",
"        if head.startswith(tail) or tail.startswith(head):\n",
"            continue\n",
"\n",
"        # 2. Filter out multi-word concepts\n",
"        if \"_\" in head or \"_\" in tail:\n",
"            continue\n",
"\n",
"        # Skip overly frequent words.\n",
"        if (corpus_count[head.split(\"/\")[-1]] > THRESHOLD_MAX_WORD_COUNT_INDEX\n",
"                or corpus_count[tail.split(\"/\")[-1]] > THRESHOLD_MAX_WORD_COUNT_INDEX):\n",
"            continue\n",
"\n",
"        # 3. Only English concepts.  BUG FIX: head was checked against \"/c/en\"\n",
"        # (no trailing slash), which also matches e.g. \"/c/enm/...\" (Middle English).\n",
"        if not head.startswith(\"/c/en/\") or not tail.startswith(\"/c/en/\"):\n",
"            continue\n",
"\n",
"        # 4. Only words occurring in the corpus vocabulary\n",
"        if head.split(\"/\")[-1] not in vocab_corpus or tail.split(\"/\")[-1] not in vocab_corpus:\n",
"            continue\n",
"\n",
"        # 5. Only non-reflexive edges\n",
"        if head == tail:\n",
"            continue\n",
"\n",
"        return head.split(\"/\")[-1], rel, tail.split(\"/\")[-1]\n",
"    return None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# \"Random\""
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"assertions = C_en_filtered"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 200/200 [00:00<00:00, 249.04it/s]\n"
]
}
],
"source": [
"words = []\n",
"import tqdm\n",
"N = 200\n",
"rng = np.random.RandomState(777)\n",
"for i in tqdm.tqdm(range(N), total=N):\n",
" words.append(random_pair_words(corpus=corpus, corpus_count=word_count,\n",
" assertions=C_en_filtered, vocab_corpus=vocab_corpus, rng=rng))\n",
"\n",
"with open(\"10_08_feasibility_study.md\", \"w\") as f:\n",
" for w in tqdm.tqdm(words, total=len(words)):\n",
" out = []\n",
" out.append(\"## \" + str(w))\n",
" out.append(\"------\\n\")\n",
" out.append(\"\\n ### joint sentences contexts \\n\")\n",
" out += print_random_joint_sentences(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n",
" out.append( \"\\n ### random contexts \\n\" )\n",
" out += print_random_context(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n",
" \n",
" f.write(\"\\n\".join(out))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Each relation 5 examples"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 33/33 [00:25<00:00, 1.96it/s]\n"
]
}
],
"source": [
"words = []\n",
"import tqdm\n",
"N = 5\n",
"rng = np.random.RandomState(777)\n",
"for r in tqdm.tqdm(C_en_filtered_counts, total=len(C_en_filtered_counts)):\n",
" for i in range(N):\n",
" words.append(random_pair_words(corpus=corpus, corpus_count=word_count,\n",
" assertions=C_en_filtered, vocab_corpus=vocab_corpus, rng=rng, rel=r))\n"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 165/165 [00:01<00:00, 102.66it/s]\n"
]
}
],
"source": [
"with open(\"13_08_feasibility_study.md\", \"w\") as f:\n",
" for w in tqdm.tqdm(words, total=len(words)):\n",
" \n",
" if not w:\n",
" continue\n",
" \n",
" out = []\n",
" out.append(\"## \" + str(w))\n",
" out.append(\"------\\n\")\n",
" out.append(\"\\n ### joint sentences contexts \\n\")\n",
" out += print_random_joint_sentences(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n",
" out.append( \"\\n ### random contexts \\n\" )\n",
" out += print_random_context(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n",
" \n",
" f.write(\"\\n\".join(out))"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\t10_08_feasibility_study.md\r\n"
]
}
],
"source": [
"! du -m 10_08_feasibility_study.md"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hand picked"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"words = [\n",
" ('lemon', 'sour'), # has property\n",
" ('door', 'open'), # is used\n",
" ('hot', 'cold'), # antonym\n",
" ('pencil', 'write'), #\n",
" ('money', 'saving'), # is used, # TODO: what to do with 2 words?\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"## ('lemon', 'sour')\n",
"-----\n",
"\n",
"Sorry - no contexts for lemon sour\n",
"## ('door', 'open')\n",
"-----\n",
"\n",
"w1=door, w2=open\n",
"* Long before he could open the door he who was descending would be with him at the bottom of the stairs .9\n",
"* Through queer short cuts that terribly bewildered the way , the porter led him to the house , and pushing the door open , went up two flights of stone stairs and knocked at a door on the landing .21\n",
"* But Tibbie resumed : `` Ye maunna think , hooever , 'cause sic longin ' thouchts come ower me , that I gang aboot the hoose girnin ' and compleenin ' that I canna open the door and win oot .11\n",
"* A big car was standing by the kerb and one of the attendants was holding open the door for a girl dressed in black .11\n",
"* ''20 `` There 's mair vertues i ' the Bible nor courage , Thamas , '' retorted James , holding the outer door open to throw the sentence in , and shutting it instantly to escape with the last word .21\n",
"## ('hot', 'cold')\n",
"-----\n",
"\n",
"w1=hot, w2=cold\n",
"* On the other hand , taking Hyngham -LRB- or Ingham -RRB- as a place-name , we go , as the kiddies say , from `` cold '' to `` hot '' at once .17\n",
"* '14 As he read the detailed accounts Hugo knew , perhaps for the first time in his life , what it was ` to go hot and cold all over .\n",
"* There are both cold and hot waters and these are sweet and agreeable .18\n",
"* On the other hand , taking Hyngham -LRB- or Ingham -RRB- as a place-name , we go , as the kiddies say , from `` cold '' to `` hot '' at once .19\n",
"* '14 As he read the detailed accounts Hugo knew , perhaps for the first time in his life , what it was ` to go hot and cold all over .\n",
"## ('pencil', 'write')\n",
"-----\n",
"\n",
"w1=pencil, w2=write\n",
"* '6 ` For candles ? '7 said Fisher , confused ; ` how many ?8 -- what sort ? '9 ` Stupidity ! '10 exclaimed Archer , ` you are a pretty fellow at a dead lift !11 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !12 Well , what are you fumbling for ? '13 ` For money ! '14 said Fisher , colouring .15\n",
"* '13 ` For candles ? '14 said Fisher , confused ; ` how many ?15 -- what sort ? '16 ` Stupidity ! '17 exclaimed Archer , ` you are a pretty fellow at a dead lift !18 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !19 Well , what are you fumbling for ? '20 ` For money ! '21 said XXXXX , colouring .\tFisher\t\tbuns|design|ambassador|dark|pray|Archer|Fisher|expense|candles|schoolroom1\n",
"* exclaimed Archer , ` you are a pretty fellow at a dead lift !2 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !3 Well , what are you fumbling for ? '4 ` For money ! '5 said Fisher , colouring .6\n",
"* `` A horse , a dog , a fire , a man -- a St. Bernard dog saving a boy -- a soldier -- I think a soldier would suit Cyril ! ''3 She stared through the bush to the red road consideringly , holding her pencil ready to write .4\n",
"* '6 ` For candles ? '7 said Fisher , confused ; ` how many ?8 -- what sort ? '9 ` Stupidity ! '10 exclaimed Archer , ` you are a pretty fellow at a dead lift !11 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !12 Well , what are you fumbling for ? '13 ` For money ! '14 said Fisher , colouring .15\n",
"## ('money', 'saving')\n",
"-----\n",
"\n",
"w1=money, w2=saving\n",
"* You will find that their prices are reasonable , considering the difference in cost of transportation at any point you might decide to purchase from in the United States ; in fact it is the saving of money to buy in Juneau .5\n",
"* He also carried a more questionable scheme for the payment of military , naval , and civil pensions , which then amounted to L4 ,900,000 a year , but were falling in rapidly ; the money required for this purpose was to be borrowed by trustees , and was to be repaid in the course of forty-five years at the rate of L2 ,800,000 a year ; in this way an immediate saving of about L2 ,000,000 annually was effected at the cost , however , of the next generation .5\n",
"* I know what saving of money its use has meant to me .21\n",
"* I know what saving of money its use has meant to me .19\n",
"* He also carried a more questionable scheme for the payment of military , naval , and civil pensions , which then amounted to L4 ,900,000 a year , but were falling in rapidly ; the money required for this purpose was to be borrowed by trustees , and was to be repaid in the course of forty-five years at the rate of L2 ,800,000 a year ; in this way an immediate saving of about L2 ,000,000 annually was effected at the cost , however , of the next generation .4\n"
]
}
],
"source": [
"for w in words:\n",
" print \"## \" + str(w)\n",
" print \"-----\\n\"\n",
" print_random_joint_sentences(*w)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"## ('lemon', 'sour')\n",
"-----\n",
"\n",
"w1=lemon, w2=sour\n",
"lemon contexts:\n",
"* the tent and Jakie 's breath reeked of lemon and vanilla .\tJakie\t\t_|woes|sleep|gratitude|charge|way|Jakie|pans|coffee|Jack1 Jakie wept , this\n",
"* -LRB- Hawaiian hau -RRB- , or `` the lemon hibiscus '' -- the `` argenta ,\n",
"* oranges have ceased to be gathered and the lemon has been squeezed .5 Occasionally there is\n",
"* so they try macaroon and pistachio instead of lemon and vanilla .14 Fresh people are better\n",
"* oranges have ceased to be gathered and the lemon has been squeezed .17 Occasionally there is\n",
"sour contexts:\n",
"* I doubt pastry from Languedoc would turn me sour ; and liking monks little enough as\n",
"* . ''8 Sejanus answered this reminder with a sour and peculiar smile .9 `` Good ,\n",
"* have been the glasses that had changed the sour old woman into a smiling fairy ;\n",
"* 's head ! ''15 `` A set of sour knaves , '' he cried , ``\n",
"* hee-hawed '' , and said something about `` sour grapes '' .15 She was jolly smartly\n",
"## ('door', 'open')\n",
"-----\n",
"\n",
"w1=door, w2=open\n",
"door contexts:\n",
"* to the door .12 Mr. Carson opened the door and mildly asked to know the object\n",
"* room level with the ground .12 The main door is in the left wall .13 Along\n",
"* cheers and shrieks , someone fell against the door with a soft noise , and there\n",
"* rang the bell , where , after the door was opened , she was shown into\n",
"* under the misletoe , which was over the door , and Dick shook hands with Mrs.\n",
"open contexts:\n",
"* his efforts to holding Lisbon , and keeping open his line of communication with Spain .11\n",
"* .17 It was plain that Canada was an open book to him .18 `` The long\n",
"* there . ''5 Loughs Conn and Cullin are open free fishing , and on the preserves\n",
"* left the sentence unfinished .9 `` Let us open it , '' said Cleggett .10 ``\n",
"* valley .13 Hals , whose veracity is much open to doubt , states that Militon had\n",
"## ('hot', 'cold')\n",
"-----\n",
"\n",
"w1=hot, w2=cold\n",
"hot contexts:\n",
"* But I , Eustace , love her so hot that I have fear of myself .16\n",
"* -- '' Salis was too late , for hot , excited , and strung up hard\n",
"* bofe sides , an ' dey had it hot an ' heavy , nip an '\n",
"* .8 For hills and mountains are created in hot countries , whether they are situated by\n",
"* delving , building , toiling through the long hot summer 's day , in rivalry of\n",
"cold contexts:\n",
"* days of old , when the spring with cold Had , brightened his branches gray ,\n",
"* I am quite well .10 It is a cold morning , and I shivered a little\n",
"* laughter came from the playground -- but a cold silence had come by the fiftieth .12\n",
"* is not enough that I Am sacrificed to cold state policy , A snare is laid\n",
"* his heart beat quickly and his flesh grow cold with a nervous trepidation -- just such\n",
"## ('pencil', 'write')\n",
"-----\n",
"\n",
"w1=pencil, w2=write\n",
"pencil contexts:\n",
"* I will . ''21 And Olive produced a pencil and paper with alacrity , and by\n",
"* the most finished that the bright and effective pencil of Edith could achieve .\tLady\t\tabstraction|Coningsby|Hellingsley|sky|Edith|aunt|Joseph|Lady|pencils|Wallinger1 ` We\n",
"* describe it , nor can the painter 's pencil .12 It continued for nearly half-an-hour ,\n",
"* know all about it . ''13 The paralytic pencil wavered and came to a full stop\n",
"* the most finished that the bright and effective pencil of Edith could achieve .11 If it\n",
"write contexts:\n",
"* Mountjoy , King-at-arms , who was there to write down the names , began to reason\n",
"* suit myself .18 One or two only will write with very little change from me .19\n",
"* been a hard tussle to get her to write the apology , and , but for\n",
"* bear them , or could she ?3 Better write without his knowledge .4 Then , on\n",
"* which Mr. Grey had agreed that she should write to him , he hesitated to open\n",
"## ('money', 'saving')\n",
"-----\n",
"\n",
"w1=money, w2=saving\n",
"money contexts:\n",
"* out of his pocket and began counting his money .6 There was a great deal of\n",
"* to the screen .\tDelamere\t\ttime|places|Delamere|Tom|Gus|Augustus|lem|Sandy|ha|Davidson1 `` It 's Confederate money . ''2 `` So it is ,\n",
"* that ship-money was not a tax , but money paid in commutation of the duty of\n",
"* .13 There is nobody I want to leave money to except you and Mr. Glover .\n",
"* whole of life .10 Wasted plans , wasted money , wasted love , and she had\n",
"saving contexts:\n",
"* shipwrecked and , according to tradition , only saving his poem which he held in one\n",
"* rooms that night , and worked hardest , saving her library and her pictures and her\n",
"* Everard , `` I had the satisfaction of saving the life of a French officer in\n",
"* gotten any manner of strength ; and the saving of the friar 's life , which\n",
"* used to take Ralph to task for not saving Ham from his iniquities , and Ralph\n"
]
}
],
"source": [
"for w in words:\n",
" print \"## \" + str(w)\n",
" print \"-----\\n\"\n",
" print_random_context(*w)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Old"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# 1. Gather occurences\n",
"\n",
"\n",
"for w1, w2 in words:\n",
" for w in [w1, w2]:\n",
" if w in contexts:\n",
" continue\n",
" occurences = (np.where(text8_corpus == w)[0])\n",
" print(\"{} occurences of {}\".format(len(occurences), w))\n",
" contexts[w] = [text8_corpus[max(0, occ-window_size):occ+window_size] for occ in occurences]\n",
" \n",
" \n",
"\n",
"# 2. Print some\n",
"\n",
"N = 5\n",
"\n",
"for w1, w2 in words:\n",
" print \"w1={}, w2={}\".format(w1, w2)\n",
" print \"{} contexts:\".format(w1)\n",
" contexts_w1 = np.random.choice(len(contexts[w1]), N, replace=False)\n",
" contexts_w1 = [contexts[w1][id] for id in contexts_w1]\n",
" contexts_w1 = [\"\\t\" + \" \".join(s) for s in contexts_w1]\n",
" print \"\\n\".join(contexts_w1)\n",
" print \"{} contexts:\".format(w2)\n",
" contexts_w2 = np.random.choice(len(contexts[w2]), N, replace=False)\n",
" contexts_w2 = [contexts[w2][id] for id in contexts_w2]\n",
" contexts_w2 = [\"\\t\" + \" \".join(s) for s in contexts_w2]\n",
" print \"\\n\".join(contexts_w2) "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment