-
-
Save kudkudak/bd486180eba247a0bf1bad1621b0f260 to your computer and use it in GitHub Desktop.
10_08_feasibility.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Feasibility study\n", | |
"\n", | |
"* Fetch related sentences\n", | |
"* Use conceptnet (gzipped assertions)\n", | |
"* Use causal relations\n", | |
"* Use booktest " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import tqdm\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pylab as plt\n", | |
"import seaborn as sns\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db = \"/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv\"\n", | |
"db_en = \"/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"### Filter out non-eng relations (creates db_en file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"! grep /c/en/ /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv > /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"809\t/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! du -m /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"323\t/data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! du -m /data/lisa/exp/jastrzes/l2lwe/data/conceptnet-assertions-5.5.0_en.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"### Read concept net filtered down to en relations\n", | |
"C_en = pd.read_csv(db_en, sep=\"\\t\", header=None)\n", | |
"C_en.columns = ['whatever', 'rel', 'head', 'tail', 'props']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"C_en = C_en.sample(frac=1).reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 145, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter\n", | |
"C_en_counts = Counter(C_en.rel.head(10000))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 146, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['/r/ExternalURL', '/r/RelatedTo', '/r/Synonym', '/r/HasContext', '/r/SimilarTo', '/r/dbpedia/field', '/r/dbpedia/genus', '/r/dbpedia/occupation', '/r/dbpedia/knownFor', '/r/dbpedia/capital', '/r/dbpedia/genre']\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 10183595/10183595 [00:08<00:00, 1251159.04it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
## Filter out common and useless relations for us, which simplifies a lot of code later -
## dataset is just smaller
filter_rels = ['/r/ExternalURL', '/r/RelatedTo', '/r/Synonym', '/r/HasContext', '/r/SimilarTo']
filter_rels += [r for r in C_en_counts if 'dbpedia' in r]
print(filter_rels)  # parenthesized single-arg form prints identically under Python 2
C_en_rel = C_en.rel.values  # kept: later cells may still reference this array
# Vectorized filter (Series.isin) instead of a tqdm Python loop doing list-membership
# tests over ~10M rows.  Equivalent to the original label-based .loc selection because
# C_en's index is a RangeIndex after the earlier reset_index(drop=True).
C_en_filtered = C_en.loc[~C_en.rel.isin(filter_rels)]
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Analysis of conceptnet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 149, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Ended up with 1077940 relations\n" | |
] | |
} | |
], | |
"source": [ | |
"print \"Ended up with\", len(C_en_filtered), \"relations\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 167, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Counter({'/r/Antonym': 21456,\n", | |
" '/r/AtLocation': 28960,\n", | |
" '/r/CapableOf': 26354,\n", | |
" '/r/Causes': 17088,\n", | |
" '/r/CausesDesire': 4782,\n", | |
" '/r/CreatedBy': 267,\n", | |
" '/r/DefinedAs': 2710,\n", | |
" '/r/DerivedFrom': 126928,\n", | |
" '/r/Desires': 3728,\n", | |
" '/r/DistinctFrom': 3571,\n", | |
" '/r/Entails': 405,\n", | |
" '/r/EtymologicallyRelatedTo': 158706,\n", | |
" '/r/FormOf': 273813,\n", | |
" '/r/HasA': 7735,\n", | |
" '/r/HasFirstSubevent': 3381,\n", | |
" '/r/HasLastSubevent': 2890,\n", | |
" '/r/HasPrerequisite': 23493,\n", | |
" '/r/HasProperty': 13553,\n", | |
" '/r/HasSubevent': 25896,\n", | |
" '/r/InstanceOf': 1366,\n", | |
" '/r/IsA': 249839,\n", | |
" '/r/LocatedNear': 49,\n", | |
" '/r/MadeOf': 604,\n", | |
" '/r/MannerOf': 48,\n", | |
" '/r/MotivatedByGoal': 9807,\n", | |
" '/r/NotCapableOf': 517,\n", | |
" '/r/NotDesires': 3095,\n", | |
" '/r/NotHasProperty': 509,\n", | |
" '/r/PartOf': 12872,\n", | |
" '/r/ReceivesAction': 8383,\n", | |
" '/r/SymbolOf': 4,\n", | |
" '/r/UsedFor': 42443,\n", | |
" '/r/dbpedia/influencedBy': 1261,\n", | |
" '/r/dbpedia/language': 885,\n", | |
" '/r/dbpedia/leader': 80,\n", | |
" '/r/dbpedia/product': 462})" | |
] | |
}, | |
"execution_count": 167, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from collections import Counter\n", | |
"C_en_filtered_counts = Counter(C_en_filtered.rel)\n", | |
"C_en_filtered_counts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Get data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"corpus_f_name = \"/data/lisa/exp/jastrzes/l2lwe/data/booktest-gut/train.14M+.txt\"\n", | |
"# corpus_f_name = \"/data/lisa/exp/jastrzes/l2lwe/data/CBTest/data/cbt_train.txt\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"### Open question: how to build a better index. Probably a search engine makes most sense\n", | |
"N_CORPUS = int(1.5*10**8) # 1 B words is around 125MB " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"from collections import defaultdict\n", | |
"word_index = defaultdict(list) # Build an index word -> ids in corpus\n", | |
"word_count = defaultdict(int)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"THRESHOLD_MAX_WORD_COUNT_INDEX = N_CORPUS * 10**-3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"THRESHOLD_WORD_COUNT_INDEX = N_CORPUS * 4 * 10**-7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
def read_word_by_word(input_file):
    """Lazily yield space-separated tokens from ``input_file``.

    The file is consumed in 10 KB chunks so arbitrarily large corpora can be
    streamed without loading them whole.  Tokens are split on single spaces
    only: embedded newlines stay inside tokens, except that the very last
    token has trailing newlines stripped before being yielded.
    """
    buffered = ''
    while True:
        token, sep, buffered = buffered.partition(' ')
        if sep:
            # A complete token was sitting in the buffer.
            yield token
            continue
        # Buffer holds at most a partial token -- pull in more data.
        chunk = input_file.read(10000)
        if not chunk:
            # End of file: emit whatever is left and stop.
            yield token.rstrip('\n')
            return
        buffered = token + chunk
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from itertools import islice\n", | |
"corpus = list(islice(read_word_by_word(open(corpus_f_name)), N_CORPUS)) # ~2m for 100M" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"word_count = Counter(corpus) # Quite fast!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 150000000/150000000 [05:00<00:00, 499554.73it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"## TODO: Use set, not list\n", | |
"word_index = defaultdict(list)\n", | |
"for word_id, word in tqdm.tqdm(enumerate(corpus), total=len(corpus)):\n", | |
" if word_count[word] > THRESHOLD_WORD_COUNT_INDEX: # We care about common words really\n", | |
" word_index[word].append(word_id)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 150000000/150000000 [05:05<00:00, 491025.18it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"id_to_sid = [0]*N_CORPUS\n", | |
"cur_sid = 0\n", | |
"sentences = []\n", | |
"cur_sentence = []\n", | |
"for w_id, w in tqdm.tqdm(enumerate(corpus), total=len(corpus)):\n", | |
" id_to_sid[w_id] = cur_sid\n", | |
" cur_sentence.append(w)\n", | |
" if w.startswith(\".\"):\n", | |
" cur_sid += 1\n", | |
" sentences.append(\" \".join(cur_sentence))\n", | |
" cur_sentence = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"id_to_sid = np.array(id_to_sid)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"vocab_corpus = set(corpus)\n", | |
"print len(vocab_corpus)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Useful functions " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
def find_indexes(word_list, w):
    """Return the corpus positions at which word ``w`` occurs.

    NOTE: ``word_list`` is accepted for interface compatibility but ignored;
    lookups go through the prebuilt global ``word_index``, which only covers
    sufficiently frequent words -- rare/unindexed words yield an empty list.
    """
    positions = word_index.get(w, [])
    return positions
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
def print_random_joint_sentences(w1, w2, N=5, window_size=8):
    """Return report lines with up to ``N`` corpus sentences containing both words.

    Relies on the notebook globals ``id_to_sid``, ``corpus`` and ``sentences``
    plus the ``find_indexes`` helper.  ``window_size`` is accepted for interface
    symmetry with ``print_random_context`` but is not used here.
    Returns a single apology line when the words never co-occur in a sentence.
    """
    occurrences1 = set(id_to_sid[find_indexes(corpus, w1)])
    occurrences2 = set(id_to_sid[find_indexes(corpus, w2)])
    shared_sids = list(occurrences1 & occurrences2)
    joint_sentences = [sentences[sid] for sid in shared_sids]

    if len(joint_sentences) == 0:
        return ["Sorry - no contexts for " + w1 + " " + w2]

    # BUG FIX: ``out`` was never initialized here; the cell only ran because a
    # stale ``out`` global leaked from another cell.  On a fresh kernel
    # (Restart & Run All) the original raised NameError.
    out = []
    out.append("w1={}, w2={}".format(w1, w2))
    chosen = np.random.choice(len(joint_sentences), min(N, len(joint_sentences)), replace=False)
    out.append("\n".join("* " + joint_sentences[s].replace("\n", "") for s in chosen))
    return out
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
def print_random_context(w1, w2, N=5, window_size=8):
    """Return report lines showing up to ``N`` random corpus windows per word.

    Each window spans ``window_size`` tokens on either side of an occurrence.
    If either word has no indexed occurrence, a single apology line is
    returned instead.  Uses the notebook globals ``corpus`` (via
    ``find_indexes``) and numpy's global RNG.
    """
    contexts = {}
    # Gather windows for both words first (keeps the RNG call order stable).
    for word in (w1, w2):
        positions = find_indexes(corpus, word)
        if len(positions) == 0:
            return ['Sorry no occurences for ' + word]
        chosen = np.random.choice(len(positions), min(N, len(positions)), replace=False)
        contexts[word] = [
            corpus[max(0, positions[i] - window_size):positions[i] + window_size]
            for i in chosen
        ]

    # Format: header, then "<word> contexts:" followed by its bulleted windows.
    out = ["w1={}, w2={}".format(w1, w2)]
    for word in (w1, w2):
        out.append("{} contexts:".format(word))
        snippets = ["* " + " ".join(s).replace("\n", "") for s in contexts[word]]
        out.append("\n".join(snippets))
    return out
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 163, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
## TODO : Nicer to filter out assertions
def random_pair_words(corpus, vocab_corpus, corpus_count, assertions, rng, rel=None,
                      max_count=None):
    """Sample one ConceptNet edge as a ``(head_word, relation, tail_word)`` triple.

    Draws random rows from ``assertions`` (a DataFrame with 'head'/'rel'/'tail'
    columns) until one survives all filters, giving up after 1000 attempts.

    :param corpus: unused; kept for interface compatibility with callers.
    :param vocab_corpus: set of words allowed as head/tail.
    :param corpus_count: word -> count mapping used to reject overly common words.
    :param assertions: DataFrame of ConceptNet assertions.
    :param rng: numpy RandomState used for row sampling.
    :param rel: optional relation URI; restricts sampling to that relation.
    :param max_count: frequency cap; defaults to the notebook-global
        THRESHOLD_MAX_WORD_COUNT_INDEX (new parameter, backward compatible).
    :return: (head_word, relation, tail_word) tuple, or None if nothing found.
    """
    if max_count is None:
        max_count = THRESHOLD_MAX_WORD_COUNT_INDEX
    if rel:
        assertions = assertions.query("rel=='{}'".format(rel))
    for _ in range(1000):
        edge = assertions.iloc[rng.randint(len(assertions))]
        head, edge_rel, tail = edge['head'], edge['rel'], edge['tail']

        # Strip part-of-speech suffixes like /c/en/run/v -> /c/en/run.
        if head.endswith("/v") or head.endswith("/n"):
            head = head[0:-2]
        if tail.endswith("/v") or tail.endswith("/n"):
            tail = tail[0:-2]

        # 1. Filter out grammatical variants (one URI a prefix of the other)
        if head.startswith(tail) or tail.startswith(head):
            continue

        # 2. Filter out multi-word concepts
        if "_" in head or "_" in tail:
            continue

        # Reject words that are too common to be interesting.
        if corpus_count[head.split("/")[-1]] > max_count \
                or corpus_count[tail.split("/")[-1]] > max_count:
            continue

        # 3. Only English.
        # BUG FIX: the head check previously used "/c/en" (no trailing slash),
        # which also matched other language codes starting with "en",
        # e.g. /c/enm/ (Middle English).  Now consistent with the tail check.
        if not head.startswith("/c/en/") or not tail.startswith("/c/en/"):
            continue

        # 4. Only words occurring in the corpus vocabulary
        if head.split("/")[-1] not in vocab_corpus or tail.split("/")[-1] not in vocab_corpus:
            continue

        # 5. Only non-recursive edges
        if head == tail:
            continue

        return head.split("/")[-1], edge_rel, tail.split("/")[-1]
    return None
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# \"Random\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"assertions = C_en_filtered" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 200/200 [00:00<00:00, 249.04it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"words = []\n", | |
"import tqdm\n", | |
"N = 200\n", | |
"rng = np.random.RandomState(777)\n", | |
"for i in tqdm.tqdm(range(N), total=N):\n", | |
" words.append(random_pair_words(corpus=corpus, corpus_count=word_count,\n", | |
" assertions=C_en_filtered, vocab_corpus=vocab_corpus, rng=rng))\n", | |
"\n", | |
"with open(\"10_08_feasibility_study.md\", \"w\") as f:\n", | |
" for w in tqdm.tqdm(words, total=len(words)):\n", | |
" out = []\n", | |
" out.append(\"## \" + str(w))\n", | |
" out.append(\"------\\n\")\n", | |
" out.append(\"\\n ### joint sentences contexts \\n\")\n", | |
" out += print_random_joint_sentences(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" out.append( \"\\n ### random contexts \\n\" )\n", | |
" out += print_random_context(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" \n", | |
" f.write(\"\\n\".join(out))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Each relation 5 examples" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 164, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 33/33 [00:25<00:00, 1.96it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"words = []\n", | |
"import tqdm\n", | |
"N = 5\n", | |
"rng = np.random.RandomState(777)\n", | |
"for r in tqdm.tqdm(C_en_filtered_counts, total=len(C_en_filtered_counts)):\n", | |
" for i in range(N):\n", | |
" words.append(random_pair_words(corpus=corpus, corpus_count=word_count,\n", | |
" assertions=C_en_filtered, vocab_corpus=vocab_corpus, rng=rng, rel=r))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 165, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 165/165 [00:01<00:00, 102.66it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"with open(\"13_08_feasibility_study.md\", \"w\") as f:\n", | |
" for w in tqdm.tqdm(words, total=len(words)):\n", | |
" \n", | |
" if not w:\n", | |
" continue\n", | |
" \n", | |
" out = []\n", | |
" out.append(\"## \" + str(w))\n", | |
" out.append(\"------\\n\")\n", | |
" out.append(\"\\n ### joint sentences contexts \\n\")\n", | |
" out += print_random_joint_sentences(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" out.append( \"\\n ### random contexts \\n\" )\n", | |
" out += print_random_context(str(w[0].split(\"/\")[-1]), str(w[2].split(\"/\")[-1]), N=10, window_size=14)\n", | |
" \n", | |
" f.write(\"\\n\".join(out))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1\t10_08_feasibility_study.md\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! du -m 10_08_feasibility_study.md" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Hand picked" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"words = [\n", | |
" ('lemon', 'sour'), # has property\n", | |
" ('door', 'open'), # is used\n", | |
" ('hot', 'cold'), # antonym\n", | |
" ('pencil', 'write'), #\n", | |
" ('money', 'saving'), # is used, # TODO: what to do with 2 words?\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"## ('lemon', 'sour')\n", | |
"-----\n", | |
"\n", | |
"Sorry - no contexts for lemon sour\n", | |
"## ('door', 'open')\n", | |
"-----\n", | |
"\n", | |
"w1=door, w2=open\n", | |
"* Long before he could open the door he who was descending would be with him at the bottom of the stairs .9\n", | |
"* Through queer short cuts that terribly bewildered the way , the porter led him to the house , and pushing the door open , went up two flights of stone stairs and knocked at a door on the landing .21\n", | |
"* But Tibbie resumed : `` Ye maunna think , hooever , 'cause sic longin ' thouchts come ower me , that I gang aboot the hoose girnin ' and compleenin ' that I canna open the door and win oot .11\n", | |
"* A big car was standing by the kerb and one of the attendants was holding open the door for a girl dressed in black .11\n", | |
"* ''20 `` There 's mair vertues i ' the Bible nor courage , Thamas , '' retorted James , holding the outer door open to throw the sentence in , and shutting it instantly to escape with the last word .21\n", | |
"## ('hot', 'cold')\n", | |
"-----\n", | |
"\n", | |
"w1=hot, w2=cold\n", | |
"* On the other hand , taking Hyngham -LRB- or Ingham -RRB- as a place-name , we go , as the kiddies say , from `` cold '' to `` hot '' at once .17\n", | |
"* '14 As he read the detailed accounts Hugo knew , perhaps for the first time in his life , what it was ` to go hot and cold all over .\n", | |
"* There are both cold and hot waters and these are sweet and agreeable .18\n", | |
"* On the other hand , taking Hyngham -LRB- or Ingham -RRB- as a place-name , we go , as the kiddies say , from `` cold '' to `` hot '' at once .19\n", | |
"* '14 As he read the detailed accounts Hugo knew , perhaps for the first time in his life , what it was ` to go hot and cold all over .\n", | |
"## ('pencil', 'write')\n", | |
"-----\n", | |
"\n", | |
"w1=pencil, w2=write\n", | |
"* '6 ` For candles ? '7 said Fisher , confused ; ` how many ?8 -- what sort ? '9 ` Stupidity ! '10 exclaimed Archer , ` you are a pretty fellow at a dead lift !11 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !12 Well , what are you fumbling for ? '13 ` For money ! '14 said Fisher , colouring .15\n", | |
"* '13 ` For candles ? '14 said Fisher , confused ; ` how many ?15 -- what sort ? '16 ` Stupidity ! '17 exclaimed Archer , ` you are a pretty fellow at a dead lift !18 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !19 Well , what are you fumbling for ? '20 ` For money ! '21 said XXXXX , colouring .\tFisher\t\tbuns|design|ambassador|dark|pray|Archer|Fisher|expense|candles|schoolroom1\n", | |
"* exclaimed Archer , ` you are a pretty fellow at a dead lift !2 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !3 Well , what are you fumbling for ? '4 ` For money ! '5 said Fisher , colouring .6\n", | |
"* `` A horse , a dog , a fire , a man -- a St. Bernard dog saving a boy -- a soldier -- I think a soldier would suit Cyril ! ''3 She stared through the bush to the red road consideringly , holding her pencil ready to write .4\n", | |
"* '6 ` For candles ? '7 said Fisher , confused ; ` how many ?8 -- what sort ? '9 ` Stupidity ! '10 exclaimed Archer , ` you are a pretty fellow at a dead lift !11 Lend me a pencil and a bit of paper , do ; I 'll write down what I want myself !12 Well , what are you fumbling for ? '13 ` For money ! '14 said Fisher , colouring .15\n", | |
"## ('money', 'saving')\n", | |
"-----\n", | |
"\n", | |
"w1=money, w2=saving\n", | |
"* You will find that their prices are reasonable , considering the difference in cost of transportation at any point you might decide to purchase from in the United States ; in fact it is the saving of money to buy in Juneau .5\n", | |
"* He also carried a more questionable scheme for the payment of military , naval , and civil pensions , which then amounted to L4 ,900,000 a year , but were falling in rapidly ; the money required for this purpose was to be borrowed by trustees , and was to be repaid in the course of forty-five years at the rate of L2 ,800,000 a year ; in this way an immediate saving of about L2 ,000,000 annually was effected at the cost , however , of the next generation .5\n", | |
"* I know what saving of money its use has meant to me .21\n", | |
"* I know what saving of money its use has meant to me .19\n", | |
"* He also carried a more questionable scheme for the payment of military , naval , and civil pensions , which then amounted to L4 ,900,000 a year , but were falling in rapidly ; the money required for this purpose was to be borrowed by trustees , and was to be repaid in the course of forty-five years at the rate of L2 ,800,000 a year ; in this way an immediate saving of about L2 ,000,000 annually was effected at the cost , however , of the next generation .4\n" | |
] | |
} | |
], | |
"source": [ | |
"for w in words:\n", | |
" print \"## \" + str(w)\n", | |
" print \"-----\\n\"\n", | |
" print_random_joint_sentences(*w)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"## ('lemon', 'sour')\n", | |
"-----\n", | |
"\n", | |
"w1=lemon, w2=sour\n", | |
"lemon contexts:\n", | |
"* the tent and Jakie 's breath reeked of lemon and vanilla .\tJakie\t\t_|woes|sleep|gratitude|charge|way|Jakie|pans|coffee|Jack1 Jakie wept , this\n", | |
"* -LRB- Hawaiian hau -RRB- , or `` the lemon hibiscus '' -- the `` argenta ,\n", | |
"* oranges have ceased to be gathered and the lemon has been squeezed .5 Occasionally there is\n", | |
"* so they try macaroon and pistachio instead of lemon and vanilla .14 Fresh people are better\n", | |
"* oranges have ceased to be gathered and the lemon has been squeezed .17 Occasionally there is\n", | |
"sour contexts:\n", | |
"* I doubt pastry from Languedoc would turn me sour ; and liking monks little enough as\n", | |
"* . ''8 Sejanus answered this reminder with a sour and peculiar smile .9 `` Good ,\n", | |
"* have been the glasses that had changed the sour old woman into a smiling fairy ;\n", | |
"* 's head ! ''15 `` A set of sour knaves , '' he cried , ``\n", | |
"* hee-hawed '' , and said something about `` sour grapes '' .15 She was jolly smartly\n", | |
"## ('door', 'open')\n", | |
"-----\n", | |
"\n", | |
"w1=door, w2=open\n", | |
"door contexts:\n", | |
"* to the door .12 Mr. Carson opened the door and mildly asked to know the object\n", | |
"* room level with the ground .12 The main door is in the left wall .13 Along\n", | |
"* cheers and shrieks , someone fell against the door with a soft noise , and there\n", | |
"* rang the bell , where , after the door was opened , she was shown into\n", | |
"* under the misletoe , which was over the door , and Dick shook hands with Mrs.\n", | |
"open contexts:\n", | |
"* his efforts to holding Lisbon , and keeping open his line of communication with Spain .11\n", | |
"* .17 It was plain that Canada was an open book to him .18 `` The long\n", | |
"* there . ''5 Loughs Conn and Cullin are open free fishing , and on the preserves\n", | |
"* left the sentence unfinished .9 `` Let us open it , '' said Cleggett .10 ``\n", | |
"* valley .13 Hals , whose veracity is much open to doubt , states that Militon had\n", | |
"## ('hot', 'cold')\n", | |
"-----\n", | |
"\n", | |
"w1=hot, w2=cold\n", | |
"hot contexts:\n", | |
"* But I , Eustace , love her so hot that I have fear of myself .16\n", | |
"* -- '' Salis was too late , for hot , excited , and strung up hard\n", | |
"* bofe sides , an ' dey had it hot an ' heavy , nip an '\n", | |
"* .8 For hills and mountains are created in hot countries , whether they are situated by\n", | |
"* delving , building , toiling through the long hot summer 's day , in rivalry of\n", | |
"cold contexts:\n", | |
"* days of old , when the spring with cold Had , brightened his branches gray ,\n", | |
"* I am quite well .10 It is a cold morning , and I shivered a little\n", | |
"* laughter came from the playground -- but a cold silence had come by the fiftieth .12\n", | |
"* is not enough that I Am sacrificed to cold state policy , A snare is laid\n", | |
"* his heart beat quickly and his flesh grow cold with a nervous trepidation -- just such\n", | |
"## ('pencil', 'write')\n", | |
"-----\n", | |
"\n", | |
"w1=pencil, w2=write\n", | |
"pencil contexts:\n", | |
"* I will . ''21 And Olive produced a pencil and paper with alacrity , and by\n", | |
"* the most finished that the bright and effective pencil of Edith could achieve .\tLady\t\tabstraction|Coningsby|Hellingsley|sky|Edith|aunt|Joseph|Lady|pencils|Wallinger1 ` We\n", | |
"* describe it , nor can the painter 's pencil .12 It continued for nearly half-an-hour ,\n", | |
"* know all about it . ''13 The paralytic pencil wavered and came to a full stop\n", | |
"* the most finished that the bright and effective pencil of Edith could achieve .11 If it\n", | |
"write contexts:\n", | |
"* Mountjoy , King-at-arms , who was there to write down the names , began to reason\n", | |
"* suit myself .18 One or two only will write with very little change from me .19\n", | |
"* been a hard tussle to get her to write the apology , and , but for\n", | |
"* bear them , or could she ?3 Better write without his knowledge .4 Then , on\n", | |
"* which Mr. Grey had agreed that she should write to him , he hesitated to open\n", | |
"## ('money', 'saving')\n", | |
"-----\n", | |
"\n", | |
"w1=money, w2=saving\n", | |
"money contexts:\n", | |
"* out of his pocket and began counting his money .6 There was a great deal of\n", | |
"* to the screen .\tDelamere\t\ttime|places|Delamere|Tom|Gus|Augustus|lem|Sandy|ha|Davidson1 `` It 's Confederate money . ''2 `` So it is ,\n", | |
"* that ship-money was not a tax , but money paid in commutation of the duty of\n", | |
"* .13 There is nobody I want to leave money to except you and Mr. Glover .\n", | |
"* whole of life .10 Wasted plans , wasted money , wasted love , and she had\n", | |
"saving contexts:\n", | |
"* shipwrecked and , according to tradition , only saving his poem which he held in one\n", | |
"* rooms that night , and worked hardest , saving her library and her pictures and her\n", | |
"* Everard , `` I had the satisfaction of saving the life of a French officer in\n", | |
"* gotten any manner of strength ; and the saving of the friar 's life , which\n", | |
"* used to take Ralph to task for not saving Ham from his iniquities , and Ralph\n" | |
] | |
} | |
], | |
"source": [ | |
"for w in words:\n", | |
" print \"## \" + str(w)\n", | |
" print \"-----\\n\"\n", | |
" print_random_context(*w)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Old" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 1. Gather occurences\n", | |
"\n", | |
"\n", | |
"for w1, w2 in words:\n", | |
" for w in [w1, w2]:\n", | |
" if w in contexts:\n", | |
" continue\n", | |
" occurences = (np.where(text8_corpus == w)[0])\n", | |
" print(\"{} occurences of {}\".format(len(occurences), w))\n", | |
" contexts[w] = [text8_corpus[max(0, occ-window_size):occ+window_size] for occ in occurences]\n", | |
" \n", | |
" \n", | |
"\n", | |
"# 2. Print some\n", | |
"\n", | |
"N = 5\n", | |
"\n", | |
"for w1, w2 in words:\n", | |
" print \"w1={}, w2={}\".format(w1, w2)\n", | |
" print \"{} contexts:\".format(w1)\n", | |
" contexts_w1 = np.random.choice(len(contexts[w1]), N, replace=False)\n", | |
" contexts_w1 = [contexts[w1][id] for id in contexts_w1]\n", | |
" contexts_w1 = [\"\\t\" + \" \".join(s) for s in contexts_w1]\n", | |
" print \"\\n\".join(contexts_w1)\n", | |
" print \"{} contexts:\".format(w2)\n", | |
" contexts_w2 = np.random.choice(len(contexts[w2]), N, replace=False)\n", | |
" contexts_w2 = [contexts[w2][id] for id in contexts_w2]\n", | |
" contexts_w2 = [\"\\t\" + \" \".join(s) for s in contexts_w2]\n", | |
" print \"\\n\".join(contexts_w2) " | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment