Last active
December 26, 2015 17:58
-
-
Save NickRSearcy/7190393 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import nltk\n", | |
"from nltk import Tree\n", | |
"from nltk.tree import ProbabilisticTree\n", | |
"from nltk.grammar import WeightedProduction,Nonterminal,WeightedGrammar\n", | |
"import itertools" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`f_def` defines the features used in concepts, examples, and the PCFG. `len(f_def)` is equal to the number of features and `f_def[i]` is equal to the number of values for the i-th featur.\n", | |
"A probablistic CFG (PCFG) is then automatically generated with uniform weights on features and values within features." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"f_def = [2, 2]\n", | |
"literals = [ [ ('f'+str(i), str(j)) for j in range(f_def[i]) ]\n", | |
" for i in range(len(f_def))]\n", | |
"examples = [[]]\n", | |
"for literal in literals:\n", | |
" examples = [example+[l] for example in examples for l in literal]\n", | |
"examples = [dict(example) for example in examples]\n", | |
"# print(literals)\n", | |
"print(examples)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[{'f0': '0', 'f1': '0'}, {'f0': '0', 'f1': '1'}, {'f0': '1', 'f1': '0'}, {'f0': '1', 'f1': '1'}]\n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# this is the basic-level DNF grammar.\n", | |
"base = nltk.parse_pcfg(\"\"\"\n", | |
"D -> C 'or' D [0.5] | C [0.5]\n", | |
"C -> P 'and' C [0.5] | P [0.5]\n", | |
"\"\"\")\n", | |
"\n", | |
"# add productions for features and feature values\n", | |
"new_productions = []\n", | |
"P = Nonterminal('P')\n", | |
"for i in range(len(f_def)):\n", | |
" F_i = Nonterminal('F'+str(i))\n", | |
" new_productions.append(WeightedProduction(P,[F_i],prob=1/len(f_def)))\n", | |
" for j in range(f_def[i]):\n", | |
" A_j = 'f'+str(i)+'='+str(j)\n", | |
" new_productions.append(WeightedProduction(F_i,[A_j],prob=1/f_def[i]))\n", | |
"\n", | |
"dnf_grammar = WeightedGrammar(base.start(), base.productions() + new_productions)\n", | |
"print(dnf_grammar)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Grammar with 10 productions (start state = D)\n", | |
" D -> C 'or' D [0.5]\n", | |
" D -> C [0.5]\n", | |
" C -> P 'and' C [0.5]\n", | |
" C -> P [0.5]\n", | |
" P -> F0 [0.5]\n", | |
" F0 -> 'f0=0' [0.5]\n", | |
" F0 -> 'f0=1' [0.5]\n", | |
" P -> F1 [0.5]\n", | |
" F1 -> 'f1=0' [0.5]\n", | |
" F1 -> 'f1=1' [0.5]\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`pcfg_generate()` generates a PCFG parse based on `pcfg`. If passed a Tree with one or more nonterminal leaves, `pcfg_generate` will generate subtrees for each nonterminal leaf.\n", | |
"\n", | |
"**Important**: this function returns a `ProbabilisticTree` that differs from the NLTK class `ProbabilisticTree` in the meaning of the `.prob()` and `.logprob()` properties. Here, `.prob()` refers to the probability associated with the top-level parse, not the cumulative probability of the entire tree. The cumulative probability can be determined by `2**sum([st.logprob() for st in out.subtrees()])`. A helper function, `prob_tree()`, has been added to do just that. This was done to simplify recalculation of cumulative probabilities for trimmed trees." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def pcfg_generate(pcfg, start=None):\n", | |
" \n", | |
" def sample_production(node):\n", | |
" productions = pcfg.productions(node)\n", | |
" dist = [p.prob() for p in productions]\n", | |
" choice = productions[random.choice(len(dist), 1, dist)[0]]\n", | |
" return ProbabilisticTree(node.symbol(), choice.rhs(), logprob=choice.logprob())\n", | |
" \n", | |
" if not start:\n", | |
" start = pcfg.start()\n", | |
" \n", | |
" if isinstance(start, Nonterminal):\n", | |
" tree = sample_production(start)\n", | |
" elif isinstance(start, Tree):\n", | |
" tree = start.copy(deep=True)\n", | |
" else:\n", | |
" raise TypeError(\"Expected a node, tree or None\")\n", | |
" \n", | |
" for tp in tree.treepositions('leaves'):\n", | |
" if isinstance(tree[tp],Nonterminal):\n", | |
" tree[tp] = pcfg_generate(pcfg,tree[tp])\n", | |
"\n", | |
" return tree\n", | |
"\n", | |
"def prob_tree(tree):\n", | |
" return 2**(sum( st.logprob() for st in tree.subtrees() ))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`trim_tree()` selects a nonternminal node $n \\in T$ uniformly at random and removes all nodes below $n$.\n", | |
"\n", | |
"**TODO**: `trim_tree()` should accept the top-level node but currently does not" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def trim_tree(tree):\n", | |
" if not isinstance(tree, Tree):\n", | |
" raise TypeError('tree must be a Tree!')\n", | |
" if any( [isinstance(leaf, Nonterminal) for leaf in tree.leaves()] ):\n", | |
" return tree\n", | |
" \n", | |
" # randomly sample candidate index (c_i) from candidate nonterminal nodes\n", | |
" candidates = [tp for tp in tree.treepositions() if \n", | |
" tp not in set(tree.treepositions('leaves')+[()]) ]\n", | |
" c_i = candidates[random.choice(len(candidates))]\n", | |
" \n", | |
" # deep copy new tree\n", | |
" tree_p = tree.copy(deep=True)\n", | |
" \n", | |
" tree_p[c_i] = Nonterminal(tree[c_i].node)\n", | |
" return tree_p" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Given an formula (either in the form of a string or the parse tree of a DNF grammar), and an example, `evaluate_concept()` returns the evaluated example label." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def evaluate_formula(formula,example):\n", | |
" if isinstance(formula,Tree):\n", | |
" formula = ' '.join(formula.leaves())\n", | |
" terms = formula.split(' or ')\n", | |
" terms = [term.split(' and ') for term in terms]\n", | |
" for term in terms:\n", | |
" out = True\n", | |
" for literal in term:\n", | |
" [var, val] = literal.split('=')\n", | |
" if example[var] == val:\n", | |
" continue\n", | |
" else:\n", | |
" out = False\n", | |
" break\n", | |
" if out == True:\n", | |
" break\n", | |
" return out" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def m_beta(a):\n", | |
" return prod( [math.gamma(a_i) for a_i in a] ) /math.gamma(sum(a))\n", | |
"\n", | |
"def to_dict(grammar):\n", | |
" p_dict = {}\n", | |
" for production in grammar.productions():\n", | |
" if production.lhs() not in p_dict:\n", | |
" p_dict[production.lhs()] = []\n", | |
" if production.rhs() not in p_dict[production.lhs()]:\n", | |
" p_dict[production.lhs()] += [production.rhs()]\n", | |
" return p_dict\n", | |
"\n", | |
"def rr_prior(formula, grammar_dict): \n", | |
" use_count = {}\n", | |
" for key in grammar_dict:\n", | |
" use_count[key] = {terminal: 0 for terminal in grammar_dict[key]}\n", | |
" for production in tree.productions():\n", | |
" use_count[production.lhs()][production.rhs()] += 1\n", | |
" p = 1\n", | |
" for key in use_count:\n", | |
" c = list(use_count[key].values())\n", | |
" p *= m_beta( c + ones_like(c) )/m_beta(ones_like(c))\n", | |
" return p" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Cocnepts are generated from intensions." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"concept_intensions = [ \n", | |
" 'f0=0',\n", | |
" 'f0=0 and f1=0',\n", | |
" 'f0=0 or f1=0',\n", | |
" 'f0=0 and f1=1 or f0=1 and f1=0'\n", | |
" ]\n", | |
"concepts = []\n", | |
"for intension in concept_intensions:\n", | |
" concept = [(example, evaluate_formula(intension,example)) for example in examples ]\n", | |
" concepts.append(concept)\n", | |
"print(concepts)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[[({'f0': '0', 'f1': '0'}, True), ({'f0': '0', 'f1': '1'}, True), ({'f0': '1', 'f1': '0'}, False), ({'f0': '1', 'f1': '1'}, False)], [({'f0': '0', 'f1': '0'}, True), ({'f0': '0', 'f1': '1'}, False), ({'f0': '1', 'f1': '0'}, False), ({'f0': '1', 'f1': '1'}, False)], [({'f0': '0', 'f1': '0'}, True), ({'f0': '0', 'f1': '1'}, True), ({'f0': '1', 'f1': '0'}, True), ({'f0': '1', 'f1': '1'}, False)], [({'f0': '0', 'f1': '0'}, False), ({'f0': '0', 'f1': '1'}, True), ({'f0': '1', 'f1': '0'}, True), ({'f0': '1', 'f1': '1'}, False)]]\n" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The likelihood, $P(E,l(E)|F_T,p_e)$, the is probability of a formula generating a set of labels that match a concept. $p_e$ is an optional parameter that allows outliers at a certain probability." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def likelihood(concept,formula,p_e=0):\n", | |
" n_t = 0\n", | |
" for l_e in concept:\n", | |
" if evaluate_formula(formula,l_e[0])==l_e[1]:\n", | |
" n_t +=1\n", | |
" return ((1-p_e)**n_t) * ((p_e)**(len(concept)-n_t))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**TODO**:\n", | |
"\n", | |
"- fix subtree regeneration to include root node\n", | |
"- validate results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p_e = 0.1\n", | |
"c_i = 0\n", | |
"tree = pcfg_generate(dnf_grammar)\n", | |
"trim = trim_tree(tree)\n", | |
"tree_prime = pcfg_generate(dnf_grammar, trim)\n", | |
"dnf_dict = to_dict(dnf_grammar)\n", | |
"print('target concept: ',concept_intensions[c_i])\n", | |
"print('original: ',tree.flatten())\n", | |
"print('grammar prior: ', prob_tree(tree))\n", | |
"print('rr prior: ', rr_prior(tree,dnf_dict))\n", | |
"print('likelihood prior: ', likelihood(concepts[c_i],tree,p_e))\n", | |
"print('trimmed tree: ', trim.flatten())\n", | |
"print('proposal: ',tree_prime.flatten())\n", | |
"print('grammar prior: ', prob_tree(tree_prime))\n", | |
"print('rr prior: ', rr_prior(tree_prime,dnf_dict))\n", | |
"print('likelihood prior: ', likelihood(concepts[c_i],tree_prime,p_e))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"target concept: f0=0\n", | |
"original: (D f0=1 and f0=1 or f1=1)\n", | |
"grammar prior: 0.00048828125\n", | |
"rr prior: 0.000192901234568\n", | |
"likelihood prior: 0.0009000000000000002\n", | |
"trimmed tree: (D f0=1 and C or f1=1)\n", | |
"proposal: (D f0=1 and f0=0 or f1=1)\n", | |
"grammar prior: 0.00048828125\n", | |
"rr prior: 0.000192901234568\n", | |
"likelihood prior: 0.008100000000000001\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment