Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active July 4, 2018 15:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/f6f626aadb7fd7fb4954 to your computer and use it in GitHub Desktop.
Save andreasvc/f6f626aadb7fd7fb4954 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Getting tree fragments from TSG derivations\n",
"-------------------------------------------\n",
"\n",
"Below we extract a simple Tree-Substitution Grammar (TSG) and parse sentences with it,\n",
"and show which tree fragments were used in the derivations and how to extract them."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"from discodop import parser, runexp, tree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/tmp\n"
]
}
],
"source": [
"# Go to a temporary directory where we will create a simple treebank\n",
"# and store the extract grammar\n",
"%cd /tmp\n",
"with io.open('treebankExample.mrg', 'w', encoding='utf8') as out:\n",
" out.write(u\"\"\"(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))\n",
"(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (NN dog))))\n",
"(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))\n",
"(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (JJ yellow) (NN cat))))\n",
"(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))\n",
"(S (NP (DT The) (NN cat)) (VP (VBP ate) (NP (DT the) (NN dog))))\n",
"(S (NP (DT The) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Write a paramater file containing the specification for the grammar we will extract.\n",
"with io.open('mygrammar.prm', 'w', encoding='utf8') as out:\n",
" out.write(u\"\"\"stages=[\n",
" dict(name='dop', mode='pcfg', dop='doubledop',\n",
" m=1000, estimator='rfe', objective = 'mpp')\n",
"],\n",
"corpusfmt='bracket',\n",
"traincorpus=dict(\n",
" path='treebankExample.mrg', encoding='utf8',\n",
" numsents=7, maxwords=100),\n",
"testcorpus=dict(\n",
" path='treebankExample.mrg', encoding='utf8',\n",
" numsents=7, maxwords=100, skiptrain=False),\n",
"postagging=dict(\n",
" method='unknownword', model='4',\n",
" unknownthreshold=1, openclassthreshold=50,\n",
" simplelexsmooth=True),\n",
"binarization=dict(\n",
" method='default', factor='right',\n",
" h=1, v=1),\n",
"numproc=1, punct=None, functions=None, morphology=None, transformations=None, relationalrealizational=False, removeempty=False, ensureroot=False,\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7 training sentences after length restriction <= 100\n",
"known words: 10, signature types seen: 0\n",
"open class tags: \n",
"\n",
"closed class tags: DT:1 JJ:3 NN:3 VBP:2\n",
"treebank fan-out before binarization: 1 #6\n",
"(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\n",
"The mouse ate the cat\n",
"binarization: default right h=1 v=1 ; cpu time elapsed: 0.001019s\n",
"binarized treebank fan-out: 1 #6\n",
"extracting recurring fragments\n",
"finished 0--7\n",
"getting exact counts for 25 fragments\n",
"exact indices chunk 1 of 1\n",
"merged 9 cover fragments up to depth 1 with max 999 frontier non-terminals.\n",
"found 34 fragments\n",
"DOP model based on 7 sentences, 69 nodes, 45 nonterminals\n",
"labels: 45 of which preterminals: 11\n",
"clauses: 74 lexical clauses: 21 non-lexical clauses: 53\n",
"max fan-out: 1 in 7/7 01\tVP VBP NP mean: 1\n",
"max variables: 2 in 7/7 01\tVP VBP NP\n",
"max parsing complexity: 3 in 1/1 01\tS}<13> S}<8> VBP mean 2.43243\n",
"All left hand sides sum to 1 +/- epsilon=1e-16\n",
"equal number of nodes, but not equivalent:\n",
"coarse labels without mapping: { DT, DT@The, DT@the, JJ, NN, NN@cat, NN@dog, NN@mouse, NP, NP|<JJ>, ... }\n",
"wrote grammar to mygrammar/dop.{rules,lex,backtransform}.gz\n"
]
}
],
"source": [
"# Extract the grammar using the command line interface;\n",
"# the grammar will end up in several files under /tmp/mygrammar/\n",
"!discodop grammar param mygrammar.prm mygrammar"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(NP (DT 0=) (NN 1=))\t11\r\n",
"(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))\t3\r\n",
"(NP (DT 0=) (NN 1=cat))\t6\r\n",
"(NP (DT 0=) (NP|<JJ> 1=))\t3\r\n",
"(NP|<JJ> (JJ 0=) (NN 1=))\t3\r\n",
"(S (NP (DT 0=The) (NN 1=mouse)) (VP (VBP 2=saw) (NP 3=)))\t2\r\n",
"(S (NP (DT 0=The) (NN 1=)) (VP (VBP 2=saw) (NP (DT 3=the) (NP|<JJ> (JJ 4=) (NN 5=)))))\t2\r\n",
"(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))\t5\r\n",
"(S (NP 0=) (VP (VBP 1=saw) (NP (DT 2=the) (NN 3=))))\t3\r\n",
"(S (NP (DT 0=The) (NN 1=)) (VP (VBP 2=) (NP (DT 3=the) (NN 4=))))\t4\r\n"
]
}
],
"source": [
"# The fragments that this grammar is composed of are listed in mygrammar/dop.fragments.gz\n",
"! zcat mygrammar/dop.fragments.gz | head"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Load the grammar and construct a Parser object for it\n",
"top = 'S' # the root label in the treebank\n",
"directory = 'mygrammar'\n",
"params = parser.readparam(directory + '/params.prm')\n",
"parser.readgrammars(directory, params.stages, params.postagging, top=getattr(params, 'top', top))\n",
"myparser = parser.Parser(params)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[DictObj(msg='DOP:\\titems 20, edges 22, blocked 0\\n\\tdisambiguation: 4 derivations, 1 parsetrees, 0.000451s\\n\\tp=6.407e-05 0.00s cpu time elapsed\\n',\n",
" \tname='dop',\n",
" \tprob=6.406868578758892e-05,\n",
" \tparsetrees=[('(S (NP (DT 0) (NP|<JJ> (JJ 1) (NN 2))) (VP (VBP 3) (NP (DT 4) (NN 5))))', 6.406868578758892e-05, ['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)'])],\n",
" \tnoparse=False,\n",
" \tgolditems=0,\n",
" \ttotalgolditems=0,\n",
" \telapsedtime=0.0015919999999999268,\n",
" \tparsetree=ParentedTree('S', [ParentedTree('NP', [ParentedTree('DT', [0]), ParentedTree('JJ', [1]), ParentedTree('NN', [2])]), ParentedTree('VP', [ParentedTree('VBP', [3]), ParentedTree('NP', [ParentedTree('DT', [4]), ParentedTree('NN', [5])])])]),\n",
" \tnumitems=20,\n",
" \tfragments=['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)'])]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We now parse a sentence with two different probability models.\n",
"# The first is the relative frequency estimate (RFE),\n",
"# the second is the shortest derivation criterion, with ties\n",
"# broken by relative frequencies (the most probable shortest derivation, MPSD).\n",
"# With this small treebank they give the same result, but with\n",
"# a larger treebank these two disambiguation methods can select\n",
"# a different best parse.\n",
"sent = 'The hungry dog ate the dog'\n",
"myparser.stages[-1].estimator = 'rfe'\n",
"result = list(myparser.parse(sent.split()))\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))',\n",
" '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))',\n",
" '(DT 0=The)',\n",
" '(JJ 0=hungry)',\n",
" '(NN 0=dog)',\n",
" '(VBP 0=ate)',\n",
" '(NN 0=dog)']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# These are the fragments used in the Most Probable Derivation\n",
"result[0].fragments"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" S \n",
" ┌───────┴───┐ \n",
" │ VP \n",
" │ ┌───────┴───┐ \n",
" │ │ NP \n",
" │ │ ┌───┴───┐ \n",
" NP VBP DT NN\n",
" │ │ │ │ \n",
"... ... the ...\n",
"\n",
" NP \n",
" ┌───┴─────┐ \n",
" │ NP|<JJ> \n",
" │ ┌─────┴─────┐ \n",
" DT JJ NN\n",
" │ │ │ \n",
"... ... ...\n",
"\n",
" DT\n",
" │ \n",
"The\n",
"\n",
" JJ \n",
" │ \n",
"hungry\n",
"\n",
" NN\n",
" │ \n",
"dog\n",
"\n",
"VBP\n",
" │ \n",
"ate\n",
"\n",
" NN\n",
" │ \n",
"dog\n",
"\n"
]
}
],
"source": [
"for a in result[0].fragments: print(tree.DrawTree(a))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" S \n",
" ┌───────┴───┐ \n",
" │ VP \n",
" │ ┌───────┴───┐ \n",
" │ │ NP \n",
" │ │ ┌───┴───┐ \n",
" NP VBP DT NN\n",
" │ │ │ │ \n",
"... ... the ...\n",
"\n",
" NP \n",
" ┌───┴─────┐ \n",
" │ NP|<JJ> \n",
" │ ┌─────┴─────┐ \n",
" DT JJ NN\n",
" │ │ │ \n",
"... ... ...\n",
"\n",
" DT\n",
" │ \n",
"The\n",
"\n",
" JJ \n",
" │ \n",
"hungry\n",
"\n",
" NN\n",
" │ \n",
"dog\n",
"\n",
"VBP\n",
" │ \n",
"ate\n",
"\n",
" NN\n",
" │ \n",
"dog\n",
"\n"
]
}
],
"source": [
"# Now we switch to the Most Probable Shortest Derivation\n",
"myparser.stages[-1].estimator = 'shortest'\n",
"result = list(myparser.parse(sent.split()))\n",
"for a in result[0].fragments: print(tree.DrawTree(a))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment