Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save arne-cl/52d5747c2f14cb771824 to your computer and use it in GitHub Desktop.
Save arne-cl/52d5747c2f14cb771824 to your computer and use it in GitHub Desktop.
DiscourseGraphs: extracting coreference chains from MMAX2 annotated corpora
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:143480b57bc562208ab4ea3a75ca5ae16e2fce0e332208440d2c8c81a7f9f3b9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Playing with MMAX2 coreference data in DiscourseGraphs"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from discoursegraphs.readwrite import MMAXDocumentGraph\n",
"from discoursegraphs import get_pointing_chains\n",
"\n",
"MMAX_FILE = '/home/arne/repos/pcc-annis-merged/maz176/coreference/maz-1423.mmax'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mdg = MMAXDocumentGraph(MMAX_FILE)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extracting coreference chains"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_pointing_chains(mdg)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"[['markable_22',\n",
"  'markable_19',\n",
"  'markable_17',\n",
"  'markable_14',\n",
"  'markable_12',\n",
"  'markable_11'],\n",
" ['markable_21', 'markable_10', 'markable_8', 'markable_7', 'markable_2']]"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for chain in get_pointing_chains(mdg):\n",
"    for node_id in chain:\n",
"        print node_id, mdg.node[node_id], '\\n'"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"markable_22 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_22', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_22:primmark', 'mmax:span': 'word_180', 'mmax:anaphor_antecedent': 'markable_19', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_19 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_19', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_19:primmark', 'mmax:span': 'word_160..word_161', 'mmax:anaphor_antecedent': 'markable_17', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_17 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_17', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'indir-obj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_17:primmark', 'mmax:span': 'word_149', 'mmax:anaphor_antecedent': 'markable_14', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_14 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'anaphoric', 'mmax:id': 'markable_14', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_14:primmark', 'mmax:span': 'word_116', 'mmax:anaphor_antecedent': 'markable_12', 'mmax:phrase_type': 'np', 'mmax:np_form': 'ne', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_12 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'defnp', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_12', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'other', 'mmax:phrase_type': 'pp', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_12:primmark', 'mmax:span': 'word_101..word_103', 'mmax:anaphor_antecedent': 'markable_11', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_11 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'none', 'mmax:id': 'markable_11', 'mmax:grammatical_role': 'other', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_11:primmark', 'mmax:span': 'word_10', 'mmax:anaphor_antecedent': 'empty', 'mmax:phrase_type': 'np', 'mmax:np_form': 'ne', 'mmax:referentiality': 'discourse-new', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_21 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'defnp', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_21', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'dir-obj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_21:primmark', 'mmax:span': 'word_177..word_178', 'mmax:anaphor_antecedent': 'markable_10', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_10 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'anaphoric', 'mmax:id': 'markable_10', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_10:primmark', 'mmax:span': 'word_94..word_95', 'mmax:anaphor_antecedent': 'markable_8', 'mmax:phrase_type': 'np', 'mmax:np_form': 'defnp', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_8 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'defnp', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_8', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'dir-obj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_8:primmark', 'mmax:span': 'word_73..word_74', 'mmax:anaphor_antecedent': 'markable_7', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_7 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'anaphoric', 'mmax:id': 'markable_7', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'other', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_7:primmark', 'mmax:span': 'word_60..word_63', 'mmax:anaphor_antecedent': 'markable_2', 'mmax:phrase_type': 'pp', 'mmax:np_form': 'defnp', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n",
"\n",
"markable_2 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'none', 'mmax:id': 'markable_2', 'mmax:grammatical_role': 'sbj', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_2:primmark', 'mmax:span': 'word_8..word_11', 'mmax:anaphor_antecedent': 'empty', 'mmax:phrase_type': 'np', 'mmax:np_form': 'defnp', 'mmax:referentiality': 'discourse-new', 'mmax:complex_np': 'no'} \n",
"\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from discoursegraphs.readwrite.mmax2 import spanstring2text\n",
"\n",
"for chain in get_pointing_chains(mdg):\n",
"    for node_id in chain:\n",
"        print node_id, spanstring2text(mdg, mdg.node[node_id]['mmax:span'])\n",
"    print '\\n'"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"markable_22 Wittstock\n",
"markable_19 die Dosse-Stadt\n",
"markable_17 Wittstock\n",
"markable_14 Wittstock\n",
"markable_12 in der Region\n",
"markable_11 Wittstocker\n",
"\n",
"\n",
"markable_21 die Halle\n",
"markable_10 Die Halle\n",
"markable_8 die Halle\n",
"markable_7 f\u00fcr den schmucken Veranstaltungsort\n",
"markable_2 die neue Wittstocker Stadthalle\n",
"\n",
"\n"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for chain in get_pointing_chains(mdg):\n",
"    for node_id in chain:\n",
"        print mdg.in_edges(node_id, data=True)\n",
"        print mdg.out_edges(node_id, data=True)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[]\n",
"[('markable_22', 'markable_19', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_22', 'word_180', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_22', 'markable_19', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_19', 'markable_17', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_19', 'word_160', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_19', 'word_161', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_19', 'markable_17', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_17', 'word_149', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_17', 'markable_14', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_17', 'markable_14', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_14', 'markable_12', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_14', 'word_116', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_14', 'markable_12', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_12', 'markable_11', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_12', 'word_101', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_12', 'word_102', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_12', 'word_103', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_12', 'markable_11', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_11', 'word_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[]\n",
"[('markable_21', 'word_178', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_21', 'markable_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_21', 'word_177', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_21', 'markable_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_10', 'markable_8', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_10', 'word_95', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_10', 'word_94', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_10', 'markable_8', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_8', 'word_73', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_8', 'word_74', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_8', 'markable_7', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_8', 'markable_7', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_7', 'markable_2', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_7', 'word_62', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_7', 'word_63', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_7', 'word_60', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_7', 'word_61', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n",
"[('markable_7', 'markable_2', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n",
"[('markable_2', 'word_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_2', 'word_9', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_2', 'word_8', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_2', 'word_11', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n"
]
}
],
"prompt_number": 44
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment