Created
September 4, 2014 14:00
-
-
Save arne-cl/52d5747c2f14cb771824 to your computer and use it in GitHub Desktop.
DiscourseGraphs: extracting coreference chains from MMAX2 annotated corpora
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:143480b57bc562208ab4ea3a75ca5ae16e2fce0e332208440d2c8c81a7f9f3b9" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
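"# Playing with MMAX2 coreference data in DiscourseGraphs\n", | |
"\n", | |
"This notebook loads an MMAX2 annotation file into an `MMAXDocumentGraph` and inspects the coreference chains found in it." | |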
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from discoursegraphs.readwrite import MMAXDocumentGraph\n", | |
"from discoursegraphs import get_pointing_chains\n", | |
"\n", | |
"MMAX_FILE = '/home/arne/repos/pcc-annis-merged/maz176/coreference/maz-1423.mmax'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mdg = MMAXDocumentGraph(MMAX_FILE)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
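"## Extracting coreference chains\n", | |
"\n", | |
"`get_pointing_chains` returns one list of markable IDs per chain; in the output below, each markable is followed by the antecedent it points to." | |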
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"get_pointing_chains(mdg)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"[['markable_22',\n", | |
" 'markable_19',\n", | |
" 'markable_17',\n", | |
" 'markable_14',\n", | |
" 'markable_12',\n", | |
" 'markable_11'],\n", | |
" ['markable_21', 'markable_10', 'markable_8', 'markable_7', 'markable_2']]" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for chain in get_pointing_chains(mdg):\n", | |
" for node_id in chain:\n", | |
" print node_id, mdg.node[node_id], '\\n'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"markable_22 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_22', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_22:primmark', 'mmax:span': 'word_180', 'mmax:anaphor_antecedent': 'markable_19', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_19 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_19', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_19:primmark', 'mmax:span': 'word_160..word_161', 'mmax:anaphor_antecedent': 'markable_17', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_17 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_17', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'indir-obj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_17:primmark', 'mmax:span': 'word_149', 'mmax:anaphor_antecedent': 'markable_14', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_14 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'anaphoric', 'mmax:id': 'markable_14', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_14:primmark', 'mmax:span': 'word_116', 'mmax:anaphor_antecedent': 'markable_12', 'mmax:phrase_type': 'np', 'mmax:np_form': 'ne', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_12 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'defnp', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_12', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'other', 'mmax:phrase_type': 'pp', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_12:primmark', 'mmax:span': 'word_101..word_103', 'mmax:anaphor_antecedent': 'markable_11', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_11 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'none', 'mmax:id': 'markable_11', 'mmax:grammatical_role': 'other', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_11:primmark', 'mmax:span': 'word_10', 'mmax:anaphor_antecedent': 'empty', 'mmax:phrase_type': 'np', 'mmax:np_form': 'ne', 'mmax:referentiality': 'discourse-new', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_21 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'defnp', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_21', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'dir-obj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_21:primmark', 'mmax:span': 'word_177..word_178', 'mmax:anaphor_antecedent': 'markable_10', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_10 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'anaphoric', 'mmax:id': 'markable_10', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_10:primmark', 'mmax:span': 'word_94..word_95', 'mmax:anaphor_antecedent': 'markable_8', 'mmax:phrase_type': 'np', 'mmax:np_form': 'defnp', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_8 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'defnp', 'layers': set(['mmax', 'mmax:markable']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_8', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'dir-obj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_8:primmark', 'mmax:span': 'word_73..word_74', 'mmax:anaphor_antecedent': 'markable_7', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_7 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'anaphoric', 'mmax:id': 'markable_7', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'other', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_7:primmark', 'mmax:span': 'word_60..word_63', 'mmax:anaphor_antecedent': 'markable_2', 'mmax:phrase_type': 'pp', 'mmax:np_form': 'defnp', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} \n", | |
"\n", | |
"markable_2 {'layers': set(['mmax', 'mmax:markable']), 'mmax:dir_speech': 'text_level', 'mmax:type': 'none', 'mmax:id': 'markable_2', 'mmax:grammatical_role': 'sbj', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_2:primmark', 'mmax:span': 'word_8..word_11', 'mmax:anaphor_antecedent': 'empty', 'mmax:phrase_type': 'np', 'mmax:np_form': 'defnp', 'mmax:referentiality': 'discourse-new', 'mmax:complex_np': 'no'} \n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from discoursegraphs.readwrite.mmax2 import spanstring2text\n", | |
"\n", | |
"for chain in get_pointing_chains(mdg):\n", | |
" for node_id in chain:\n", | |
" print node_id, spanstring2text(mdg, mdg.node[node_id]['mmax:span'])\n", | |
" print '\\n'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"markable_22 Wittstock\n", | |
"markable_19 die Dosse-Stadt\n", | |
"markable_17 Wittstock\n", | |
"markable_14 Wittstock\n", | |
"markable_12 in der Region\n", | |
"markable_11 Wittstocker\n", | |
"\n", | |
"\n", | |
"markable_21 die Halle\n", | |
"markable_10 Die Halle\n", | |
"markable_8 die Halle\n", | |
"markable_7 f\u00fcr den schmucken Veranstaltungsort\n", | |
"markable_2 die neue Wittstocker Stadthalle\n", | |
"\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for chain in get_pointing_chains(mdg):\n", | |
" for node_id in chain:\n", | |
" print mdg.in_edges(node_id, data=True)\n", | |
" print mdg.out_edges(node_id, data=True)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[]\n", | |
"[('markable_22', 'markable_19', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_22', 'word_180', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_22', 'markable_19', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_19', 'markable_17', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_19', 'word_160', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_19', 'word_161', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_19', 'markable_17', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_17', 'word_149', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_17', 'markable_14', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_17', 'markable_14', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_14', 'markable_12', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_14', 'word_116', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_14', 'markable_12', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_12', 'markable_11', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_12', 'word_101', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_12', 'word_102', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_12', 'word_103', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_12', 'markable_11', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_11', 'word_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[]\n", | |
"[('markable_21', 'word_178', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_21', 'markable_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_21', 'word_177', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_21', 'markable_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_10', 'markable_8', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_10', 'word_95', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_10', 'word_94', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_10', 'markable_8', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_8', 'word_73', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_8', 'word_74', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_8', 'markable_7', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_8', 'markable_7', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_7', 'markable_2', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_7', 'word_62', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_7', 'word_63', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_7', 'word_60', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_7', 'word_61', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n", | |
"[('markable_7', 'markable_2', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]\n", | |
"[('markable_2', 'word_10', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_2', 'word_9', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_2', 'word_8', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_2', 'word_11', {'layers': set(['mmax', 'mmax:markable']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]\n" | |
] | |
} | |
], | |
"prompt_number": 44 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment