# parajain/tree_to_clause.py

## tree_to_clause.py
'''
https://www.clips.uantwerpen.be/conll2001/clauses/
Clauses are word sequences which contain a subject and a predicate. Here is an example of a sentence and its clauses obtained from Wall Street Journal section 15 of the Penn Treebank [MSM93]:

   (S The deregulation of railroads and trucking companies
      (SBAR that
          (S began in 1980)
      )
      enabled
      (S shippers to bargain for transportation)
      .
   )
The clauses of this sentence have been enclosed between brackets. A tag next to the open bracket denotes the type of the clause.

In the CoNLL-2001 shared task, the goal is to identify clauses in text. Training and test data for this task are available. This data consists of the same partitions of the Wall Street Journal part (WSJ) of the Penn Treebank as the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as test data (47377 tokens). The clause segmentation of the data has been derived from the Penn Treebank by a program written by Sabine Buchholz from Tilburg University, The Netherlands.

The shared task consists of three parts: identifying clause start positions, recognizing clause end positions and building complete clauses. We have not used clauses labeled with FRAG or RRC, and all clause labels have been converted to S. The goal of this task is to come forward with machine learning methods which after a training phase can recognize the clause segmentation of the test data as well as possible. For all three parts of the shared task, the clause segmentation methods will be evaluated with the F rate, which is a combination of the precision and recall rates: F = 2*precision*recall / (recall+precision) [Rij79].
'''

from nltk import Tree

# Constituency parse (Penn Treebank bracket notation) of the sentence that is
# split into clause fragments below.
parse_str = "(ROOT (S (NP (PRP You)) (VP (MD could) (VP (VB say) (SBAR (IN that) (S (NP (PRP they)) (ADVP (RB regularly)) (VP (VB catch) (NP (NP (DT a) (NN shower)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBZ adds) (PP (TO to) (NP (NP (PRP$ their) (NN exhilaration)) (CC and) (NP (FW joie) (FW de) (FW vivre))))))))))))) (. .)))"
# Alternative input in which the subordinate clause opens the sentence:
#parse_str = "(ROOT (S (SBAR (IN Though) (S (NP (PRP he)) (VP (VBD was) (ADJP (RB very) (JJ rich))))) (, ,) (NP (PRP he)) (VP (VBD was) (ADVP (RB still)) (ADJP (RB very) (JJ unhappy))) (. .)))"

t = Tree.fromstring(parse_str)

# Space-joined leaf text of every S / SBAR constituent, in the order
# t.subtrees() yields them.
subtexts = [
    ' '.join(subtree.leaves())
    for subtree in t.subtrees()
    if subtree.label() in ("S", "SBAR")
]

# Untruncated copy; the loop below shortens subtexts in place, and the
# leftover computation needs the full clause texts.
presubtexts = subtexts[:]       # ADDED IN EDIT for leftover check

# Walk backwards so that each clause is cut at the point where the (already
# shortened) clause after it starts, leaving only the words in front of it.
# NOTE: this is plain character-level substring matching, not token matching.
for i in reversed(range(len(subtexts) - 1)):
    cut = subtexts[i].find(subtexts[i + 1])
    # find() gives -1 when the following clause is not contained in this one
    # (e.g. sibling clauses); keep the clause whole instead of raising the
    # ValueError that str.index() would.
    if cut != -1:
        subtexts[i] = subtexts[i][:cut]

# print(x) with a single argument behaves the same on Python 2 and Python 3.
for text in subtexts:
    print(text)

# ADDED IN EDIT - Not sure for generalized cases
# Text of the first clause that follows its first embedded clause (for the
# sentence above this is the trailing " .").  Empty when there is no second
# clause or it cannot be located inside the first one.
leftover = ''
if len(presubtexts) > 1:
    start = presubtexts[0].find(presubtexts[1])
    if start != -1:
        leftover = presubtexts[0][start + len(presubtexts[1]):]
print(leftover)
	'''
	https://www.clips.uantwerpen.be/conll2001/clauses/
	Clauses are word sequences which contain a subject and a predicate. Here is an example of a sentence and its clauses obtained from Wall Street Journal section 15 of the Penn Treebank [MSM93]:

	   (S The deregulation of railroads and trucking companies
	      (SBAR that
	          (S began in 1980)
	      )
	      enabled
	      (S shippers to bargain for transportation)
	      .
	   )
	The clauses of this sentence have been enclosed between brackets. A tag next to the open bracket denotes the type of the clause.

	In the CoNLL-2001 shared task, the goal is to identify clauses in text. Training and test data for this task are available. This data consists of the same partitions of the Wall Street Journal part (WSJ) of the Penn Treebank as the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as test data (47377 tokens). The clause segmentation of the data has been derived from the Penn Treebank by a program written by Sabine Buchholz from Tilburg University, The Netherlands.

	The shared task consists of three parts: identifying clause start positions, recognizing clause end positions and building complete clauses. We have not used clauses labeled with FRAG or RRC, and all clause labels have been converted to S. The goal of this task is to come forward with machine learning methods which after a training phase can recognize the clause segmentation of the test data as well as possible. For all three parts of the shared task, the clause segmentation methods will be evaluated with the F rate, which is a combination of the precision and recall rates: F = 2*precision*recall / (recall+precision) [Rij79].
	'''

	from nltk import Tree

	# Constituency parse (Penn Treebank bracket notation) of the sentence that
	# is split into clause fragments below.
	parse_str = "(ROOT (S (NP (PRP You)) (VP (MD could) (VP (VB say) (SBAR (IN that) (S (NP (PRP they)) (ADVP (RB regularly)) (VP (VB catch) (NP (NP (DT a) (NN shower)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBZ adds) (PP (TO to) (NP (NP (PRP$ their) (NN exhilaration)) (CC and) (NP (FW joie) (FW de) (FW vivre))))))))))))) (. .)))"
	# Alternative input in which the subordinate clause opens the sentence:
	#parse_str = "(ROOT (S (SBAR (IN Though) (S (NP (PRP he)) (VP (VBD was) (ADJP (RB very) (JJ rich))))) (, ,) (NP (PRP he)) (VP (VBD was) (ADVP (RB still)) (ADJP (RB very) (JJ unhappy))) (. .)))"

	t = Tree.fromstring(parse_str)

	# Space-joined leaf text of every S / SBAR constituent, in the order
	# t.subtrees() yields them.
	subtexts = []
	for subtree in t.subtrees():
		if subtree.label() in ("S", "SBAR"):
			subtexts.append(' '.join(subtree.leaves()))

	# Untruncated copy; the loop below shortens subtexts in place, and the
	# leftover computation needs the full clause texts.
	presubtexts = subtexts[:] # ADDED IN EDIT for leftover check

	# Walk backwards so that each clause is cut at the point where the (already
	# shortened) clause after it starts, leaving only the words in front of it.
	# NOTE: this is plain character-level substring matching, not token matching.
	for i in reversed(range(len(subtexts) - 1)):
		cut = subtexts[i].find(subtexts[i + 1])
		# find() gives -1 when the following clause is not contained in this
		# one (e.g. sibling clauses); keep the clause whole instead of raising
		# the ValueError that str.index() would.
		if cut != -1:
			subtexts[i] = subtexts[i][:cut]

	# print(x) with a single argument behaves the same on Python 2 and Python 3.
	for text in subtexts:
		print(text)

	# ADDED IN EDIT - Not sure for generalized cases
	# Text of the first clause that follows its first embedded clause (for the
	# sentence above this is the trailing " .").  Empty when there is no second
	# clause or it cannot be located inside the first one.
	leftover = ''
	if len(presubtexts) > 1:
		start = presubtexts[0].find(presubtexts[1])
		if start != -1:
			leftover = presubtexts[0][start + len(presubtexts[1]):]
	print(leftover)