erickpeirson/authoritativeText.py

## authoritativeText.py
import nltk

class AuthoritativeText(object):
    """
    A text plus token-level indices that point back into the original string.

    On construction the text is split into paragraphs, sentences and word
    tokens, and the character span of every word token in the original
    string is recorded.

    Attributes
    ----------
    original : str or unicode
        The text passed to the constructor.
    T : list
        Word tokens.
    P : list
        Paragraph start indices (in `T`).
    S : list
        Sentence start indices (in `T`).
    I : list
        ``(start, end)`` character positions of each token in `original`.
        `end` is the position of the token's last character (inclusive).
    """

    def __init__(self, text, paragraph_tokenizer=None,
                 sentence_tokenizer=None, word_tokenizer=None):
        """
        Validate `text`, store it, and build the token indices.

        Parameters
        ----------
        text : str or unicode
            A raw text string, e.g. scraped from a document.
        paragraph_tokenizer, sentence_tokenizer, word_tokenizer : callable
            Optional replacements for the default NLTK tokenizers; see
            :meth:`tokenize_index`.

        Raises
        ------
        ValueError
            If `text` is not a string.
        """
        try:
            string_types = (str, unicode)    # Python 2.
        except NameError:
            string_types = (str,)            # Python 3 has no `unicode`.
        if not isinstance(text, string_types):
            raise ValueError("`text` must be str or unicode.")

        self.original = text
        self.tokenize_index(text,
                            paragraph_tokenizer=paragraph_tokenizer,
                            sentence_tokenizer=sentence_tokenizer,
                            word_tokenizer=word_tokenizer)

    def tokenize_index(self, text, paragraph_tokenizer=None,
                       sentence_tokenizer=None, word_tokenizer=None):
        """
        Decompose a `text` into paragraphs, sentences, and words.

        The instance's `T`, `P`, `S` and `I` lists are rebuilt from scratch
        on every call, so they always describe `text` only.

        Parameters
        ----------
        text : str or unicode
            A raw text string, e.g. scraped from a document.
        paragraph_tokenizer : callable, optional
            Maps the text to a sequence of paragraph strings. Defaults to
            ``nltk.tokenize.TextTilingTokenizer().tokenize``.
        sentence_tokenizer : callable, optional
            Maps a paragraph to a sequence of sentence strings. Defaults to
            ``nltk.tokenize.sent_tokenize``.
        word_tokenizer : callable, optional
            Maps a sentence to a sequence of word tokens. Defaults to
            ``nltk.tokenize.word_tokenize``.

        Returns
        -------
        T : list
            String word tokens.
        P : list
            Integer paragraph start indices (in `T`).
        S : list
            Integer sentence start indices (in `T`).
        I : list
            Tuples containing the start and (inclusive) end indices of word
            tokens in the original text string.

        Raises
        ------
        ValueError
            If a tokenizer returns a piece that does not occur verbatim in
            its source string at or after the previous piece.
        """
        # Per-instance containers: class-level lists would be shared (and
        # keep growing) across every AuthoritativeText ever created.
        self.T, self.P, self.S, self.I = [], [], [], []

        if not text.strip():
            # Nothing to tokenize; skip the tokenizers entirely.
            return self.T, self.P, self.S, self.I

        if paragraph_tokenizer is None:
            paragraph_tokenizer = nltk.tokenize.TextTilingTokenizer().tokenize
        if sentence_tokenizer is None:
            sentence_tokenizer = nltk.tokenize.sent_tokenize
        if word_tokenizer is None:
            word_tokenizer = nltk.tokenize.word_tokenize

        # Each level searches forward from a cursor placed just past the
        # previous match, so repeated paragraphs/sentences/words (or a word
        # embedded in an earlier one, e.g. "an" in "San") are never matched
        # against an earlier occurrence.
        para_cursor = 0
        for para in paragraph_tokenizer(text):
            para_start = self._find_from(text, para, para_cursor, "paragraph")
            para_cursor = para_start + len(para)
            self.P.append(len(self.T))    # Paragraph start index in T.

            sent_cursor = 0
            for sent in sentence_tokenizer(para):
                sent_offset = self._find_from(para, sent, sent_cursor,
                                              "sentence")
                sent_cursor = sent_offset + len(sent)
                sent_start = para_start + sent_offset   # ...in original text.
                self.S.append(len(self.T))    # Sentence start index in T.

                word_cursor = 0
                for word in word_tokenizer(sent):
                    word_offset = self._find_from(sent, word, word_cursor,
                                                  "word")
                    word_cursor = word_offset + len(word)
                    start = sent_start + word_offset
                    self.T.append(word)
                    # End index is the last character of the word.
                    self.I.append((start, start + len(word) - 1))

        return self.T, self.P, self.S, self.I

    @staticmethod
    def _find_from(haystack, needle, start, label):
        """Return the index of `needle` in `haystack` at or after `start`."""
        position = haystack.find(needle, start)
        if position < 0:
            # Happens when a tokenizer rewrites characters instead of
            # returning verbatim substrings; offsets cannot be recovered.
            raise ValueError(
                "%s %r not found in its source text at or after offset %d; "
                "tokenizers must return verbatim substrings."
                % (label, needle, start))
        return position

## usage.md

      
    Raw
  

              usage.md
            
          
    Given some text string...
>>> stearns = """Jason Stearns was born in San Francisco in 1976 to Stephen C. Stearns, an evolutionary biologist, and Beverly Peterson Stearns, a journalist. He has an older brother, Justin, who is professor of Middle Eastern history at New York University.

At the age of six, the family moved to Switzerland, where Stephen Stearns taught biology at the University of Basel. Stearns attended Swiss public school in Arlesheim and Muenchenstein, on the outskirts of Basel, and spent a year in Laja, Chile, on an exchange program. Upon graduation from Gymnasium Muenchenstein, he volunteered at the Swiss Tropical Institute's field research station in Ifakara, Tanzania.

In 1997, Jason joined his older brother in the United States, attending first Hampshire College and then neighboring Amherst College in Massachusetts. He graduated with a degree in political science. Having been accepted to law school at Harvard Law School, he traveled to Bukavu in the eastern Democratic Republic of the Congo to volunteer with Héritiers de la Justice, a local human rights group in September 2001. Deferring law school, he went on to work for the International Human Rights Law Group and the United Nations peacekeeping operation MONUC for the following two years. He eventually decided not to attend law school, enrolling in a PhD program in political science at Yale University in 2009.

Between 2005 and 2007, Stearns was based in Nairobi, Kenya, as a senior analyst for the International Crisis Group, working on the Democratic Republic of the Congo, Rwanda and Burundi. In 2007, he left to spend a year and a half researching and writing Dancing in the Glory of Monsters, a history of the Congo wars of 1996-2003, based on interviews with leading protagonists of the conflict. The title stems from a speech given by Congolese President Laurent Kabila in which he castigates Congolese for blaming their woes on a few political leaders, suggesting that the political malaise in the country is a more systemic problem. The book, which was eventually published in 2011, received critical acclaim in major newspapers and magazines.

In 2008, Stearns was named as coordinator of the United Nations Group of Experts on the Congo, a panel responsible for researching support and financing of armed groups in the eastern Democratic Republic of the Congo. In their final report, the Group found both the Rwandan and Congolese governments guilty of violating United Nations sanctions. According to eyewitness testimony, phone records and other documentary evidence, the Rwandan government had provided military support to the National Congress for the Defense of the People (CNDP) rebel group, while the Congolese government had collaborated with the Democratic Forces for the Liberation of Rwanda (FDLR) rebels, as well as other Congolese armed groups.[1]

Following the report, the Swedish and Dutch government temporarily suspended aid to the Rwandan government. Several months after the publication of the report, the Rwandan government arrested the leader of the CNDP, Laurent Nkunda, and struck a peace deal with the Congolese government.

In 2010, Stearns married Lusungu Kayani, a Tanzanian-American employee of the United Nations and a fashion and textile entrepreneur."""
`AuthoritativeText` tokenizes and indexes the text upon initialization:
>>> t = AuthoritativeText(stearns)
Word tokens paired with their (start, end) character indices in the original text (the end index is inclusive):
>>> print zip(t.T, t.I)[0:20]
[('Jason', (0, 4)), ('Stearns', (6, 12)), ('was', (14, 16)), ('born', (18, 21)), ('in', (23, 24)), ('San', (26, 28)), ('Francisco', (30, 38)), ('in', (23, 24)), ('1976', (43, 46)), ('to', (48, 49)), ('Stephen', (51, 57)), ('C.', (59, 60)), ('Stearns', (62, 68)), (',', (69, 69)), ('an', (27, 28)), ('evolutionary', (74, 85)), ('biologist', (87, 95)), (',', (69, 69)), ('and', (98, 100)), ('Beverly', (102, 108))]
	import nltk

	class AuthoritativeText(object):
	    """
	    A text plus token-level indices that point back into the original string.

	    On construction the text is split into paragraphs, sentences and word
	    tokens, and the character span of every word token in the original
	    string is recorded.

	    Attributes
	    ----------
	    original : str or unicode
	        The text passed to the constructor.
	    T : list
	        Word tokens.
	    P : list
	        Paragraph start indices (in `T`).
	    S : list
	        Sentence start indices (in `T`).
	    I : list
	        ``(start, end)`` character positions of each token in `original`.
	        `end` is the position of the token's last character (inclusive).
	    """

	    def __init__(self, text, paragraph_tokenizer=None,
	                 sentence_tokenizer=None, word_tokenizer=None):
	        """
	        Validate `text`, store it, and build the token indices.

	        Parameters
	        ----------
	        text : str or unicode
	            A raw text string, e.g. scraped from a document.
	        paragraph_tokenizer, sentence_tokenizer, word_tokenizer : callable
	            Optional replacements for the default NLTK tokenizers; see
	            :meth:`tokenize_index`.

	        Raises
	        ------
	        ValueError
	            If `text` is not a string.
	        """
	        try:
	            string_types = (str, unicode)    # Python 2.
	        except NameError:
	            string_types = (str,)            # Python 3 has no `unicode`.
	        if not isinstance(text, string_types):
	            raise ValueError("`text` must be str or unicode.")

	        self.original = text
	        self.tokenize_index(text,
	                            paragraph_tokenizer=paragraph_tokenizer,
	                            sentence_tokenizer=sentence_tokenizer,
	                            word_tokenizer=word_tokenizer)

	    def tokenize_index(self, text, paragraph_tokenizer=None,
	                       sentence_tokenizer=None, word_tokenizer=None):
	        """
	        Decompose a `text` into paragraphs, sentences, and words.

	        The instance's `T`, `P`, `S` and `I` lists are rebuilt from scratch
	        on every call, so they always describe `text` only.

	        Parameters
	        ----------
	        text : str or unicode
	            A raw text string, e.g. scraped from a document.
	        paragraph_tokenizer : callable, optional
	            Maps the text to a sequence of paragraph strings. Defaults to
	            ``nltk.tokenize.TextTilingTokenizer().tokenize``.
	        sentence_tokenizer : callable, optional
	            Maps a paragraph to a sequence of sentence strings. Defaults to
	            ``nltk.tokenize.sent_tokenize``.
	        word_tokenizer : callable, optional
	            Maps a sentence to a sequence of word tokens. Defaults to
	            ``nltk.tokenize.word_tokenize``.

	        Returns
	        -------
	        T : list
	            String word tokens.
	        P : list
	            Integer paragraph start indices (in `T`).
	        S : list
	            Integer sentence start indices (in `T`).
	        I : list
	            Tuples containing the start and (inclusive) end indices of word
	            tokens in the original text string.

	        Raises
	        ------
	        ValueError
	            If a tokenizer returns a piece that does not occur verbatim in
	            its source string at or after the previous piece.
	        """
	        # Per-instance containers: class-level lists would be shared (and
	        # keep growing) across every AuthoritativeText ever created.
	        self.T, self.P, self.S, self.I = [], [], [], []

	        if not text.strip():
	            # Nothing to tokenize; skip the tokenizers entirely.
	            return self.T, self.P, self.S, self.I

	        if paragraph_tokenizer is None:
	            paragraph_tokenizer = nltk.tokenize.TextTilingTokenizer().tokenize
	        if sentence_tokenizer is None:
	            sentence_tokenizer = nltk.tokenize.sent_tokenize
	        if word_tokenizer is None:
	            word_tokenizer = nltk.tokenize.word_tokenize

	        # Each level searches forward from a cursor placed just past the
	        # previous match, so repeated paragraphs/sentences/words (or a word
	        # embedded in an earlier one, e.g. "an" in "San") are never matched
	        # against an earlier occurrence.
	        para_cursor = 0
	        for para in paragraph_tokenizer(text):
	            para_start = self._find_from(text, para, para_cursor, "paragraph")
	            para_cursor = para_start + len(para)
	            self.P.append(len(self.T))    # Paragraph start index in T.

	            sent_cursor = 0
	            for sent in sentence_tokenizer(para):
	                sent_offset = self._find_from(para, sent, sent_cursor,
	                                              "sentence")
	                sent_cursor = sent_offset + len(sent)
	                sent_start = para_start + sent_offset   # ...in original text.
	                self.S.append(len(self.T))    # Sentence start index in T.

	                word_cursor = 0
	                for word in word_tokenizer(sent):
	                    word_offset = self._find_from(sent, word, word_cursor,
	                                                  "word")
	                    word_cursor = word_offset + len(word)
	                    start = sent_start + word_offset
	                    self.T.append(word)
	                    # End index is the last character of the word.
	                    self.I.append((start, start + len(word) - 1))

	        return self.T, self.P, self.S, self.I

	    @staticmethod
	    def _find_from(haystack, needle, start, label):
	        """Return the index of `needle` in `haystack` at or after `start`."""
	        position = haystack.find(needle, start)
	        if position < 0:
	            # Happens when a tokenizer rewrites characters instead of
	            # returning verbatim substrings; offsets cannot be recovered.
	            raise ValueError(
	                "%s %r not found in its source text at or after offset %d; "
	                "tokenizers must return verbatim substrings."
	                % (label, needle, start))
	        return position