rolph-recto/contractions.py

## contractions.py
# contractions.py
# list from: http://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
contractions = [
    ("aren't", "are not"),
    ("can't", "cannot"),
    ("can't've", "cannot have"),
    ("'cause", "because"),
    ("could've", "could have"),
    ("couldn't", "could not"),
    ("couldn't've", "could not have"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("hadn't", "had not"),
    ("hadn't've", "had not have"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("he'd", "he had"),
    ("he'd've", "he would have"),
    ("he'll", "he will"),
    ("he'll've", "he will have"),
    ("he's", "he is"),
    ("how'd", "how did"),
    ("how'd'y", "how did you"),
    ("how'll", "how will"),
    ("how's", "how is"),
    ("i'd", "i had"),
    ("i'd've", "i would have"),
    ("i'll", "i will"),
    ("i'll've", "i will have"),
    ("i'm", "i am"),
    ("i've", "i have"),
    ("isn't", "is not"),
    ("it'd", "it had"),
    ("it'd've", "it would have"),
    ("it'll", "it will"),
    ("it'll've", "it will have"),
    ("it's", "it is"),
    ("let's", "let us"),
    ("ma'am", "madam"),
    ("might've", "might have"),
    ("mightn't", "might not"),
    ("mightn't've", "might not have"),
    ("must've", "must have"),
    ("mustn't", "must not"),
    ("mustn't've", "must not have"),
    ("needn't", "need not"),
    ("o'clock", "of the clock"),
    ("oughtn't", "ought not"),
    ("oughtn't've", "ought not have"),
    ("shan't", "shall not"),
    ("shan't've", "shall not have"),
    ("she'd", "she had"),
    ("she'd've", "she would have"),
    ("she'll", "she will"),
    ("she'll've", "she will have"),
    ("she's", "she is"),
    ("should've", "should have"),
    ("shouldn't", "should not"),
    ("shouldn't've", "should not have"),
    ("so's", "so is"),
    ("that's", "that is"),
    ("there'd", "there would"),
    ("there's", "there is"),
    ("they'd", "they would"),
    ("they'll", "they will"),
    ("they'll've", "they will have"),
    ("they're", "they are"),
    ("they've", "they have"),
    ("to've", "to have"),
    ("wasn't", "was not"),
    ("we'd", "we would"),
    ("we'll", "we will"),
    ("we'll've", "we will have"),
    ("we're", "we are"),
    ("we've", "we have"),
    ("weren't", "were not"),
    ("what'll", "what will"),
    ("what'll've", "what will have"),
    ("what're", "what are"),
    ("what's", "what is"),
    ("what've", "what have"),
    ("when's", "when is"),
    ("when've", "when have"),
    ("where'd", "where did"),
    ("where's", "where is"),
    ("where've", "where have"),
    ("who'll", "who will"),
    ("who'll've", "who will have"),
    ("who's", "who is"),
    ("who've", "who have"),
    ("why's", "why is"),
    ("will've", "will have"),
    ("won't", "will not"),
    ("won't've", "will not have"),
    ("would've", "would have"),
    ("wouldn't", "would not"),
    ("wouldn't've", "would not have"),
    ("y'all", "you all"),
    ("y'all'd've", "you all would have"),
    ("y'all're", "you all are"),
    ("y'all've", "you all have"),
    ("you'd", "you would"),
    ("you'd've", "you would have"),
    ("you'll", "you will"),
    ("you'll've", "you will have"),
    ("you're", "you are"),
    ("you've", "you have")
]

## stopwords.py
# stopwords.py
# because the NLTK stopwords corpus is not enough
# from http://www.ranks.nl/resources/stopwords.html

stopwords = [
    "a",
    "able",
    "about",
    "above",
    "abst",
    "accordance",
    "according",
    "accordingly",
    "across",
    "act",
    "actually",
    "added",
    "adj",
    "affected",
    "affecting",
    "affects",
    "after",
    "afterwards",
    "again",
    "against",
    "ah",
    "all",
    "almost",
    "alone",
    "along",
    "already",
    "also",
    "although",
    "always",
    "am",
    "among",
    "amongst",
    "an",
    "and",
    "announce",
    "another",
    "any",
    "anybody",
    "anyhow",
    "anymore",
    "anyone",
    "anything",
    "anyway",
    "anyways",
    "anywhere",
    "apparently",
    "approximately",
    "are",
    "aren",
    "arent",
    "arise",
    "around",
    "as",
    "aside",
    "ask",
    "asking",
    "at",
    "auth",
    "available",
    "away",
    "awfully",
    "b",
    "back",
    "be",
    "became",
    "because",
    "become",
    "becomes",
    "becoming",
    "been",
    "before",
    "beforehand",
    "begin",
    "beginning",
    "beginnings",
    "begins",
    "behind",
    "being",
    "believe",
    "below",
    "beside",
    "besides",
    "between",
    "beyond",
    "biol",
    "both",
    "brief",
    "briefly",
    "but",
    "by",
    "c",
    "ca",
    "came",
    "can",
    "cannot",
    "can't",
    "cause",
    "causes",
    "certain",
    "certainly",
    "co",
    "com",
    "come",
    "comes",
    "contain",
    "containing",
    "contains",
    "could",
    "couldnt",
    "d",
    "date",
    "did",
    "didn't",
    "different",
    "do",
    "does",
    "doesn't",
    "doing",
    "done",
    "don't",
    "down",
    "downwards",
    "due",
    "during",
    "e",
    "each",
    "ed",
    "edu",
    "effect",
    "eg",
    "eight",
    "eighty",
    "either",
    "else",
    "elsewhere",
    "end",
    "ending",
    "enough",
    "especially",
    "et",
    "et-al",
    "etc",
    "even",
    "ever",
    "every",
    "everybody",
    "everyone",
    "everything",
    "everywhere",
    "ex",
    "except",
    "f",
    "far",
    "few",
    "ff",
    "fifth",
    "first",
    "five",
    "fix",
    "followed",
    "following",
    "follows",
    "for",
    "former",
    "formerly",
    "forth",
    "found",
    "four",
    "from",
    "further",
    "furthermore",
    "g",
    "gave",
    "get",
    "gets",
    "getting",
    "give",
    "given",
    "gives",
    "giving",
    "go",
    "goes",
    "gone",
    "got",
    "gotten",
    "h",
    "had",
    "happens",
    "hardly",
    "has",
    "hasn't",
    "have",
    "haven't",
    "having",
    "he",
    "hed",
    "hence",
    "her",
    "here",
    "hereafter",
    "hereby",
    "herein",
    "heres",
    "hereupon",
    "hers",
    "herself",
    "hes",
    "hi",
    "hid",
    "him",
    "himself",
    "his",
    "hither",
    "home",
    "how",
    "howbeit",
    "however",
    "hundred",
    "i",
    "id",
    "ie",
    "if",
    "i'll",
    "im",
    "immediate",
    "immediately",
    "importance",
    "important",
    "in",
    "inc",
    "indeed",
    "index",
    "information",
    "instead",
    "into",
    "invention",
    "inward",
    "is",
    "isn't",
    "it",
    "itd",
    "it'll",
    "its",
    "itself",
    "i've",
    "j",
    "just",
    "k",
    "keep",
    "keeps",
    "kept",
    "kg",
    "km",
    "know",
    "known",
    "knows",
    "l",
    "largely",
    "last",
    "lately",
    "later",
    "latter",
    "latterly",
    "least",
    "less",
    "lest",
    "let",
    "lets",
    "like",
    "liked",
    "likely",
    "line",
    "little",
    "'ll",
    "look",
    "looking",
    "looks",
    "ltd",
    "m",
    "made",
    "mainly",
    "make",
    "makes",
    "many",
    "may",
    "maybe",
    "me",
    "mean",
    "means",
    "meantime",
    "meanwhile",
    "merely",
    "mg",
    "might",
    "million",
    "miss",
    "ml",
    "more",
    "moreover",
    "most",
    "mostly",
    "mr",
    "mrs",
    "much",
    "mug",
    "must",
    "my",
    "myself",
    "n",
    "na",
    "name",
    "namely",
    "nay",
    "nd",
    "near",
    "nearly",
    "necessarily",
    "necessary",
    "need",
    "needs",
    "neither",
    "never",
    "nevertheless",
    "new",
    "next",
    "nine",
    "ninety",
    "no",
    "nobody",
    "non",
    "none",
    "nonetheless",
    "noone",
    "nor",
    "normally",
    "nos",
    "not",
    "noted",
    "nothing",
    "now",
    "nowhere",
    "o",
    "obtain",
    "obtained",
    "obviously",
    "of",
    "off",
    "often",
    "oh",
    "ok",
    "okay",
    "old",
    "omitted",
    "on",
    "once",
    "one",
    "ones",
    "only",
    "onto",
    "or",
    "ord",
    "other",
    "others",
    "otherwise",
    "ought",
    "our",
    "ours",
    "ourselves",
    "out",
    "outside",
    "over",
    "overall",
    "owing",
    "own",
    "p",
    "page",
    "pages",
    "part",
    "particular",
    "particularly",
    "past",
    "per",
    "perhaps",
    "placed",
    "please",
    "plus",
    "poorly",
    "possible",
    "possibly",
    "potentially",
    "pp",
    "predominantly",
    "present",
    "previously",
    "primarily",
    "probably",
    "promptly",
    "proud",
    "provides",
    "put",
    "q",
    "que",
    "quickly",
    "quite",
    "qv",
    "r",
    "ran",
    "rather",
    "rd",
    "re",
    "readily",
    "really",
    "recent",
    "recently",
    "ref",
    "refs",
    "regarding",
    "regardless",
    "regards",
    "related",
    "relatively",
    "research",
    "respectively",
    "resulted",
    "resulting",
    "results",
    "right",
    "run",
    "s",
    "said",
    "same",
    "saw",
    "say",
    "saying",
    "says",
    "sec",
    "section",
    "see",
    "seeing",
    "seem",
    "seemed",
    "seeming",
    "seems",
    "seen",
    "self",
    "selves",
    "sent",
    "seven",
    "several",
    "shall",
    "she",
    "shed",
    "she'll",
    "shes",
    "should",
    "shouldn't",
    "show",
    "showed",
    "shown",
    "showns",
    "shows",
    "significant",
    "significantly",
    "similar",
    "similarly",
    "since",
    "six",
    "slightly",
    "so",
    "some",
    "somebody",
    "somehow",
    "someone",
    "somethan",
    "something",
    "sometime",
    "sometimes",
    "somewhat",
    "somewhere",
    "soon",
    "sorry",
    "specifically",
    "specified",
    "specify",
    "specifying",
    "still",
    "stop",
    "strongly",
    "sub",
    "substantially",
    "successfully",
    "such",
    "sufficiently",
    "suggest",
    "sup",
    "sure",
    "t",
    "take",
    "taken",
    "taking",
    "tell",
    "tends",
    "th",
    "than",
    "thank",
    "thanks",
    "thanx",
    "that",
    "that'll",
    "thats",
    "that've",
    "the",
    "their",
    "theirs",
    "them",
    "themselves",
    "then",
    "thence",
    "there",
    "thereafter",
    "thereby",
    "thered",
    "therefore",
    "therein",
    "there'll",
    "thereof",
    "therere",
    "theres",
    "thereto",
    "thereupon",
    "there've",
    "these",
    "they",
    "theyd",
    "they'll",
    "theyre",
    "they've",
    "think",
    "this",
    "those",
    "thou",
    "though",
    "thoughh",
    "thousand",
    "throug",
    "through",
    "throughout",
    "thru",
    "thus",
    "til",
    "tip",
    "to",
    "together",
    "too",
    "took",
    "toward",
    "towards",
    "tried",
    "tries",
    "truly",
    "try",
    "trying",
    "ts",
    "twice",
    "two",
    "u",
    "un",
    "under",
    "unfortunately",
    "unless",
    "unlike",
    "unlikely",
    "until",
    "unto",
    "up",
    "upon",
    "ups",
    "us",
    "use",
    "used",
    "useful",
    "usefully",
    "usefulness",
    "uses",
    "using",
    "usually",
    "v",
    "value",
    "various",
    "'ve",
    "very",
    "via",
    "viz",
    "vol",
    "vols",
    "vs",
    "w",
    "want",
    "wants",
    "was",
    "wasn't",
    "way",
    "we",
    "wed",
    "welcome",
    "we'll",
    "went",
    "were",
    "weren't",
    "we've",
    "what",
    "whatever",
    "what'll",
    "whats",
    "when",
    "whence",
    "whenever",
    "where",
    "whereafter",
    "whereas",
    "whereby",
    "wherein",
    "wheres",
    "whereupon",
    "wherever",
    "whether",
    "which",
    "while",
    "whim",
    "whither",
    "who",
    "whod",
    "whoever",
    "whole",
    "who'll",
    "whom",
    "whomever",
    "whos",
    "whose",
    "why",
    "widely",
    "willing",
    "wish",
    "with",
    "within",
    "without",
    "won't",
    "words",
    "world",
    "would",
    "wouldn't",
    "www",
    "x",
    "y",
    "yes",
    "yet",
    "you",
    "youd",
    "you'll",
    "your",
    "youre",
    "yours",
    "yourself",
    "yourselves",
    "you've",
    "z",
    "zero"
]

## tf_idf.py
# tf_idf.py
# term frequency - inverse document frequency
# word ranker used to build featureset for classifiers

# for convenience, the functions below assume the following:
# a "document" is a list of tokens
# a "corpus" is a list of documents (a list of lists of tokens)

#allows for integer division that returns rational numbers, not rounded integers
from __future__ import division
import math


class IdfDict(dict):
    """
    dictionary of idf values
    """

    def __init__(self, corpus_size, *args, **kwargs):
        super(IdfDict, self).__init__(*args, **kwargs)
        self.corpus_size = corpus_size


    def __getitem__(self, key):
        #if a term is not in the dictionary, calculate an idf value anyway
        if not key in self:
            return math.log(self.corpus_size)
        else:
            return super(IdfDict, self).__getitem__(key)


def tf_raw(term, document):
    return document.count(term)


def tf_bool(term, document):
    return 1 if term in document else 0


def tf_log(term, document):
    return math.log(document.count(term))


def tf(term, document, algorithm="RAW"):
    """
    Calculates the frequency of a term within a document
    Can specific different algorithms:
    -RAW calculates the raw frequency of a term
        -i.e., number of times it occurs in the document
    -BOOL calculates the "boolean frequency" of a term
        -i.e., if the term is in the document, tf is 1; if not, tf is 0
    -LOG calculates a logarithmically scaled term frequency
        -i.e., tf = log of the raw frequency
    -AUG calculates term frequency divided by the max frequency of the word
    in the document
        -i.e., this prevents bias towards longer documents

    NOTE: only RAW, BOOL, and LOG are implemented at the moment
    """

    if algorithm == "RAW":
        return tf_raw(term, document)
    elif algorithm == "BOOL":
        return tf_bool(term, document)
    elif algorithm == "LOG":
        return tf_log(term, document)
    else:
        raise ValueError("tf cannot use algorithm %s" % algorithm)


def idf(term, corpus):
    """
    Calculates the inverse document frequency for a term
    idf is the log of the ratio between the total number of documents
    in the corpus and the number of docs in the corpus with the given term
    """
    corpus_size = len(corpus)
    docs_with_term = 0

    for document in corpus:
        if term in document:
            docs_with_term += 1

    #add 1 to docs_with_term to account for terms that don't occur in the corpus
    #so that a division by zero doesn't occur
    return math.log( corpus_size / (docs_with_term+1) )


def tf_idf(term, document, corpus, algorithm="RAW"):
    """
    return tf-idf score
    """
    #if idf score was calculated previously, don't calculate it again;
    #this occurs when the tf-idf of a term is being calculated
    #for all documents in a corpus
    if type(corpus) is float:
        return tf(term, document, algorithm) * corpus
    else:
        return tf(term, document, algorithm) * idf(term, corpus)


def idf_corpus(corpus):
    """
    calculates idf score for all terms in a corpus
    """

    #build idf score for all terms in the corpus
    #first, build a vocab of the corpus
    vocab = set()
    for document in corpus:
        vocab |= set(document)

    #then, calculate the idf for each term in the vocab
    idf_set = IdfDict(len(corpus))
    for term in vocab:
        idf_set[term] = idf(term, corpus)

    return idf_set


def tf_idf_corpus(corpus, algorithm="RAW", idf_set=None):
    """
    calculates tf-idf score for all terms in every document of a corpus
    """
    #retrieve idf scores for all words in corpus
    if idf_set == None:
        idf_set = idf_corpus(corpus)

    #calculate tf-idf score for every document
    doc_set = []
    for document in corpus:
        doc_vocab = set(document)
        tf_idf_set = {}
        #calculate tf and then tf-idf score for every term
        for term in doc_vocab:
            tf_score = tf(term, document, algorithm)
            tf_idf_set[term] = tf_idf(term, document, idf_set[term], algorithm)

        doc_set.append(tf_idf_set)

    return doc_set

## tweet_featureset.py
# tweet_featureset.py
# build a featureset from a tweet corpus

# a tweet corpus is assumed to be a list of tweets,
# where a tweet is a dictionary with at least the following keys:
# -id
# -text
# -political (Boolean; answers whether a tweet is political or not)

from nltk.tokenize import WhitespaceTokenizer

from tweet_preprocess import cleanup_text, cleanup_tokens
from tf_idf import tf, idf_corpus, tf_idf_corpus


class TweetFeatureset(object):
    """
    TweetFeatureset class
    creates featuresets for tweets
    """

    tokenizer = WhitespaceTokenizer()


    def __init__(self, corpus):
        self.train(corpus)


    @classmethod
    def tokenize_tweet(cls, tweet):
        """
        tokenize a single tweet
        """
        #cleanup tweets
        tweet["text"] = cleanup_text(tweet["text"])
        #tokenize tweets using NLTK word tokenizer
        #use the whitespace tokenizer to preserve contractions,
        #which will be expanded during tokenization processing
        tweet["tokens"] = cls.tokenizer.tokenize(tweet["text"])
        #clean up tokenized tweets
        tweet["tokens"] = cleanup_tokens(tweet["tokens"])

        return tweet


    @classmethod
    def tokenize_corpus(cls, corpus):
        """
        return a tweet corpus in tokenized form
        """
        corpus = [TweetFeatureset.tokenize_tweet(tweet) for tweet in corpus]

        #remove empty tweets from corpus
        return [tweet for tweet in corpus if len(tweet["tokens"]) > 0]


    def train(self, corpus):
        """
        use corpus to calculate idf scores
        """
        corpus = TweetFeatureset.tokenize_corpus(corpus)
        token_corpus = [tweet["tokens"] for tweet in corpus]
        self.idf_set = idf_corpus(token_corpus)


    def build_tagged_featureset(self, tweets, algorithm="BOOL"):
        """
        build a featureset for a classifier using a tweet corpus
        and pair it with a tag (political/apolitical)
        use boolean frequency algorithm for tweets because tweets are so short
        there is no reason to count words in each tweet,
        just detect if a word is in the tweet
        """
        #pair features with respective tags
        tagged_features = [
            (features, tweets[i]["political"])
            for i, features in enumerate(self.build_featureset(tweets, algorithm))
        ]

        return tagged_features


    def build_featureset(self, tweets, algorithm="BOOL"):
        """
        build a featureset with no pairing to a tag
        """
        #tokenize corpus
        corpus = TweetFeatureset.tokenize_corpus(tweets)

        #extract features from tweet corpus
        token_corpus = [tweet["tokens"] for tweet in tweets]
        feature_corpus = tf_idf_corpus(token_corpus, algorithm, self.idf_set)

        return feature_corpus

## tweet_preprocess.py
# tweet_preprocess.py
# preprocess tweets before using them
# to build a featureset for a classifier

import re
import string

import nltk
from nltk.tag import pos_tag

from stopwords import stopwords
from contractions import contractions

#utility functions for cleaning up
#i.e., preprocessing tweet text before tokenization

def normalize_whitespace(func):
    """
    strip whitespace from the start and end of a tweet
    also, convert multiple sequential whitespace chars into one whitespace char
    """
    return lambda text: re.sub(r"[\s]{2,}", " ", func(text).strip())


def remove_punctuation(func):
    """
    remove punctuation from text
    """
    #TO DO
    #preserve apostrophes between letters for contractions
    #strip possessives
    return lambda text: re.sub(r"[`~!@#$%^&*()-=_+,./<>?;':\"\[\]{}\|]",
        " ", func(text))


def remove_possessives(func):
    """
    remove possessives from a noun
    """
    return lambda text: re.sub(r"[^\s]'s([\s]+|$)",
            lambda match: (match.group(0).strip()[:-2] + " "), func(text))


def convert_to_lowercase(func):
    """
    convert tweet to all all_lowercase
    """
    return lambda text: func(text).lower()


def convert_hashtags(func):
    """
    convert hashtags into individual words
    ex. #TeamJacob would be converted to team jacob
    """
    return lambda text: \
        re.sub(r"#([^\s#]+)",
        lambda match: \
            " "
            + re.sub(r"([a-z])([A-Z])", "\g<1> \g<2>", match.group(1))
            + " ",
        func(text))


def remove_retweets(func):
    """
    remove retweet tag ("RT") from tweet
    """
    return lambda text: re.sub(r"([\s]+|^)RT([\s]+|$)", " ", func(text))


def remove_usernames(func):
    """
    remove usernames from tweet
    """
    return lambda text: re.sub(r"([\s]+|^)@[^\s]+", " ", func(text))


def remove_email(func):
    """
    remove emails from tweet
    """
    return lambda text: re.sub(r"[\s]*[^@\s]+@[^@\s]+\.[^@\s]", " ", func(text))


def remove_links(func):
    """
    remove hyperlinks from tweet
    """
    return lambda text: re.sub(r"[\s]*(https|http|ftp)[^\s]+", " ", func(text))


def convert_to_ascii(func):
    """
    convert unicode to ascii by removing all non-ascii characters
    """
    return lambda text: func(text).encode("ascii", "ignore")


@normalize_whitespace
@remove_punctuation
@remove_possessives
@convert_to_lowercase
@convert_hashtags
@remove_retweets
@remove_usernames
@remove_email
@remove_links
@convert_to_ascii
def cleanup_text(text):
    return text


#preprocess tokens

def remove_irrelevant_pos_tokens(tokens):
    """
    remove tokens of a certain part of speech that is probably irrelevant
    to the political content of the tweet

    assume that only nouns, verbs, and adjectives are relevant;
    remove all other tokens
    """
    pos_tokens = pos_tag(tokens)

    return [token for token, tag in pos_tokens
        if token.startswith("N")
        or token.startswith("V")
        or token.startswith("J")
    ]


def remove_short_tokens(tokens):
    """
    remove tokens that are 2 or less characters
    we can assume that these tokens aren't important
    """
    return [token for token in tokens if len(token) > 2]


def remove_stopwords(tokens):
    """
    remove stopwords (i.e., "scaffold words" in English w/o much meaning)
    """
    global stopwords
    return [token for token in tokens if not token in stopwords]


def expand_contraction_tokens(tokens):
    """
    expand a list of tokens
    """
    global contractions
    result_tokens = []
    for token in tokens:
        is_contraction = False
        for contraction, expansion in contractions:
            if token == contraction or token == contraction.replace("'", ""):
                result_tokens += expansion.split()
                is_contraction = True
                break

        if not is_contraction:
            result_tokens.append(token)

    return result_tokens


def cleanup_tokens(tokens):
    """
    preprocess tokens before using them to build a featureset
    """
    tokens = expand_contraction_tokens(tokens)
    tokens = remove_stopwords(tokens)
    tokens = remove_short_tokens(tokens)

    return tokens
	# contractions.py
	# list from: http://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
	contractions = [
	("aren't", "are not"),
	("can't", "cannot"),
	("can't've", "cannot have"),
	("'cause", "because"),
	("could've", "could have"),
	("couldn't", "could not"),
	("couldn't've", "could not have"),
	("didn't", "did not"),
	("doesn't", "does not"),
	("don't", "do not"),
	("hadn't", "had not"),
	("hadn't've", "had not have"),
	("hasn't", "has not"),
	("haven't", "have not"),
	("he'd", "he had"),
	("he'd've", "he would have"),
	("he'll", "he will"),
	("he'll've", "he will have"),
	("he's", "he is"),
	("how'd", "how did"),
	("how'd'y", "how did you"),
	("how'll", "how will"),
	("how's", "how is"),
	("i'd", "i had"),
	("i'd've", "i would have"),
	("i'll", "i will"),
	("i'll've", "i will have"),
	("i'm", "i am"),
	("i've", "i have"),
	("isn't", "is not"),
	("it'd", "it had"),
	("it'd've", "it would have"),
	("it'll", "it will"),
	("it'll've", "it will have"),
	("it's", "it is"),
	("let's", "let us"),
	("ma'am", "madam"),
	("might've", "might have"),
	("mightn't", "might not"),
	("mightn't've", "might not have"),
	("must've", "must have"),
	("mustn't", "must not"),
	("mustn't've", "must not have"),
	("needn't", "need not"),
	("o'clock", "of the clock"),
	("oughtn't", "ought not"),
	("oughtn't've", "ought not have"),
	("shan't", "shall not"),
	("shan't've", "shall not have"),
	("she'd", "she had"),
	("she'd've", "she would have"),
	("she'll", "she will"),
	("she'll've", "she will have"),
	("she's", "she is"),
	("should've", "should have"),
	("shouldn't", "should not"),
	("shouldn't've", "should not have"),
	("so's", "so is"),
	("that's", "that is"),
	("there'd", "there would"),
	("there's", "there is"),
	("they'd", "they would"),
	("they'll", "they will"),
	("they'll've", "they will have"),
	("they're", "they are"),
	("they've", "they have"),
	("to've", "to have"),
	("wasn't", "was not"),
	("we'd", "we would"),
	("we'll", "we will"),
	("we'll've", "we will have"),
	("we're", "we are"),
	("we've", "we have"),
	("weren't", "were not"),
	("what'll", "what will"),
	("what'll've", "what will have"),
	("what're", "what are"),
	("what's", "what is"),
	("what've", "what have"),
	("when's", "when is"),
	("when've", "when have"),
	("where'd", "where did"),
	("where's", "where is"),
	("where've", "where have"),
	("who'll", "who will"),
	("who'll've", "who will have"),
	("who's", "who is"),
	("who've", "who have"),
	("why's", "why is"),
	("will've", "will have"),
	("won't", "will not"),
	("won't've", "will not have"),
	("would've", "would have"),
	("wouldn't", "would not"),
	("wouldn't've", "would not have"),
	("y'all", "you all"),
	("y'all'd've", "you all would have"),
	("y'all're", "you all are"),
	("y'all've", "you all have"),
	("you'd", "you would"),
	("you'd've", "you would have"),
	("you'll", "you will"),
	("you'll've", "you will have"),
	("you're", "you are"),
	("you've", "you have")
	]
	# stopwords.py
	# because the NLTK stopwords corpus is not enough
	# from http://www.ranks.nl/resources/stopwords.html

	stopwords = [
	"a",
	"able",
	"about",
	"above",
	"abst",
	"accordance",
	"according",
	"accordingly",
	"across",
	"act",
	"actually",
	"added",
	"adj",
	"affected",
	"affecting",
	"affects",
	"after",
	"afterwards",
	"again",
	"against",
	"ah",
	"all",
	"almost",
	"alone",
	"along",
	"already",
	"also",
	"although",
	"always",
	"am",
	"among",
	"amongst",
	"an",
	"and",
	"announce",
	"another",
	"any",
	"anybody",
	"anyhow",
	"anymore",
	"anyone",
	"anything",
	"anyway",
	"anyways",
	"anywhere",
	"apparently",
	"approximately",
	"are",
	"aren",
	"arent",
	"arise",
	"around",
	"as",
	"aside",
	"ask",
	"asking",
	"at",
	"auth",
	"available",
	"away",
	"awfully",
	"b",
	"back",
	"be",
	"became",
	"because",
	"become",
	"becomes",
	"becoming",
	"been",
	"before",
	"beforehand",
	"begin",
	"beginning",
	"beginnings",
	"begins",
	"behind",
	"being",
	"believe",
	"below",
	"beside",
	"besides",
	"between",
	"beyond",
	"biol",
	"both",
	"brief",
	"briefly",
	"but",
	"by",
	"c",
	"ca",
	"came",
	"can",
	"cannot",
	"can't",
	"cause",
	"causes",
	"certain",
	"certainly",
	"co",
	"com",
	"come",
	"comes",
	"contain",
	"containing",
	"contains",
	"could",
	"couldnt",
	"d",
	"date",
	"did",
	"didn't",
	"different",
	"do",
	"does",
	"doesn't",
	"doing",
	"done",
	"don't",
	"down",
	"downwards",
	"due",
	"during",
	"e",
	"each",
	"ed",
	"edu",
	"effect",
	"eg",
	"eight",
	"eighty",
	"either",
	"else",
	"elsewhere",
	"end",
	"ending",
	"enough",
	"especially",
	"et",
	"et-al",
	"etc",
	"even",
	"ever",
	"every",
	"everybody",
	"everyone",
	"everything",
	"everywhere",
	"ex",
	"except",
	"f",
	"far",
	"few",
	"ff",
	"fifth",
	"first",
	"five",
	"fix",
	"followed",
	"following",
	"follows",
	"for",
	"former",
	"formerly",
	"forth",
	"found",
	"four",
	"from",
	"further",
	"furthermore",
	"g",
	"gave",
	"get",
	"gets",
	"getting",
	"give",
	"given",
	"gives",
	"giving",
	"go",
	"goes",
	"gone",
	"got",
	"gotten",
	"h",
	"had",
	"happens",
	"hardly",
	"has",
	"hasn't",
	"have",
	"haven't",
	"having",
	"he",
	"hed",
	"hence",
	"her",
	"here",
	"hereafter",
	"hereby",
	"herein",
	"heres",
	"hereupon",
	"hers",
	"herself",
	"hes",
	"hi",
	"hid",
	"him",
	"himself",
	"his",
	"hither",
	"home",
	"how",
	"howbeit",
	"however",
	"hundred",
	"i",
	"id",
	"ie",
	"if",
	"i'll",
	"im",
	"immediate",
	"immediately",
	"importance",
	"important",
	"in",
	"inc",
	"indeed",
	"index",
	"information",
	"instead",
	"into",
	"invention",
	"inward",
	"is",
	"isn't",
	"it",
	"itd",
	"it'll",
	"its",
	"itself",
	"i've",
	"j",
	"just",
	"k",
	"keep",
	"keeps",
	"kept",
	"kg",
	"km",
	"know",
	"known",
	"knows",
	"l",
	"largely",
	"last",
	"lately",
	"later",
	"latter",
	"latterly",
	"least",
	"less",
	"lest",
	"let",
	"lets",
	"like",
	"liked",
	"likely",
	"line",
	"little",
	"'ll",
	"look",
	"looking",
	"looks",
	"ltd",
	"m",
	"made",
	"mainly",
	"make",
	"makes",
	"many",
	"may",
	"maybe",
	"me",
	"mean",
	"means",
	"meantime",
	"meanwhile",
	"merely",
	"mg",
	"might",
	"million",
	"miss",
	"ml",
	"more",
	"moreover",
	"most",
	"mostly",
	"mr",
	"mrs",
	"much",
	"mug",
	"must",
	"my",
	"myself",
	"n",
	"na",
	"name",
	"namely",
	"nay",
	"nd",
	"near",
	"nearly",
	"necessarily",
	"necessary",
	"need",
	"needs",
	"neither",
	"never",
	"nevertheless",
	"new",
	"next",
	"nine",
	"ninety",
	"no",
	"nobody",
	"non",
	"none",
	"nonetheless",
	"noone",
	"nor",
	"normally",
	"nos",
	"not",
	"noted",
	"nothing",
	"now",
	"nowhere",
	"o",
	"obtain",
	"obtained",
	"obviously",
	"of",
	"off",
	"often",
	"oh",
	"ok",
	"okay",
	"old",
	"omitted",
	"on",
	"once",
	"one",
	"ones",
	"only",
	"onto",
	"or",
	"ord",
	"other",
	"others",
	"otherwise",
	"ought",
	"our",
	"ours",
	"ourselves",
	"out",
	"outside",
	"over",
	"overall",
	"owing",
	"own",
	"p",
	"page",
	"pages",
	"part",
	"particular",
	"particularly",
	"past",
	"per",
	"perhaps",
	"placed",
	"please",
	"plus",
	"poorly",
	"possible",
	"possibly",
	"potentially",
	"pp",
	"predominantly",
	"present",
	"previously",
	"primarily",
	"probably",
	"promptly",
	"proud",
	"provides",
	"put",
	"q",
	"que",
	"quickly",
	"quite",
	"qv",
	"r",
	"ran",
	"rather",
	"rd",
	"re",
	"readily",
	"really",
	"recent",
	"recently",
	"ref",
	"refs",
	"regarding",
	"regardless",
	"regards",
	"related",
	"relatively",
	"research",
	"respectively",
	"resulted",
	"resulting",
	"results",
	"right",
	"run",
	"s",
	"said",
	"same",
	"saw",
	"say",
	"saying",
	"says",
	"sec",
	"section",
	"see",
	"seeing",
	"seem",
	"seemed",
	"seeming",
	"seems",
	"seen",
	"self",
	"selves",
	"sent",
	"seven",
	"several",
	"shall",
	"she",
	"shed",
	"she'll",
	"shes",
	"should",
	"shouldn't",
	"show",
	"showed",
	"shown",
	"showns",
	"shows",
	"significant",
	"significantly",
	"similar",
	"similarly",
	"since",
	"six",
	"slightly",
	"so",
	"some",
	"somebody",
	"somehow",
	"someone",
	"somethan",
	"something",
	"sometime",
	"sometimes",
	"somewhat",
	"somewhere",
	"soon",
	"sorry",
	"specifically",
	"specified",
	"specify",
	"specifying",
	"still",
	"stop",
	"strongly",
	"sub",
	"substantially",
	"successfully",
	"such",
	"sufficiently",
	"suggest",
	"sup",
	"sure",
	"t",
	"take",
	"taken",
	"taking",
	"tell",
	"tends",
	"th",
	"than",
	"thank",
	"thanks",
	"thanx",
	"that",
	"that'll",
	"thats",
	"that've",
	"the",
	"their",
	"theirs",
	"them",
	"themselves",
	"then",
	"thence",
	"there",
	"thereafter",
	"thereby",
	"thered",
	"therefore",
	"therein",
	"there'll",
	"thereof",
	"therere",
	"theres",
	"thereto",
	"thereupon",
	"there've",
	"these",
	"they",
	"theyd",
	"they'll",
	"theyre",
	"they've",
	"think",
	"this",
	"those",
	"thou",
	"though",
	"thoughh",
	"thousand",
	"throug",
	"through",
	"throughout",
	"thru",
	"thus",
	"til",
	"tip",
	"to",
	"together",
	"too",
	"took",
	"toward",
	"towards",
	"tried",
	"tries",
	"truly",
	"try",
	"trying",
	"ts",
	"twice",
	"two",
	"u",
	"un",
	"under",
	"unfortunately",
	"unless",
	"unlike",
	"unlikely",
	"until",
	"unto",
	"up",
	"upon",
	"ups",
	"us",
	"use",
	"used",
	"useful",
	"usefully",
	"usefulness",
	"uses",
	"using",
	"usually",
	"v",
	"value",
	"various",
	"'ve",
	"very",
	"via",
	"viz",
	"vol",
	"vols",
	"vs",
	"w",
	"want",
	"wants",
	"was",
	"wasn't",
	"way",
	"we",
	"wed",
	"welcome",
	"we'll",
	"went",
	"were",
	"weren't",
	"we've",
	"what",
	"whatever",
	"what'll",
	"whats",
	"when",
	"whence",
	"whenever",
	"where",
	"whereafter",
	"whereas",
	"whereby",
	"wherein",
	"wheres",
	"whereupon",
	"wherever",
	"whether",
	"which",
	"while",
	"whim",
	"whither",
	"who",
	"whod",
	"whoever",
	"whole",
	"who'll",
	"whom",
	"whomever",
	"whos",
	"whose",
	"why",
	"widely",
	"willing",
	"wish",
	"with",
	"within",
	"without",
	"won't",
	"words",
	"world",
	"would",
	"wouldn't",
	"www",
	"x",
	"y",
	"yes",
	"yet",
	"you",
	"youd",
	"you'll",
	"your",
	"youre",
	"yours",
	"yourself",
	"yourselves",
	"you've",
	"z",
	"zero"
	]
	# tf_idf.py
	# term frequency - inverse document frequency
	# word ranker used to build featureset for classifiers

	# for convenience, the functions below assume the following:
	# a "document" is a list of tokens
	# a "corpus" is a list of documents (a list of lists of tokens)

	#allows for integer division that returns rational numbers, not rounded integers
	from __future__ import division
	import math


	class IdfDict(dict):
	"""
	dictionary of idf values
	"""

	def __init__(self, corpus_size, args, *kwargs):
	super(IdfDict, self).__init__(args, *kwargs)
	self.corpus_size = corpus_size


	def __getitem__(self, key):
	#if a term is not in the dictionary, calculate an idf value anyway
	if not key in self:
	return math.log(self.corpus_size)
	else:
	return super(IdfDict, self).__getitem__(key)


	def tf_raw(term, document):
	return document.count(term)


	def tf_bool(term, document):
	return 1 if term in document else 0


	def tf_log(term, document):
	return math.log(document.count(term))


	def tf(term, document, algorithm="RAW"):
	"""
	Calculates the frequency of a term within a document
	Can specific different algorithms:
	-RAW calculates the raw frequency of a term
	-i.e., number of times it occurs in the document
	-BOOL calculates the "boolean frequency" of a term
	-i.e., if the term is in the document, tf is 1; if not, tf is 0
	-LOG calculates a logarithmically scaled term frequency
	-i.e., tf = log of the raw frequency
	-AUG calculates term frequency divided by the max frequency of the word
	in the document
	-i.e., this prevents bias towards longer documents

	NOTE: only RAW, BOOL, and LOG are implemented at the moment
	"""

	if algorithm == "RAW":
	return tf_raw(term, document)
	elif algorithm == "BOOL":
	return tf_bool(term, document)
	elif algorithm == "LOG":
	return tf_log(term, document)
	else:
	raise ValueError("tf cannot use algorithm %s" % algorithm)


	def idf(term, corpus):
	"""
	Calculates the inverse document frequency for a term
	idf is the log of the ratio between the total number of documents
	in the corpus and the number of docs in the corpus with the given term
	"""
	corpus_size = len(corpus)
	docs_with_term = 0

	for document in corpus:
	if term in document:
	docs_with_term += 1

	#add 1 to docs_with_term to account for terms that don't occur in the corpus
	#so that a division by zero doesn't occur
	return math.log( corpus_size / (docs_with_term+1) )


	def tf_idf(term, document, corpus, algorithm="RAW"):
	"""
	return tf-idf score
	"""
	#if idf score was calculated previously, don't calculate it again;
	#this occurs when the tf-idf of a term is being calculated
	#for all documents in a corpus
	if type(corpus) is float:
	return tf(term, document, algorithm) * corpus
	else:
	return tf(term, document, algorithm) * idf(term, corpus)


	def idf_corpus(corpus):
	"""
	calculates idf score for all terms in a corpus
	"""

	#build idf score for all terms in the corpus
	#first, build a vocab of the corpus
	vocab = set()
	for document in corpus:
	vocab \|= set(document)

	#then, calculate the idf for each term in the vocab
	idf_set = IdfDict(len(corpus))
	for term in vocab:
	idf_set[term] = idf(term, corpus)

	return idf_set


	def tf_idf_corpus(corpus, algorithm="RAW", idf_set=None):
	"""
	calculates tf-idf score for all terms in every document of a corpus
	"""
	#retrieve idf scores for all words in corpus
	if idf_set == None:
	idf_set = idf_corpus(corpus)

	#calculate tf-idf score for every document
	doc_set = []
	for document in corpus:
	doc_vocab = set(document)
	tf_idf_set = {}
	#calculate tf and then tf-idf score for every term
	for term in doc_vocab:
	tf_score = tf(term, document, algorithm)
	tf_idf_set[term] = tf_idf(term, document, idf_set[term], algorithm)

	doc_set.append(tf_idf_set)

	return doc_set
	# tweet_featureset.py
	# build a featureset from a tweet corpus

	# a tweet corpus is assumed to be a list of tweets,
	# where a tweet is a dictionary with at least the following keys:
	# -id
	# -text
	# -political (Boolean; answers whether a tweet is political or not)

	from nltk.tokenize import WhitespaceTokenizer

	from tweet_preprocess import cleanup_text, cleanup_tokens
	from tf_idf import tf, idf_corpus, tf_idf_corpus


	class TweetFeatureset(object):
	"""
	TweetFeatureset class
	creates featuresets for tweets
	"""

	tokenizer = WhitespaceTokenizer()


	def __init__(self, corpus):
	self.train(corpus)


	@classmethod
	def tokenize_tweet(cls, tweet):
	"""
	tokenize a single tweet
	"""
	#cleanup tweets
	tweet["text"] = cleanup_text(tweet["text"])
	#tokenize tweets using NLTK word tokenizer
	#use the whitespace tokenizer to preserve contractions,
	#which will be expanded during tokenization processing
	tweet["tokens"] = cls.tokenizer.tokenize(tweet["text"])
	#clean up tokenized tweets
	tweet["tokens"] = cleanup_tokens(tweet["tokens"])

	return tweet


	@classmethod
	def tokenize_corpus(cls, corpus):
	"""
	return a tweet corpus in tokenized form
	"""
	corpus = [TweetFeatureset.tokenize_tweet(tweet) for tweet in corpus]

	#remove empty tweets from corpus
	return [tweet for tweet in corpus if len(tweet["tokens"]) > 0]


	def train(self, corpus):
	"""
	use corpus to calculate idf scores
	"""
	corpus = TweetFeatureset.tokenize_corpus(corpus)
	token_corpus = [tweet["tokens"] for tweet in corpus]
	self.idf_set = idf_corpus(token_corpus)


	def build_tagged_featureset(self, tweets, algorithm="BOOL"):
	"""
	build a featureset for a classifier using a tweet corpus
	and pair it with a tag (political/apolitical)
	use boolean frequency algorithm for tweets because tweets are so short
	there is no reason to count words in each tweet,
	just detect if a word is in the tweet
	"""
	#pair features with respective tags
	tagged_features = [
	(features, tweets[i]["political"])
	for i, features in enumerate(self.build_featureset(tweets, algorithm))
	]

	return tagged_features


	def build_featureset(self, tweets, algorithm="BOOL"):
	"""
	build a featureset with no pairing to a tag
	"""
	#tokenize corpus
	corpus = TweetFeatureset.tokenize_corpus(tweets)

	#extract features from tweet corpus
	token_corpus = [tweet["tokens"] for tweet in tweets]
	feature_corpus = tf_idf_corpus(token_corpus, algorithm, self.idf_set)

	return feature_corpus
	# tweet_preprocess.py
	# preprocess tweets before using them
	# to build a featureset for a classifier

	import re
	import string

	import nltk
	from nltk.tag import pos_tag

	from stopwords import stopwords
	from contractions import contractions

	#utility functions for cleaning up
	#i.e., preprocessing tweet text before tokenization

	def normalize_whitespace(func):
	"""
	strip whitespace from the start and end of a tweet
	also, convert multiple sequential whitespace chars into one whitespace char
	"""
	return lambda text: re.sub(r"[\s]{2,}", " ", func(text).strip())


	def remove_punctuation(func):
	"""
	remove punctuation from text
	"""
	#TO DO
	#preserve apostrophes between letters for contractions
	#strip possessives
	return lambda text: re.sub(r"[`~!@#$%^&*()-=_+,./<>?;':\"\[\]{}\\|]",
	" ", func(text))


	def remove_possessives(func):
	"""
	remove possessives from a noun
	"""
	return lambda text: re.sub(r"[^\s]'s([\s]+\|$)",
	lambda match: (match.group(0).strip()[:-2] + " "), func(text))


	def convert_to_lowercase(func):
	"""
	convert tweet to all all_lowercase
	"""
	return lambda text: func(text).lower()


	def convert_hashtags(func):
	"""
	convert hashtags into individual words
	ex. #TeamJacob would be converted to team jacob
	"""
	return lambda text: \
	re.sub(r"#([^\s#]+)",
	lambda match: \
	" "
	+ re.sub(r"([a-z])([A-Z])", "\g<1> \g<2>", match.group(1))
	+ " ",
	func(text))


	def remove_retweets(func):
	"""
	remove retweet tag ("RT") from tweet
	"""
	return lambda text: re.sub(r"([\s]+\|^)RT([\s]+\|$)", " ", func(text))


	def remove_usernames(func):
	"""
	remove usernames from tweet
	"""
	return lambda text: re.sub(r"([\s]+\|^)@[^\s]+", " ", func(text))


	def remove_email(func):
	"""
	remove emails from tweet
	"""
	return lambda text: re.sub(r"[\s]*[^@\s]+@[^@\s]+\.[^@\s]", " ", func(text))


	def remove_links(func):
	"""
	remove hyperlinks from tweet
	"""
	return lambda text: re.sub(r"[\s]*(https\|http\|ftp)[^\s]+", " ", func(text))


	def convert_to_ascii(func):
	"""
	convert unicode to ascii by removing all non-ascii characters
	"""
	return lambda text: func(text).encode("ascii", "ignore")


	@normalize_whitespace
	@remove_punctuation
	@remove_possessives
	@convert_to_lowercase
	@convert_hashtags
	@remove_retweets
	@remove_usernames
	@remove_email
	@remove_links
	@convert_to_ascii
	def cleanup_text(text):
	return text


	#preprocess tokens

	def remove_irrelevant_pos_tokens(tokens):
	"""
	remove tokens of a certain part of speech that is probably irrelevant
	to the political content of the tweet

	assume that only nouns, verbs, and adjectives are relevant;
	remove all other tokens
	"""
	pos_tokens = pos_tag(tokens)

	return [token for token, tag in pos_tokens
	if token.startswith("N")
	or token.startswith("V")
	or token.startswith("J")
	]


	def remove_short_tokens(tokens):
	"""
	remove tokens that are 2 or less characters
	we can assume that these tokens aren't important
	"""
	return [token for token in tokens if len(token) > 2]


	def remove_stopwords(tokens):
	"""
	remove stopwords (i.e., "scaffold words" in English w/o much meaning)
	"""
	global stopwords
	return [token for token in tokens if not token in stopwords]


	def expand_contraction_tokens(tokens):
	"""
	expand a list of tokens
	"""
	global contractions
	result_tokens = []
	for token in tokens:
	is_contraction = False
	for contraction, expansion in contractions:
	if token == contraction or token == contraction.replace("'", ""):
	result_tokens += expansion.split()
	is_contraction = True
	break

	if not is_contraction:
	result_tokens.append(token)

	return result_tokens


	def cleanup_tokens(tokens):
	"""
	preprocess tokens before using them to build a featureset
	"""
	tokens = expand_contraction_tokens(tokens)
	tokens = remove_stopwords(tokens)
	tokens = remove_short_tokens(tokens)

	return tokens