Featureset builder for tweets, used for classification and other machine learning tasks
# contractions.py
# list from: http://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
contractions = [
("aren't", "are not"),
("can't", "cannot"),
("can't've", "cannot have"),
("'cause", "because"),
("could've", "could have"),
("couldn't", "could not"),
("couldn't've", "could not have"),
("didn't", "did not"),
("doesn't", "does not"),
("don't", "do not"),
("hadn't", "had not"),
("hadn't've", "had not have"),
("hasn't", "has not"),
("haven't", "have not"),
("he'd", "he had"),
("he'd've", "he would have"),
("he'll", "he will"),
("he'll've", "he will have"),
("he's", "he is"),
("how'd", "how did"),
("how'd'y", "how did you"),
("how'll", "how will"),
("how's", "how is"),
("i'd", "i had"),
("i'd've", "i would have"),
("i'll", "i will"),
("i'll've", "i will have"),
("i'm", "i am"),
("i've", "i have"),
("isn't", "is not"),
("it'd", "it had"),
("it'd've", "it would have"),
("it'll", "it will"),
("it'll've", "it will have"),
("it's", "it is"),
("let's", "let us"),
("ma'am", "madam"),
("might've", "might have"),
("mightn't", "might not"),
("mightn't've", "might not have"),
("must've", "must have"),
("mustn't", "must not"),
("mustn't've", "must not have"),
("needn't", "need not"),
("o'clock", "of the clock"),
("oughtn't", "ought not"),
("oughtn't've", "ought not have"),
("shan't", "shall not"),
("shan't've", "shall not have"),
("she'd", "she had"),
("she'd've", "she would have"),
("she'll", "she will"),
("she'll've", "she will have"),
("she's", "she is"),
("should've", "should have"),
("shouldn't", "should not"),
("shouldn't've", "should not have"),
("so's", "so is"),
("that's", "that is"),
("there'd", "there would"),
("there's", "there is"),
("they'd", "they would"),
("they'll", "they will"),
("they'll've", "they will have"),
("they're", "they are"),
("they've", "they have"),
("to've", "to have"),
("wasn't", "was not"),
("we'd", "we would"),
("we'll", "we will"),
("we'll've", "we will have"),
("we're", "we are"),
("we've", "we have"),
("weren't", "were not"),
("what'll", "what will"),
("what'll've", "what will have"),
("what're", "what are"),
("what's", "what is"),
("what've", "what have"),
("when's", "when is"),
("when've", "when have"),
("where'd", "where did"),
("where's", "where is"),
("where've", "where have"),
("who'll", "who will"),
("who'll've", "who will have"),
("who's", "who is"),
("who've", "who have"),
("why's", "why is"),
("will've", "will have"),
("won't", "will not"),
("won't've", "will not have"),
("would've", "would have"),
("wouldn't", "would not"),
("wouldn't've", "would not have"),
("y'all", "you all"),
("y'all'd've", "you all would have"),
("y'all're", "you all are"),
("y'all've", "you all have"),
("you'd", "you would"),
("you'd've", "you would have"),
("you'll", "you will"),
("you'll've", "you will have"),
("you're", "you are"),
("you've", "you have")
]
# stopwords.py
# because the NLTK stopwords corpus is not enough
# from http://www.ranks.nl/resources/stopwords.html
stopwords = [
"a",
"able",
"about",
"above",
"abst",
"accordance",
"according",
"accordingly",
"across",
"act",
"actually",
"added",
"adj",
"affected",
"affecting",
"affects",
"after",
"afterwards",
"again",
"against",
"ah",
"all",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"am",
"among",
"amongst",
"an",
"and",
"announce",
"another",
"any",
"anybody",
"anyhow",
"anymore",
"anyone",
"anything",
"anyway",
"anyways",
"anywhere",
"apparently",
"approximately",
"are",
"aren",
"arent",
"arise",
"around",
"as",
"aside",
"ask",
"asking",
"at",
"auth",
"available",
"away",
"awfully",
"b",
"back",
"be",
"became",
"because",
"become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
"begin",
"beginning",
"beginnings",
"begins",
"behind",
"being",
"believe",
"below",
"beside",
"besides",
"between",
"beyond",
"biol",
"both",
"brief",
"briefly",
"but",
"by",
"c",
"ca",
"came",
"can",
"cannot",
"can't",
"cause",
"causes",
"certain",
"certainly",
"co",
"com",
"come",
"comes",
"contain",
"containing",
"contains",
"could",
"couldnt",
"d",
"date",
"did",
"didn't",
"different",
"do",
"does",
"doesn't",
"doing",
"done",
"don't",
"down",
"downwards",
"due",
"during",
"e",
"each",
"ed",
"edu",
"effect",
"eg",
"eight",
"eighty",
"either",
"else",
"elsewhere",
"end",
"ending",
"enough",
"especially",
"et",
"et-al",
"etc",
"even",
"ever",
"every",
"everybody",
"everyone",
"everything",
"everywhere",
"ex",
"except",
"f",
"far",
"few",
"ff",
"fifth",
"first",
"five",
"fix",
"followed",
"following",
"follows",
"for",
"former",
"formerly",
"forth",
"found",
"four",
"from",
"further",
"furthermore",
"g",
"gave",
"get",
"gets",
"getting",
"give",
"given",
"gives",
"giving",
"go",
"goes",
"gone",
"got",
"gotten",
"h",
"had",
"happens",
"hardly",
"has",
"hasn't",
"have",
"haven't",
"having",
"he",
"hed",
"hence",
"her",
"here",
"hereafter",
"hereby",
"herein",
"heres",
"hereupon",
"hers",
"herself",
"hes",
"hi",
"hid",
"him",
"himself",
"his",
"hither",
"home",
"how",
"howbeit",
"however",
"hundred",
"i",
"id",
"ie",
"if",
"i'll",
"im",
"immediate",
"immediately",
"importance",
"important",
"in",
"inc",
"indeed",
"index",
"information",
"instead",
"into",
"invention",
"inward",
"is",
"isn't",
"it",
"itd",
"it'll",
"its",
"itself",
"i've",
"j",
"just",
"k",
"keep",
"keeps",
"kept",
"kg",
"km",
"know",
"known",
"knows",
"l",
"largely",
"last",
"lately",
"later",
"latter",
"latterly",
"least",
"less",
"lest",
"let",
"lets",
"like",
"liked",
"likely",
"line",
"little",
"'ll",
"look",
"looking",
"looks",
"ltd",
"m",
"made",
"mainly",
"make",
"makes",
"many",
"may",
"maybe",
"me",
"mean",
"means",
"meantime",
"meanwhile",
"merely",
"mg",
"might",
"million",
"miss",
"ml",
"more",
"moreover",
"most",
"mostly",
"mr",
"mrs",
"much",
"mug",
"must",
"my",
"myself",
"n",
"na",
"name",
"namely",
"nay",
"nd",
"near",
"nearly",
"necessarily",
"necessary",
"need",
"needs",
"neither",
"never",
"nevertheless",
"new",
"next",
"nine",
"ninety",
"no",
"nobody",
"non",
"none",
"nonetheless",
"noone",
"nor",
"normally",
"nos",
"not",
"noted",
"nothing",
"now",
"nowhere",
"o",
"obtain",
"obtained",
"obviously",
"of",
"off",
"often",
"oh",
"ok",
"okay",
"old",
"omitted",
"on",
"once",
"one",
"ones",
"only",
"onto",
"or",
"ord",
"other",
"others",
"otherwise",
"ought",
"our",
"ours",
"ourselves",
"out",
"outside",
"over",
"overall",
"owing",
"own",
"p",
"page",
"pages",
"part",
"particular",
"particularly",
"past",
"per",
"perhaps",
"placed",
"please",
"plus",
"poorly",
"possible",
"possibly",
"potentially",
"pp",
"predominantly",
"present",
"previously",
"primarily",
"probably",
"promptly",
"proud",
"provides",
"put",
"q",
"que",
"quickly",
"quite",
"qv",
"r",
"ran",
"rather",
"rd",
"re",
"readily",
"really",
"recent",
"recently",
"ref",
"refs",
"regarding",
"regardless",
"regards",
"related",
"relatively",
"research",
"respectively",
"resulted",
"resulting",
"results",
"right",
"run",
"s",
"said",
"same",
"saw",
"say",
"saying",
"says",
"sec",
"section",
"see",
"seeing",
"seem",
"seemed",
"seeming",
"seems",
"seen",
"self",
"selves",
"sent",
"seven",
"several",
"shall",
"she",
"shed",
"she'll",
"shes",
"should",
"shouldn't",
"show",
"showed",
"shown",
"showns",
"shows",
"significant",
"significantly",
"similar",
"similarly",
"since",
"six",
"slightly",
"so",
"some",
"somebody",
"somehow",
"someone",
"somethan",
"something",
"sometime",
"sometimes",
"somewhat",
"somewhere",
"soon",
"sorry",
"specifically",
"specified",
"specify",
"specifying",
"still",
"stop",
"strongly",
"sub",
"substantially",
"successfully",
"such",
"sufficiently",
"suggest",
"sup",
"sure",
"t",
"take",
"taken",
"taking",
"tell",
"tends",
"th",
"than",
"thank",
"thanks",
"thanx",
"that",
"that'll",
"thats",
"that've",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"thence",
"there",
"thereafter",
"thereby",
"thered",
"therefore",
"therein",
"there'll",
"thereof",
"therere",
"theres",
"thereto",
"thereupon",
"there've",
"these",
"they",
"theyd",
"they'll",
"theyre",
"they've",
"think",
"this",
"those",
"thou",
"though",
"thoughh",
"thousand",
"throug",
"through",
"throughout",
"thru",
"thus",
"til",
"tip",
"to",
"together",
"too",
"took",
"toward",
"towards",
"tried",
"tries",
"truly",
"try",
"trying",
"ts",
"twice",
"two",
"u",
"un",
"under",
"unfortunately",
"unless",
"unlike",
"unlikely",
"until",
"unto",
"up",
"upon",
"ups",
"us",
"use",
"used",
"useful",
"usefully",
"usefulness",
"uses",
"using",
"usually",
"v",
"value",
"various",
"'ve",
"very",
"via",
"viz",
"vol",
"vols",
"vs",
"w",
"want",
"wants",
"was",
"wasn't",
"way",
"we",
"wed",
"welcome",
"we'll",
"went",
"were",
"weren't",
"we've",
"what",
"whatever",
"what'll",
"whats",
"when",
"whence",
"whenever",
"where",
"whereafter",
"whereas",
"whereby",
"wherein",
"wheres",
"whereupon",
"wherever",
"whether",
"which",
"while",
"whim",
"whither",
"who",
"whod",
"whoever",
"whole",
"who'll",
"whom",
"whomever",
"whos",
"whose",
"why",
"widely",
"willing",
"wish",
"with",
"within",
"without",
"won't",
"words",
"world",
"would",
"wouldn't",
"www",
"x",
"y",
"yes",
"yet",
"you",
"youd",
"you'll",
"your",
"youre",
"yours",
"yourself",
"yourselves",
"you've",
"z",
"zero"
]
# tf_idf.py
# term frequency - inverse document frequency
# word ranker used to build featureset for classifiers
# for convenience, the functions below assume the following:
# a "document" is a list of tokens
# a "corpus" is a list of documents (a list of lists of tokens)
#make / perform true (float) division instead of Python 2's truncating integer division
from __future__ import division
import math
class IdfDict(dict):
"""
dictionary of idf values
"""
def __init__(self, corpus_size, *args, **kwargs):
super(IdfDict, self).__init__(*args, **kwargs)
self.corpus_size = corpus_size
def __getitem__(self, key):
        #if a term is not in the dictionary, return the maximum idf,
        #as if the term occurred in none of the corpus documents
        if key not in self:
            return math.log(self.corpus_size)
else:
return super(IdfDict, self).__getitem__(key)
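# e.g. with a corpus of 100 documents, an unseen term gets the maximum idf:
#   IdfDict(100)["unseen-term"]  ->  math.log(100)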
def tf_raw(term, document):
return document.count(term)
def tf_bool(term, document):
return 1 if term in document else 0
def tf_log(term, document):
    #add 1 to the count so a zero count doesn't raise a math domain error
    return math.log(1 + document.count(term))
def tf(term, document, algorithm="RAW"):
"""
Calculates the frequency of a term within a document
    Can specify different algorithms:
    -RAW calculates the raw frequency of a term
        -i.e., number of times it occurs in the document
    -BOOL calculates the "boolean frequency" of a term
        -i.e., if the term is in the document, tf is 1; if not, tf is 0
    -LOG calculates a logarithmically scaled term frequency
        -i.e., tf = log(1 + raw frequency), so a zero count is safe
    -AUG calculates the raw frequency divided by the raw frequency
     of the most common term in the document
        -i.e., this prevents bias towards longer documents
NOTE: only RAW, BOOL, and LOG are implemented at the moment
"""
if algorithm == "RAW":
return tf_raw(term, document)
elif algorithm == "BOOL":
return tf_bool(term, document)
elif algorithm == "LOG":
return tf_log(term, document)
else:
raise ValueError("tf cannot use algorithm %s" % algorithm)
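# a quick illustration with a toy document (values are approximate):
#   tf("tax", ["tax", "cut", "tax"])          ->  2        (RAW)
#   tf("tax", ["tax", "cut", "tax"], "BOOL")  ->  1
#   tf("tax", ["tax", "cut", "tax"], "LOG")   ->  log(1 + 2) ~ 1.0986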
def idf(term, corpus):
"""
Calculates the inverse document frequency for a term
idf is the log of the ratio between the total number of documents
in the corpus and the number of docs in the corpus with the given term
"""
corpus_size = len(corpus)
docs_with_term = 0
for document in corpus:
if term in document:
docs_with_term += 1
#add 1 to docs_with_term to account for terms that don't occur in the corpus
#so that a division by zero doesn't occur
return math.log( corpus_size / (docs_with_term+1) )
def tf_idf(term, document, corpus, algorithm="RAW"):
"""
return tf-idf score
"""
#if idf score was calculated previously, don't calculate it again;
#this occurs when the tf-idf of a term is being calculated
#for all documents in a corpus
    if isinstance(corpus, float):
return tf(term, document, algorithm) * corpus
else:
return tf(term, document, algorithm) * idf(term, corpus)
def idf_corpus(corpus):
"""
calculates idf score for all terms in a corpus
"""
#build idf score for all terms in the corpus
#first, build a vocab of the corpus
vocab = set()
for document in corpus:
vocab |= set(document)
#then, calculate the idf for each term in the vocab
idf_set = IdfDict(len(corpus))
for term in vocab:
idf_set[term] = idf(term, corpus)
return idf_set
def tf_idf_corpus(corpus, algorithm="RAW", idf_set=None):
"""
calculates tf-idf score for all terms in every document of a corpus
"""
#retrieve idf scores for all words in corpus
    if idf_set is None:
idf_set = idf_corpus(corpus)
#calculate tf-idf score for every document
doc_set = []
for document in corpus:
doc_vocab = set(document)
tf_idf_set = {}
        #calculate the tf-idf score for every term;
        #idf_set[term] is a float, so tf_idf uses it directly
        #instead of recomputing idf over the whole corpus
        for term in doc_vocab:
            tf_idf_set[term] = tf_idf(term, document, idf_set[term], algorithm)
doc_set.append(tf_idf_set)
return doc_set
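# a minimal usage sketch of this module (the toy corpus below is illustrative):
if __name__ == "__main__":
    toy_corpus = [
        ["vote", "tax", "tax"],
        ["vote", "cat"],
        ["cat", "dog"],
    ]
    idfs = idf_corpus(toy_corpus)
    print(idfs["tax"])   # log(3/2): "tax" occurs in 1 of 3 docs (+1 smoothing)
    print(idfs["vote"])  # log(3/3) = 0.0: "vote" occurs in 2 of 3 docs (+1)
    #per-document {term: tf-idf} dictionaries
    for doc_scores in tf_idf_corpus(toy_corpus):
        print(doc_scores)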
# tweet_featureset.py
# build a featureset from a tweet corpus
# a tweet corpus is assumed to be a list of tweets,
# where a tweet is a dictionary with at least the following keys:
# -id
# -text
# -political (Boolean; answers whether a tweet is political or not)
from nltk.tokenize import WhitespaceTokenizer
from tweet_preprocess import cleanup_text, cleanup_tokens
from tf_idf import tf, idf_corpus, tf_idf_corpus
class TweetFeatureset(object):
"""
TweetFeatureset class
creates featuresets for tweets
"""
tokenizer = WhitespaceTokenizer()
def __init__(self, corpus):
self.train(corpus)
@classmethod
def tokenize_tweet(cls, tweet):
"""
tokenize a single tweet
"""
#cleanup tweets
tweet["text"] = cleanup_text(tweet["text"])
        #tokenize tweet with the NLTK whitespace tokenizer
        #to preserve contractions, which will be expanded
        #during token preprocessing
tweet["tokens"] = cls.tokenizer.tokenize(tweet["text"])
#clean up tokenized tweets
tweet["tokens"] = cleanup_tokens(tweet["tokens"])
return tweet
@classmethod
def tokenize_corpus(cls, corpus):
"""
return a tweet corpus in tokenized form
"""
corpus = [TweetFeatureset.tokenize_tweet(tweet) for tweet in corpus]
#remove empty tweets from corpus
return [tweet for tweet in corpus if len(tweet["tokens"]) > 0]
def train(self, corpus):
"""
use corpus to calculate idf scores
"""
corpus = TweetFeatureset.tokenize_corpus(corpus)
token_corpus = [tweet["tokens"] for tweet in corpus]
self.idf_set = idf_corpus(token_corpus)
def build_tagged_featureset(self, tweets, algorithm="BOOL"):
"""
build a featureset for a classifier using a tweet corpus
and pair it with a tag (political/apolitical)
        use the boolean frequency algorithm by default: tweets are so short
        that there is no reason to count occurrences of a word in each tweet,
        only to detect whether the word is present
"""
        #pair features with respective tags
        #note: build_featureset drops tweets that tokenize to nothing,
        #so this pairing assumes no tweet is empty after cleanup
tagged_features = [
(features, tweets[i]["political"])
for i, features in enumerate(self.build_featureset(tweets, algorithm))
]
return tagged_features
def build_featureset(self, tweets, algorithm="BOOL"):
"""
build a featureset with no pairing to a tag
"""
#tokenize corpus
corpus = TweetFeatureset.tokenize_corpus(tweets)
        #extract features from the tokenized corpus
        token_corpus = [tweet["tokens"] for tweet in corpus]
feature_corpus = tf_idf_corpus(token_corpus, algorithm, self.idf_set)
return feature_corpus
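# a minimal usage sketch; the tweets below are made up, and this assumes the
# companion modules (tf_idf, tweet_preprocess, stopwords, contractions)
# are importable
if __name__ == "__main__":
    tweets = [
        {"id": 1, "text": "Vote yes on the budget bill", "political": True},
        {"id": 2, "text": "The senate budget vote is today", "political": True},
        {"id": 3, "text": "My cat knocked over the coffee again", "political": False},
    ]
    tfs = TweetFeatureset(tweets)
    #each entry pairs a {term: tf-idf score} dict with its political tag,
    #ready to feed an NLTK-style classifier
    for features, tag in tfs.build_tagged_featureset(tweets):
        print((features, tag))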
# tweet_preprocess.py
# preprocess tweets before using them
# to build a featureset for a classifier
import re
from nltk.tag import pos_tag
from stopwords import stopwords
from contractions import contractions
#utility functions for cleaning up
#i.e., preprocessing tweet text before tokenization
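#each wrapper below takes a text-cleanup function and returns a new function
#that applies one more transformation, so the steps can be stacked as
#decorators (see cleanup_text at the bottom of this file)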
def normalize_whitespace(func):
"""
strip whitespace from the start and end of a tweet
also, convert multiple sequential whitespace chars into one whitespace char
"""
return lambda text: re.sub(r"[\s]{2,}", " ", func(text).strip())
def remove_punctuation(func):
"""
remove punctuation from text
"""
#TO DO
#preserve apostrophes between letters for contractions
#strip possessives
return lambda text: re.sub(r"[`~!@#$%^&*()-=_+,./<>?;':\"\[\]{}\|]",
" ", func(text))
def remove_possessives(func):
"""
remove possessives from a noun
"""
return lambda text: re.sub(r"[^\s]'s([\s]+|$)",
lambda match: (match.group(0).strip()[:-2] + " "), func(text))
def convert_to_lowercase(func):
"""
    convert tweet to all lowercase
"""
return lambda text: func(text).lower()
def convert_hashtags(func):
"""
convert hashtags into individual words
ex. #TeamJacob would be converted to team jacob
"""
return lambda text: \
re.sub(r"#([^\s#]+)",
lambda match: \
" "
+ re.sub(r"([a-z])([A-Z])", "\g<1> \g<2>", match.group(1))
+ " ",
func(text))
def remove_retweets(func):
"""
remove retweet tag ("RT") from tweet
"""
return lambda text: re.sub(r"([\s]+|^)RT([\s]+|$)", " ", func(text))
def remove_usernames(func):
"""
remove usernames from tweet
"""
return lambda text: re.sub(r"([\s]+|^)@[^\s]+", " ", func(text))
def remove_email(func):
"""
remove emails from tweet
"""
return lambda text: re.sub(r"[\s]*[^@\s]+@[^@\s]+\.[^@\s]", " ", func(text))
def remove_links(func):
"""
remove hyperlinks from tweet
"""
return lambda text: re.sub(r"[\s]*(https|http|ftp)[^\s]+", " ", func(text))
def convert_to_ascii(func):
"""
convert unicode to ascii by removing all non-ascii characters
"""
    #decode back to str so the result stays text on both Python 2 and 3
    return lambda text: func(text).encode("ascii", "ignore").decode("ascii")
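#compose the full cleanup pipeline; decorators apply bottom-up,
#so convert_to_ascii runs first and normalize_whitespace runs last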
@normalize_whitespace
@remove_punctuation
@remove_possessives
@convert_to_lowercase
@convert_hashtags
@remove_retweets
@remove_usernames
@remove_email
@remove_links
@convert_to_ascii
def cleanup_text(text):
return text
#preprocess tokens
def remove_irrelevant_pos_tokens(tokens):
"""
remove tokens of a certain part of speech that is probably irrelevant
to the political content of the tweet
assume that only nouns, verbs, and adjectives are relevant;
remove all other tokens
"""
pos_tokens = pos_tag(tokens)
return [token for token, tag in pos_tokens
if token.startswith("N")
or token.startswith("V")
or token.startswith("J")
]
def remove_short_tokens(tokens):
"""
    remove tokens that are 2 characters long or shorter;
    we can assume such short tokens aren't important
"""
return [token for token in tokens if len(token) > 2]
def remove_stopwords(tokens):
"""
remove stopwords (i.e., "scaffold words" in English w/o much meaning)
"""
    return [token for token in tokens if token not in stopwords]
def expand_contraction_tokens(tokens):
"""
expand a list of tokens
"""
global contractions
result_tokens = []
for token in tokens:
is_contraction = False
for contraction, expansion in contractions:
if token == contraction or token == contraction.replace("'", ""):
result_tokens += expansion.split()
is_contraction = True
break
if not is_contraction:
result_tokens.append(token)
return result_tokens
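# e.g. expand_contraction_tokens(["i", "cant", "vote"])
#   -> ["i", "cannot", "vote"]  (matches "can't" with the apostrophe stripped)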
def cleanup_tokens(tokens):
"""
preprocess tokens before using them to build a featureset
"""
tokens = expand_contraction_tokens(tokens)
tokens = remove_stopwords(tokens)
tokens = remove_short_tokens(tokens)
return tokens
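# a minimal end-to-end sketch; the sample tweet is made up, and the exact
# output depends on the stopword list above
if __name__ == "__main__":
    raw = "RT @senator: Congress passed the #TaxReform bill http://t.co/x"
    text = cleanup_text(raw)
    print(text)    # -> "congress passed the tax reform bill"
    tokens = cleanup_tokens(text.split())
    print(tokens)  # -> ["congress", "passed", "tax", "reform", "bill"]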