Instantly share code, notes, and snippets.

Embed
What would you like to do?
Calculate Chi Squared Value for each term
import re
# SMRT station names, written the way they are expected to appear in tweets
# (case aside); every entry carries the trailing " MRT".
STATIONS = [
    'Admiralty MRT',
    'Aljunied MRT',
    'Ang Mo Kio MRT',
    'Bartley MRT',
    'Bayfront MRT',
    'Bedok MRT',
    'Bishan MRT',
    'Bras Basah MRT',
    'Botanic Gardens MRT',
    'Braddell MRT',
    'Bukit Batok MRT',
    'Bukit Gombak MRT',
    'Caldecott MRT',
    'Choa Chu Kang MRT',
    'Boon Keng MRT',
    'Boon Lay MRT',
    'Buangkok MRT',
    'Bugis MRT',
    'Buona Vista MRT',
    'Changi Airport MRT',
    'Chinatown MRT',
    'Clarke Quay MRT',
    'Chinese Garden MRT',
    'City Hall MRT',
    'Clementi MRT',
    'Commonwealth MRT',
    'Dakota MRT',
    'Dhoby Ghaut MRT',
    'Dover MRT',
    'Downtown MRT',
    'Esplanade MRT',
    'Eunos MRT',
    'Expo MRT',
    'Farrer Park MRT',
    'Farrer Road MRT',
    'HarbourFront MRT',
    'Haw Par Villa MRT',
    'Holland Village MRT',
    'Hougang MRT',
    'Joo Koon MRT',
    'Jurong East MRT',
    'Kallang MRT',
    'Kovan MRT',
    'Kembangan MRT',
    'Kent Ridge MRT',
    'Khatib MRT',
    'Kranji MRT',
    'Lakeside MRT',
    'Labrador Park MRT',
    'Lavender MRT',
    'Little India MRT',
    'Lorong Chuan MRT',
    'Marina Bay MRT',
    'Marsiling MRT',
    'MacPherson MRT',
    'Marymount MRT',
    'Mountbatten MRT',
    'Newton MRT',
    'Nicoll Highway MRT',
    'one-north MRT',
    'Novena MRT',
    'Orchard MRT',
    'Outram Park MRT',
    'Pasir Ris MRT',
    'Pasir Panjang MRT',
    'Paya Lebar MRT',
    'Pioneer MRT',
    'Potong Pasir MRT',
    'Promenade MRT',
    'Punggol MRT',
    'Queenstown MRT',
    'Raffles Place MRT',
    'Redhill MRT',
    'Sembawang MRT',
    'Sengkang MRT',
    'Serangoon MRT',
    'Simei MRT',
    'Somerset MRT',
    'Stadium MRT',
    'Tampines MRT',
    'Tai Seng MRT',
    'Tanah Merah MRT',
    'Tanjong Pagar MRT',
    'Tiong Bahru MRT',
    'Telok Ayer MRT',
    'Telok Blangah MRT',
    'Toa Payoh MRT',
    'Woodlands MRT',
    'Woodleigh MRT',
    'Yew Tree MRT',
    'Yio Chu Kang MRT',
    'Yishun MRT',
]

# One alternation that matches any station name in lower-cased text.
# Each name is escaped individually so that a regex metacharacter in a
# station name (present or future) is matched literally instead of being
# interpreted as pattern syntax.
regex = re.compile('|'.join(re.escape(name.lower()) for name in STATIONS))
# regular expressions used to normalize tweet
http_re = re.compile(r'\s+http://[^\s]*')
remove_ellipsis_re = re.compile(r'\.\.\.')
at_sign_re = re.compile(r'\@\S+')
punct_re = re.compile(r"[\"'\[\],.:;()\-&!]")
price_re = re.compile(r"\d+\.\d\d")
number_re = re.compile(r"\d+")
def normalize_tweet(tweet):
    """Return a lower-cased copy of *tweet* with noisy tokens canonicalized.

    The substitutions run strictly in the order listed below, each one on
    the output of the previous pass:

    1. price-like numbers       -> 'PRICE'
    2. three-dot ellipses       -> removed
    3. known station names      -> 'MRT_STATION'
    4. links (per ``http_re``)  -> ' LINK'
    5. punctuation characters   -> removed
    6. @-mentions               -> '@'
    7. remaining digit runs     -> 'NUM'
    """
    # Order matters: prices and links must be replaced before punctuation is
    # stripped (the stripping removes '.' and ':'), and plain numbers are
    # handled last so they do not break up prices.
    substitutions = (
        (price_re, 'PRICE'),
        (remove_ellipsis_re, ''),
        (regex, 'MRT_STATION'),
        (http_re, ' LINK'),
        (punct_re, ''),
        (at_sign_re, '@'),
        (number_re, 'NUM'),
    )
    text = tweet.lower()
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
# Build the parallel lists of normalized tweets and their classes; the
# station name is used as the class of each tweet.
# Each non-blank line of tweets.csv is "<tweet>\t<station>".
stations, tweets = [], []
# open() works on both Python 2 and 3 (file() is Python 2 only), and the
# with-block guarantees the handle is closed.
# NOTE(review): the file is read with the platform default encoding --
# confirm that matches how tweets.csv was written.
with open('tweets.csv') as tweet_file:
    for line in tweet_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        # Split on the LAST tab so a tab inside the tweet text does not
        # break the two-field unpack; a line without any tab still raises
        # ValueError, as before.
        (tweet, station) = line.rsplit('\t', 1)
        stations.append(station)
        tweets.append(normalize_tweet(tweet))

# Encode each station as the index of its first occurrence in `stations`
# (the same value stations.index() yields), using one dictionary pass rather
# than a linear scan per row. A real list is built so the result can be
# indexed and re-iterated on Python 3 as well.
_first_seen = {}
for _position, _name in enumerate(stations):
    _first_seen.setdefault(_name, _position)
station_idx = [_first_seen[_name] for _name in stations]
# Feature extraction and feature scoring both come from scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

# Extract features with the Term Frequency - Inverse Document Frequency
# method: X holds one row per normalized tweet and one column per term.
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(tweets)

# Compute chi2 for each feature to test how closely each feature is
# correlated with its class. Only the first element of chi2()'s result
# (the per-feature statistics) is kept.
chi2score = chi2(X, y=station_idx)[0]
import matplotlib.pyplot as plt

# Plot the 100 terms with the highest chi-squared score as a horizontal bar
# chart, lowest of the selection at the bottom.
plt.figure(figsize=(16, 30))

# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Pair every term with its score and sort ascending by score.
wscores = list(zip(feature_names, chi2score))
wchi2 = sorted(wscores, key=lambda x: x[1])

# Transpose the top-100 (term, score) pairs into (terms, scores). list() is
# required because zip() returns a non-subscriptable iterator on Python 3.
topchi2 = list(zip(*wchi2[-100:]))
x = range(len(topchi2[1]))
labels = topchi2[0]

plt.barh(x, topchi2[1], align='center', alpha=0.2, color='g')
plt.plot(topchi2[1], x, '-o', markersize=5, alpha=0.8, color='g')
plt.yticks(x, labels)
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
plt.xlabel(r'$\chi^2$')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment