Skip to content

Instantly share code, notes, and snippets.

@jonasft
Created June 5, 2016 19:32
Show Gist options
  • Save jonasft/b57edc86d582cd0f7a13d9e3d01e1f8f to your computer and use it in GitHub Desktop.
Save jonasft/b57edc86d582cd0f7a13d9e3d01e1f8f to your computer and use it in GitHub Desktop.
import csv
import re
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import normalize
from data import lexicons
import numpy as np
from data import resources
from transformers.tfidf_transformer import TfidfTransformer
class LexiconTransformer(TransformerMixin, BaseEstimator):
    """Turn raw tweets into a four-column matrix of sentiment-lexicon scores.

    Every tweet is split on single spaces and each token is looked up in
    three lexicons (the callables ``lexicons._nrc_emotion``,
    ``lexicons._bing_liu`` and ``lexicons._mpqa``).  The per-lexicon score
    matrices are summed element-wise.

    Column layout of the score matrix (one row per tweet):

    * 0 -- sum of scores ``> 0`` for tokens without a negation suffix
    * 1 -- sum of scores ``<= 0`` for tokens without a negation suffix
    * 2 -- scores ``> 0`` of negation-suffixed tokens, subtracted
    * 3 -- scores ``<= 0`` of negation-suffixed tokens, subtracted

    Parameters
    ----------
    norm : bool, default False
        When true, each per-lexicon score matrix is passed through
        ``sklearn.preprocessing.normalize`` (default settings) before the
        matrices are summed.
    negate : bool, default True
        When true, the tweets are first run through
        ``TfidfTransformer().process_negation_in_dataset``.
        NOTE(review): presumably that step is what appends the
        ``_NEG`` / ``_NEGFIRST`` suffixes -- confirm against that helper.
    """

    # Matches a token ending in ``_NEG`` or ``_NEGFIRST``; group 1 is the
    # token with that suffix removed.  Compiled once instead of per token.
    _NEGATION_RE = re.compile(r'(.*)_NEG(?:FIRST)?$')

    def __init__(self, norm=False, negate=True):
        # Hyper-parameters are stored under the same names as the
        # constructor arguments so get_params()/set_params()/clone() see them.
        self.norm = norm
        self.negate = negate
        # Lexicon loaders: kept as callables and invoked in transform().
        self.nrc = lexicons._nrc_emotion
        self.bingliu = lexicons._bing_liu
        self.mpqa = lexicons._mpqa

    @property
    def normalize(self):
        """Backward-compatible alias for ``norm``.

        Earlier revisions stored a separate ``normalize`` attribute that the
        scorer consulted, so ``set_params(norm=...)`` never reached it.  The
        alias keeps old attribute access working while ``norm`` stays the
        single source of truth.
        """
        return self.norm

    @normalize.setter
    def normalize(self, value):
        self.norm = value

    def transform(self, raw_tweets, y=None):
        """Score ``raw_tweets`` against every lexicon and sum the results.

        Parameters
        ----------
        raw_tweets : sized iterable of str
            The tweets to score; ``len(raw_tweets)`` fixes the row count.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray of shape (len(raw_tweets), 4)
            Element-wise sum of the per-lexicon score matrices.
        """
        matrix = np.zeros((len(raw_tweets), 4))
        if self.negate:
            content = TfidfTransformer().process_negation_in_dataset(raw_tweets)
        else:
            content = raw_tweets
        for lexicon in (self.nrc, self.bingliu, self.mpqa):
            # Each lexicon attribute is a callable returning the lookup table.
            matrix += self._manual_lexicon_scorer(content, lexicon())
        return matrix

    def fit(self, raw_tweets, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def _manual_lexicon_scorer(self, raw_tweets, lexicon_dict):
        """Score every tweet against a single lexicon.

        Parameters
        ----------
        raw_tweets : sized iterable of str
            Tweets whose tokens are separated by single spaces.
        lexicon_dict : mapping
            Maps a token to a numeric score; tokens that raise ``KeyError``
            on lookup are skipped.

        Returns
        -------
        numpy.ndarray of shape (len(raw_tweets), 4)
            Scores in the column layout described on the class, passed
            through ``normalize`` when ``self.norm`` is true.
        """
        scores = np.zeros((len(raw_tweets), 4))
        negation_re = self._NEGATION_RE
        for i, tweet in enumerate(raw_tweets):
            row = scores[i]  # view into ``scores``; in-place updates stick
            for token in tweet.split(" "):
                negated = negation_re.match(token) is not None
                if negated:
                    # Strip the negation suffix before the lexicon lookup.
                    token = negation_re.sub(r'\1', token)
                try:
                    value = lexicon_dict[token]
                except KeyError:
                    # Token is not covered by this lexicon.
                    continue
                if negated:
                    row[2 if value > 0 else 3] -= value
                else:
                    row[0 if value > 0 else 1] += value
        # Read ``norm`` (not a shadow copy) so set_params(norm=...) is honoured.
        return normalize(scores) if self.norm else scores
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment