# jonasft/lexicon_transformer.py

## lexicon_transformer.py
import csv
import re
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import normalize
from data import lexicons
import numpy as np
from data import resources
from transformers.tfidf_transformer import TfidfTransformer

class LexiconTransformer(TransformerMixin, BaseEstimator):
	"""Turn raw tweets into a four-column matrix of lexicon scores.

	Every tweet is split on single spaces and each token is looked up in
	the three lexicons returned by ``lexicons._nrc_emotion()``,
	``lexicons._bing_liu()`` and ``lexicons._mpqa()``. The per-lexicon
	score matrices are added together. Columns of the result:

	* 0 -- sum of lexicon values ``> 0`` for tokens without a negation suffix
	* 1 -- sum of lexicon values ``<= 0`` for tokens without a negation suffix
	* 2 -- minus the sum of values ``> 0`` for tokens with a negation suffix
	* 3 -- minus the sum of values ``<= 0`` for tokens with a negation suffix

	A token has a negation suffix when it ends in ``_NEG`` or ``_NEGFIRST``.

	Parameters:
		norm: when truthy, each per-lexicon score matrix is passed through
			``sklearn.preprocessing.normalize`` (default arguments) before
			the matrices are summed.
		negate: when truthy, tweets are first run through
			``TfidfTransformer().process_negation_in_dataset``
			(presumably the step that appends the negation suffixes --
			confirm against that helper).
	"""

	def __init__(self, norm=False, negate=True):
		# Constructor parameters are stored under their own names so that
		# BaseEstimator.get_params / set_params / clone see them.
		self.norm = norm
		self.negate = negate
		# Lexicon loaders; they are called on every transform().
		self.nrc = lexicons._nrc_emotion
		self.bingliu = lexicons._bing_liu
		self.mpqa = lexicons._mpqa

	@property
	def normalize(self):
		"""Backward-compatible alias of ``norm``.

		Previously ``normalize`` was a separate attribute that
		``set_params(norm=...)`` did not update, so the scorer could ignore
		a changed parameter. Aliasing keeps both names in sync.
		"""
		return self.norm

	@normalize.setter
	def normalize(self, value):
		self.norm = value

	def fit(self, raw_tweets, y=None):
		"""Do nothing and return ``self``; the transformer has no fitted state."""
		return self

	def transform(self, raw_tweets, y=None):
		"""Return the summed lexicon score matrix for ``raw_tweets``.

		Parameters:
			raw_tweets: sized sequence of tweet strings.
			y: ignored; present for scikit-learn API compatibility.

		Returns:
			``numpy.ndarray`` of shape ``(len(raw_tweets), 4)``.
		"""
		matrix = np.zeros((len(raw_tweets), 4))
		if self.negate:
			content = TfidfTransformer().process_negation_in_dataset(raw_tweets)
		else:
			content = raw_tweets
		for load_lexicon in (self.nrc, self.bingliu, self.mpqa):
			matrix += self._manual_lexicon_scorer(content, load_lexicon())
		return matrix

	def _manual_lexicon_scorer(self, raw_tweets, lexicon_dict):
		"""Score every tweet against a single lexicon.

		Parameters:
			raw_tweets: sized sequence of tweet strings.
			lexicon_dict: mapping from token to a numeric value; tokens
				that raise ``KeyError`` on lookup are skipped.

		Returns:
			``numpy.ndarray`` of shape ``(len(raw_tweets), 4)`` with the
			column layout described on the class, normalized with
			``sklearn.preprocessing.normalize`` when ``self.norm`` is truthy.
		"""
		# Compiled once per call instead of being re-resolved for each token.
		negation_re = re.compile(r'(.*)_NEG(?:FIRST)?$')
		scores = np.zeros((len(raw_tweets), 4))
		for i, tweet in enumerate(raw_tweets):
			for token in tweet.split(" "):
				negated = negation_re.match(token) is not None
				if negated:
					# Strip the suffix so the bare word is looked up.
					token = negation_re.sub(r'\1', token)
				try:
					value = lexicon_dict[token]
				except KeyError:
					# Token is not in this lexicon: contributes nothing.
					continue
				if negated:
					scores[i][2 if value > 0 else 3] -= value
				else:
					scores[i][0 if value > 0 else 1] += value
		return normalize(scores) if self.norm else scores
	import csv
	import re
	from sklearn.base import TransformerMixin, BaseEstimator
	from sklearn.preprocessing import normalize
	from data import lexicons
	import numpy as np
	from data import resources
	from transformers.tfidf_transformer import TfidfTransformer

	class LexiconTransformer(TransformerMixin, BaseEstimator):
		"""Turn raw tweets into a four-column matrix of lexicon scores.

		NOTE(review): this definition repeats the ``LexiconTransformer``
		class defined earlier in this file and its original indentation was
		lost; it looks like a paste artifact -- confirm whether it should be
		removed.

		Every tweet is split on single spaces and each token is looked up in
		the three lexicons returned by ``lexicons._nrc_emotion()``,
		``lexicons._bing_liu()`` and ``lexicons._mpqa()``. The per-lexicon
		score matrices are added together. Columns of the result:

		* 0 -- sum of lexicon values ``> 0`` for tokens without a negation suffix
		* 1 -- sum of lexicon values ``<= 0`` for tokens without a negation suffix
		* 2 -- minus the sum of values ``> 0`` for tokens with a negation suffix
		* 3 -- minus the sum of values ``<= 0`` for tokens with a negation suffix

		A token has a negation suffix when it ends in ``_NEG`` or ``_NEGFIRST``.

		Parameters:
			norm: when truthy, each per-lexicon score matrix is passed through
				``sklearn.preprocessing.normalize`` (default arguments) before
				the matrices are summed.
			negate: when truthy, tweets are first run through
				``TfidfTransformer().process_negation_in_dataset``
				(presumably the step that appends the negation suffixes --
				confirm against that helper).
		"""

		def __init__(self, norm=False, negate=True):
			# Constructor parameters are stored under their own names so that
			# BaseEstimator.get_params / set_params / clone see them.
			self.norm = norm
			self.negate = negate
			# Lexicon loaders; they are called on every transform().
			self.nrc = lexicons._nrc_emotion
			self.bingliu = lexicons._bing_liu
			self.mpqa = lexicons._mpqa

		@property
		def normalize(self):
			"""Backward-compatible alias of ``norm``.

			Keeping ``normalize`` as an alias rather than a separate
			attribute means ``set_params(norm=...)`` is always honoured by
			the scorer.
			"""
			return self.norm

		@normalize.setter
		def normalize(self, value):
			self.norm = value

		def fit(self, raw_tweets, y=None):
			"""Do nothing and return ``self``; the transformer has no fitted state."""
			return self

		def transform(self, raw_tweets, y=None):
			"""Return the summed lexicon score matrix for ``raw_tweets``.

			Parameters:
				raw_tweets: sized sequence of tweet strings.
				y: ignored; present for scikit-learn API compatibility.

			Returns:
				``numpy.ndarray`` of shape ``(len(raw_tweets), 4)``.
			"""
			matrix = np.zeros((len(raw_tweets), 4))
			if self.negate:
				content = TfidfTransformer().process_negation_in_dataset(raw_tweets)
			else:
				content = raw_tweets
			for load_lexicon in (self.nrc, self.bingliu, self.mpqa):
				matrix += self._manual_lexicon_scorer(content, load_lexicon())
			return matrix

		def _manual_lexicon_scorer(self, raw_tweets, lexicon_dict):
			"""Score every tweet against a single lexicon.

			Parameters:
				raw_tweets: sized sequence of tweet strings.
				lexicon_dict: mapping from token to a numeric value; tokens
					that raise ``KeyError`` on lookup are skipped.

			Returns:
				``numpy.ndarray`` of shape ``(len(raw_tweets), 4)`` with the
				column layout described on the class, normalized with
				``sklearn.preprocessing.normalize`` when ``self.norm`` is
				truthy.
			"""
			# Compiled once per call instead of being re-resolved for each token.
			negation_re = re.compile(r'(.*)_NEG(?:FIRST)?$')
			scores = np.zeros((len(raw_tweets), 4))
			for i, tweet in enumerate(raw_tweets):
				for token in tweet.split(" "):
					negated = negation_re.match(token) is not None
					if negated:
						# Strip the suffix so the bare word is looked up.
						token = negation_re.sub(r'\1', token)
					try:
						value = lexicon_dict[token]
					except KeyError:
						# Token is not in this lexicon: contributes nothing.
						continue
					if negated:
						scores[i][2 if value > 0 else 3] -= value
					else:
						scores[i][0 if value > 0 else 1] += value
			return normalize(scores) if self.norm else scores