@geoom
Last active April 19, 2016 21:17
ANN scripts
# analizer.py
class TextualAnalizer(object):

    STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any',
                  'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
                  'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did',
                  'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each',
                  'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t',
                  'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself',
                  'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if',
                  'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more',
                  'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only',
                  'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t',
                  'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than',
                  'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s',
                  'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through',
                  'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll',
                  'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where',
                  'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t',
                  'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
                  'yourself', 'yourselves']
    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...']
    EXCLAMATION_MARK = '!'
    NOISE_MARKS = ['/', '&']
    POSITIVE_EMOTICONS = [':)', ':D']
    NEUTRAL_EMOTICONS = [':|']
    NEGATIVE_EMOTICONS = [':(', ':\'(']
    def __init__(self, text):
        # Keep the original casing for the capitalized-word count;
        # every other metric works on the lowercased copy.
        self.original_text = text
        self.text = text.lower()
        print "text is ", self.text
    def _discard_terms(self, *terms_lists):
        # Strip every given term (punctuation, emoticons, noise) from the text.
        for term_list in terms_lists:
            for term in term_list:
                if term in self.text:
                    self.text = self.text.replace(term, '')

    def _get_ocurrences_number(self, term_list):
        # Count how many terms from term_list appear in the text
        # (each term is counted at most once, regardless of repetitions).
        counter = 0
        self.text = self.text.lower()
        for term in term_list:
            if term in self.text:
                counter += 1
        return counter
    def get_stop_words_number(self):
        return self._get_ocurrences_number(self.STOP_WORDS)

    def get_words_number(self, exclude_stop_words=False):
        return len(self.get_words_list(exclude_stop_words=exclude_stop_words))

    def get_words_list(self, only_uniques=False, exclude_stop_words=False):
        self._discard_terms(self.PUNCTUATION_MARKS, self.EXCLAMATION_MARK,
                            self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
                            self.NEGATIVE_EMOTICONS, self.NOISE_MARKS)
        all_words = self.text.strip().split(' ')
        # Drop empty strings, hashtags, URLs and mentions.
        all_words = filter(lambda item: item != '' and '#' not in item and 'http' not in item and '@' not in item,
                           all_words)
        if exclude_stop_words:
            all_words = [word for word in all_words if word not in self.STOP_WORDS]
        return list(set(all_words)) if only_uniques else all_words
    def get_punctuation_marks_number(self):
        return self._get_ocurrences_number(self.PUNCTUATION_MARKS)

    def get_exclamation_marks_number(self):
        exclamation_string = filter(lambda item: item == self.EXCLAMATION_MARK, self.text)
        return len(exclamation_string)

    def get_capitalized_words_number(self):
        # Count on the original text: self.text was lowercased in __init__,
        # so it can never contain capitalized words.
        ocurrences = [word for word in self.original_text.split() if word[0].isupper()]
        return len(ocurrences)

    def get_positive_emoticons_number(self):
        return self._get_ocurrences_number(self.POSITIVE_EMOTICONS)

    def get_neutral_emoticons_number(self):
        return self._get_ocurrences_number(self.NEUTRAL_EMOTICONS)

    def get_negative_emoticons_number(self):
        return self._get_ocurrences_number(self.NEGATIVE_EMOTICONS)
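
A quick, hypothetical check of the analyzer on a made-up tweet (Python 2, matching the rest of the gist); in the real flow it is driven from the miner script below:

# Illustrative usage of TextualAnalizer; the sample tweet is made up.
# Mark/emoticon counts are taken before get_words_list, which strips them.
analizer = TextualAnalizer("I love my new #gopro :) it is AWESOME !")
print analizer.get_positive_emoticons_number()   # 1
print analizer.get_exclamation_marks_number()    # 1
print analizer.get_capitalized_words_number()    # 2 ("I" and "AWESOME")
print analizer.get_words_list(exclude_stop_words=True)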
# handler.py
import urllib
import settings
from xml.dom import minidom
from twitter import *
class TwitterHandler(object):

    def __init__(self, query):
        self.query = query
        self.twitter = Twitter(auth=OAuth(settings.ACCESS_KEY, settings.ACCESS_SECRET,
                                          settings.CONSUMER_KEY, settings.CONSUMER_SECRET))

    def get_product_tweets(self):
        results = self.twitter.search.tweets(q=self.query, count=5)
        return results["statuses"]
class FileHandler(object):

    def __init__(self, row_format_in_file):
        self.row_format_in_file = row_format_in_file
        self.output_file = settings.OUTPUT_FILENAME

    def save(self, stored_data):
        output_file = open(self.output_file, "a")
        row = self.row_format_in_file % stored_data
        output_file.write(row)
        output_file.close()

    def clean(self):
        output_file = open(self.output_file, "w")
        output_file.write('')
        output_file.close()
class DALHandler(object):

    NEGATIVE_WORD, NEUTRAL_WORD, POSITIVE_WORD = (-1, 0, 1)

    def __init__(self, word_list):
        # Instance-level lists: as class attributes they would be shared by
        # every DALHandler and accumulate polarities across tweets.
        self.word_affect_list = list()
        self.polarity_list = list()
        query = '+'.join(word_list)
        self.url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s' % query

    def _get_remote_document(self):
        remote_doc = urllib.urlopen(self.url).read()
        parsed_doc = minidom.parseString(remote_doc)
        return parsed_doc
    @staticmethod
    def get_polarity(valence):
        valence = float(valence)
        normalization_factor = 3.0
        result = valence/normalization_factor
        if result < 0.5:
            return DALHandler.NEGATIVE_WORD
        elif result > 0.8:
            return DALHandler.POSITIVE_WORD
        return DALHandler.NEUTRAL_WORD
    def make_word_affect_list(self):
        doc = self._get_remote_document()
        words = doc.getElementsByTagName("word")
        for word in words:
            token_tag = word.getElementsByTagName('token')[0]
            emotion_measure_tag = word.getElementsByTagName('measure')[0]
            valence = emotion_measure_tag.getAttribute("valence")
            polarity = DALHandler.get_polarity(valence) if len(valence) > 0 else None
            self.word_affect_list.append(
                (token_tag.firstChild.data, polarity))
            self.polarity_list.append(polarity)
        print 'word_affect_list', self.word_affect_list

    def get_positive_word_number(self):
        return self.polarity_list.count(DALHandler.POSITIVE_WORD)

    def get_negative_word_number(self):
        return self.polarity_list.count(DALHandler.NEGATIVE_WORD)

    def get_neutral_word_number(self):
        return self.polarity_list.count(DALHandler.NEUTRAL_WORD)
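
get_polarity divides the DAL valence score (roughly a 1-3 scale) by 3.0 and buckets the result at 0.5 and 0.8; a few hypothetical valence values make the thresholds concrete:

# Hypothetical valence values run through DALHandler.get_polarity (Python 2).
print DALHandler.get_polarity('1.2')   # 1.2/3.0 = 0.40 < 0.5  -> -1 (negative)
print DALHandler.get_polarity('2.0')   # 2.0/3.0 = 0.67        ->  0 (neutral)
print DALHandler.get_polarity('2.7')   # 2.7/3.0 = 0.90 > 0.8  ->  1 (positive)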
# miner script (entry point)
from handler import FileHandler, TwitterHandler, DALHandler
from analizer import TextualAnalizer
import settings
class MinedProductTweet(object):

    def __init__(self, tweet_text):
        self.tweet_text = tweet_text
        self.positive_words_number = 0
        self.neutral_words_number = 0
        self.negative_words_number = 0
        self.stop_words_number = 0
        self.words_number = 0
        self.punctuation_marks_number = 0
        self.exclamation_marks_number = 0
        self.capitalized_words_number = 0
        self.positive_emoticons_number = 0
        self.neutral_emoticons_number = 0
        self.negative_emoticons_number = 0
    def make_data(self):
        analizer = TextualAnalizer(self.tweet_text)
        self.stop_words_number = analizer.get_stop_words_number()
        # Count marks and emoticons before the word-list calls below,
        # which strip them from the analyzer's text.
        self.punctuation_marks_number = analizer.get_punctuation_marks_number()
        self.exclamation_marks_number = analizer.get_exclamation_marks_number()
        self.capitalized_words_number = analizer.get_capitalized_words_number()
        self.positive_emoticons_number = analizer.get_positive_emoticons_number()
        self.neutral_emoticons_number = analizer.get_neutral_emoticons_number()
        self.negative_emoticons_number = analizer.get_negative_emoticons_number()
        self.words_number = analizer.get_words_number()
        word_list = analizer.get_words_list(exclude_stop_words=True)
        handler = DALHandler(word_list)
        handler.make_word_affect_list()
        self.positive_words_number = handler.get_positive_word_number()
        self.neutral_words_number = handler.get_neutral_word_number()
        self.negative_words_number = handler.get_negative_word_number()
class MinedProduct(object):

    def __init__(self, hashtag):
        # Instance-level list: as a class attribute it would be shared by all
        # MinedProduct instances and mix tweets from different products.
        self.product_tweet_list = list()
        self.hashtag = hashtag
        self.tweets_number = 0
        self.retweet_percentage = 0
        self.price_from_amazon_seller = 0
        self.sell_ranking = 0
        self.rating_by_clients = 0
        self.elapsed_time_since_release = 0
        self.average_positive_words_number = 0
        self.average_neutral_words_number = 0
        self.average_negative_words_number = 0
        self.average_stop_words_number = 0
        self.average_words_number = 0
        self.average_punctuation_marks_number = 0
        self.average_exclamation_marks_number = 0
        self.average_capitalized_marks_number = 0
        self.average_positive_emoticons_number = 0
        self.average_neutral_emoticons_number = 0
        self.average_negative_emoticons_number = 0
        self.acceptability = 0
    def make_data(self):
        handler = TwitterHandler(self.hashtag)
        tweet_results = handler.get_product_tweets()
        self.tweets_number = len(tweet_results)
        for tweet in tweet_results:
            product_tweet = MinedProductTweet(tweet['text'].encode('utf-8'))
            product_tweet.make_data()
            self.product_tweet_list.append(product_tweet)
        positive_words_number_list, neutral_words_number_list, \
            negative_words_number_list = zip(*[(product_tweet.positive_words_number,
                                                product_tweet.neutral_words_number,
                                                product_tweet.negative_words_number)
                                               for product_tweet in self.product_tweet_list])
        # Use float division so the averages are not truncated to integers.
        self.average_positive_words_number = sum(positive_words_number_list)/float(len(positive_words_number_list))
        self.average_neutral_words_number = sum(neutral_words_number_list)/float(len(neutral_words_number_list))
        self.average_negative_words_number = sum(negative_words_number_list)/float(len(negative_words_number_list))
        print self.__dict__

    def calculate_acceptability(self):
        pass

    def save(self):
        row_format_in_file = "%(tweets_number)s, %(average_positive_words_number)s, " \
                             "%(average_neutral_words_number)s, %(average_negative_words_number)s\n"
        handler = FileHandler(row_format_in_file)
        handler.save(self.__dict__)
class Miner(object):

    product_hashtag_list = settings.ALL_PRODUCT_HASTAGS

    def perform_mining(self):
        for product_hashtag in self.product_hashtag_list:
            product = MinedProduct(product_hashtag)
            product.make_data()
            # product.calculate_acceptability()
            product.save()


miner = Miner()
miner.perform_mining()
# settings.py
CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
OUTPUT_FILENAME = "products.txt"
ALL_PRODUCT_HASTAGS = ['#gopro']
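
For reference, MinedProduct.save appends one comma-separated line per product to OUTPUT_FILENAME (products.txt); a hypothetical product with 5 tweets would produce something like the following (the numbers are made up):

# Hypothetical row written by FileHandler.save for one product (Python 2).
row_format = "%(tweets_number)s, %(average_positive_words_number)s, " \
             "%(average_neutral_words_number)s, %(average_negative_words_number)s\n"
sample = {'tweets_number': 5, 'average_positive_words_number': 2.4,
          'average_neutral_words_number': 3.0, 'average_negative_words_number': 0.8}
print row_format % sample   # -> 5, 2.4, 3.0, 0.8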