Created
March 23, 2016 11:16
-
-
Save mikexine/b791e401808f16f41144 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import operator | |
import json | |
from nltk import bigrams | |
from collections import Counter | |
import re | |
from collections import defaultdict | |
from nltk.corpus import stopwords | |
import string | |
import vincent | |
import pandas | |
import arrow | |
# Wall-clock start; printed next to the end time once the script is done.
start = arrow.now()
punctuation = list(string.punctuation)
# Everything that must not be counted as a term: the NLTK stop words of four
# languages, single punctuation characters and a few hand-picked extras.
_stop_word_list = (
    stopwords.words('french')
    + stopwords.words('german')
    + stopwords.words('italian')
    + stopwords.words('english')
    + punctuation
    + ["l'a", "amp", "ter", "les", "c'est", 'de', 'en', 'el', 'https',
       'rt', 'via', 'RT']
)
# Debug dump of the full list, in its original order.
print(_stop_word_list)
# Every token of every tweet is tested with ``in stop``; a set answers that
# in constant time where the list needed a linear scan per token.
stop = set(_stop_word_list)
# Emoticons such as :-) ;D =P :[ -- compiled with re.VERBOSE, so the layout
# and the inline comments are ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried in this order, so the specific patterns (emoticons,
# mentions, hash-tags, URLs) must stay ahead of the generic word patterns.
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    # Words with - and ': a letter, then letters/underscore/'/-, then a
    # letter.  [^\W\d_] is "any letter", so accented words such as
    # "liberte" written with an acute accent stay in one piece instead of
    # being cut at the first non-ASCII character.
    r"(?:[^\W\d_](?:[^\W\d]|['\-])+[^\W\d_])",
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

# re.UNICODE makes \w and \d follow Unicode rules on Python 2 as well (it is
# already the default for text patterns on Python 3).
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE | re.UNICODE)
# Matches a token that consists of nothing but a single emoticon.
emoticon_re = re.compile(r'^' + emoticons_str + '$',
                         re.VERBOSE | re.IGNORECASE | re.UNICODE)
def tokenize(s):
    """Return the tokens that ``tokens_re`` finds in *s*, in order.

    ``tokens_re`` wraps all of its alternatives in a single capturing group,
    so group 1 of every match is the complete token.
    """
    return [match.group(1) for match in tokens_re.finditer(s)]
def preprocess(s, lowercase=True):
    """Tokenize *s* and, unless *lowercase* is false, lower-case the tokens.

    Tokens that are a complete emoticon keep their case, so ``:D`` and
    ``:d`` remain different tokens.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalised = []
    for token in tokens:
        if emoticon_re.search(token):
            normalised.append(token)
        else:
            normalised.append(token.lower())
    return normalised
# Input file: one JSON document (a tweet) per line.
fname = 'tweets_bruxelles.json'
pr = 0   # lines read so far; also shown in the progress output
err = 0  # lines that were skipped because they could not be processed
dates_isis = []   # created_at of every tweet whose terms contain 'isis'
dates_peace = []  # created_at of every tweet whose terms contain 'peace'
count_stop = Counter()
count_hashtags = Counter()
count_mentions = Counter()
count_bigram = Counter()

with open(fname, 'r') as f:
    for line in f:
        pr += 1
        try:
            tweet = json.loads(line)
            created_at = tweet['created_at']
            # Tokenize once; every list below is a filter over these tokens.
            tokens = preprocess(tweet['text'])
        except (ValueError, KeyError, TypeError):
            # Blank/malformed line, or a record without the expected
            # fields.  Skip it entirely so that the counters are never fed
            # the lists left over from the previous tweet.
            err += 1
            continue

        print(str(created_at) + " ok: " + str(pr))

        terms_stop = [term for term in tokens
                      if term not in stop and
                      not term.startswith(('#', '@', 'http')) and
                      len(term) > 2]
        hashtags_only = [term for term in tokens
                         if term.startswith('#') and len(term) > 2]
        mentions_only = [term for term in tokens
                         if term.startswith('@')]
        terms = [term for term in tokens
                 if term not in stop and len(term) != 1]

        # Track when the plain terms occur.  Hash-tags are separate tokens
        # ('#isis'), so they do not count here.
        if 'isis' in terms:
            dates_isis.append(created_at)
        if 'peace' in terms:
            dates_peace.append(created_at)

        count_stop.update(terms_stop)
        count_hashtags.update(hashtags_only)
        count_mentions.update(mentions_only)
        count_bigram.update(bigrams(terms_stop))
# How many of the most common entries go into each output.
nElements = 50


def _save_bar_chart(counter, title, path, limit):
    """Write the *limit* most common entries of *counter* as a bar chart.

    counter -- a ``collections.Counter``
    title   -- legend title of the chart
    path    -- file that receives the chart, written with ``Bar.to_json``
    limit   -- maximum number of bars

    An empty counter is reported and skipped: ``zip(*[])`` yields nothing,
    so unpacking it into labels and frequencies would raise.
    """
    word_freq = counter.most_common(limit)
    if not word_freq:
        print("nothing to chart for: " + title)
        return
    labels, freq = zip(*word_freq)
    data = {'data': freq, 'x': labels}
    bar = vincent.Bar(data, iter_idx='x', height=600, width=900)
    bar.x_axis_properties(label_angle=-45, label_align="right")
    bar.legend(title=title)
    bar.to_json(path)


print("----------------------------")
print("count_bigram")
with open('html/bigrams.txt', 'w') as file_:
    file_.write(str(count_bigram.most_common(nElements)))
print("----------------------------")
print("funzionanti: " + str(pr))
print("errore: " + str(err))

print("generating most common terms json")
_save_bar_chart(count_stop, "Most frequent terms",
                'html/json/bruxelles_freq_terms.json', nElements)

print("generating most common hashtags json")
_save_bar_chart(count_hashtags, "Most frequent hashtags",
                'html/json/bruxelles_freq_hashtags.json', nElements)

print("generating most common mentions json")
_save_bar_chart(count_mentions, "Most frequent mentions",
                'html/json/bruxelles_freq_mentions.json', nElements)
print("time charting now")
# 1 time charting: every matching tweet contributes a count of one
print('1')
isis_hits = [1] * len(dates_isis)
peace_hits = [1] * len(dates_peace)
# 2 the index of the series
print('2')
idxn = pandas.DatetimeIndex(dates_isis)
idxp = pandas.DatetimeIndex(dates_peace)
# 3 the actual series (a series of 1s for the moment)
print('3')
isis = pandas.Series(isis_hits, index=idxn)
peace = pandas.Series(peace_hits, index=idxp)
# 4 Resampling / bucketing into one-minute bins.
# resample(rule).sum() replaces resample(rule, how='sum'): the ``how``
# keyword was deprecated in pandas 0.18 and is rejected by current releases.
print('4')
per_minute_i = isis.resample('1Min').sum().fillna(0)
per_minute_p = peace.resample('1Min').sum().fillna(0)
# 5 all the data together
print('5')
match_data = dict(isis=per_minute_i, peace=per_minute_p)
# 6 we need a DataFrame, to accommodate multiple series
# NOTE(review): the frame is indexed by the 'isis' minutes only, so 'peace'
# minutes outside that range are left out -- confirm this is intended.
print('6')
all_matches = pandas.DataFrame(data=match_data,
                               index=per_minute_i.index)
# 7 Resampling as above
print('7')
all_matches = all_matches.resample('1Min').sum().fillna(0)
# 8 and now the plotting
print('8')
time_chart = vincent.Line(all_matches[['isis', 'peace']])
time_chart.axis_titles(x='Time', y='Freq')
time_chart.legend(title='Matches')
time_chart.to_json('html/json/bruxelles_time_chart.json')
# Final timing report: start time, end time and the difference between them.
print("started at: %s" % (start,))
print('')
# NOTE: from here on ``stop`` is the end timestamp, no longer the stop-word
# collection built at the top of the script.
stop = arrow.now()
elapsed = stop - start
print("stopped at: %s" % (stop,))
print('')
print("total time: %s" % (elapsed,))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment