@mikexine
Created March 23, 2016 11:16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
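"""Quick analysis of a dump of tweets collected from the Twitter Streaming
API (one JSON object per line in tweets_bruxelles.json).

Counts the most frequent terms, hashtags, mentions and bigrams, writes
Vega bar charts for them via vincent, and builds a per-minute time series
of tweets mentioning 'isis' vs 'peace'. Assumes Python 2, the NLTK
'stopwords' corpus already downloaded (nltk.download('stopwords')), and
an existing html/ output directory with an html/json/ subdirectory.
"""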
import json
import re
import string
from collections import Counter

import arrow
import pandas
import vincent
from nltk import bigrams
from nltk.corpus import stopwords
start = arrow.now()
punctuation = list(string.punctuation)
stop = (stopwords.words('french') + stopwords.words('german') +
        stopwords.words('italian') + stopwords.words('english') +
        punctuation +
        ["l'a", "amp", "ter", "les", "c'est", 'de', 'en', 'el',
         'https', 'rt', 'via', 'RT'])
print stop
emoticons_str = r"""
    (?:
        [:=;]              # Eyes
        [oO\-]?            # Nose (optional)
        [D\)\]\(\]/\\OpP]  # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',                        # HTML tags
    r'(?:@[\w_]+)',                    # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',      # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",       # words with - and '
    r'(?:[\w_]+)',                     # other words
    r'(?:\S)'                          # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$',
                         re.VERBOSE | re.IGNORECASE)
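# The order of the alternatives matters: emoticons come first so that ':)'
# is matched as one token instead of being picked apart, one character at
# a time, by the catch-all r'(?:\S)'.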
def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=True):
    tokens = tokenize(s)
    if lowercase:
        # lowercase everything except emoticons (':D' should stay ':D')
        tokens = [token if emoticon_re.search(token)
                  else token.lower() for token in tokens]
    return tokens
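# For illustration (example input, not from the original gist):
#   preprocess("RT @user: #Brussels :)")
#   -> ['rt', '@user', ':', '#brussels', ':)']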
fname = 'tweets_bruxelles.json'
pr = 0   # tweets processed successfully
err = 0  # tweets skipped because of errors
dates_isis = []   # timestamps of tweets mentioning 'isis'
dates_peace = []  # timestamps of tweets mentioning 'peace'
with open(fname, 'r') as f:
    count_stop = Counter()
    count_hashtags = Counter()
    count_mentions = Counter()
    count_bigram = Counter()
    for line in f:
        try:
            tweet = json.loads(line)
            pr += 1
            print str(tweet['created_at']) + " ok: " + str(pr)
            terms_stop = [term for term in preprocess(tweet['text'])
                          if term not in stop and
                          not term.startswith(('#', '@', 'http')) and
                          len(term) > 2]
            hashtags_only = [term for term in preprocess(tweet['text'])
                             if term.startswith('#') and
                             len(term) > 2]
            mentions_only = [term for term in preprocess(tweet['text'])
                             if term.startswith('@')]
            terms_bigram = bigrams(terms_stop)
            terms = [term for term in preprocess(tweet['text'])
                     if term not in stop and len(term) != 1]
            # track when each term is mentioned
            if 'isis' in terms:
                dates_isis.append(tweet['created_at'])
            if 'peace' in terms:
                dates_peace.append(tweet['created_at'])
        except (ValueError, KeyError):
            # malformed line, or a message without 'text'/'created_at'
            # (e.g. a delete notice): skip it, so the counter updates
            # below never see stale values from a previous iteration
            err += 1
            continue
        count_stop.update(terms_stop)
        count_hashtags.update(hashtags_only)
        count_mentions.update(mentions_only)
        count_bigram.update(terms_bigram)
        # if pr == 1000:
        #     break
nElements = 50
print "----------------------------"
print "count_bigram"
with open('html/bigrams.txt', 'w') as file_:
    file_.write(str(count_bigram.most_common(nElements)))
print "----------------------------"
print "funzionanti: " + str(pr)
print "errore: " + str(err)
print "generating most common terms json"
word_freq = count_stop.most_common(nElements)
labels, freq = zip(*word_freq)
data = {'data': freq, 'x': labels}
bar = vincent.Bar(data, iter_idx='x', height=600, width=900)
bar.x_axis_properties(label_angle=-45, label_align="right")
bar.legend(title="Most frequent terms")
bar.to_json('html/json/bruxelles_freq_terms.json')
print "generating most common hashtags json"
word_freq = count_hashtags.most_common(nElements)
labels, freq = zip(*word_freq)
data = {'data': freq, 'x': labels}
bar = vincent.Bar(data, iter_idx='x', height=600, width=900)
bar.x_axis_properties(label_angle=-45, label_align="right")
bar.legend(title="Most frequent hashtags")
bar.to_json('html/json/bruxelles_freq_hashtags.json')
print "generating most common mentions json"
word_freq = count_mentions.most_common(nElements)
labels, freq = zip(*word_freq)
data = {'data': freq, 'x': labels}
bar = vincent.Bar(data, iter_idx='x', height=600, width=900)
bar.x_axis_properties(label_angle=-45, label_align="right")
bar.legend(title="Most frequent mentions")
bar.to_json('html/json/bruxelles_freq_mentions.json')
print "time charting now"
# 1 time charting
print '1'
ones_i = [1]*len(dates_isis)   # one entry per 'isis' tweet
ones_p = [1]*len(dates_peace)  # one entry per 'peace' tweet
# 2 the index of the series
print '2'
idxn = pandas.DatetimeIndex(dates_isis)
idxp = pandas.DatetimeIndex(dates_peace)
# 3 the actual series (a series of 1s for the moment)
print '3'
isis = pandas.Series(ones_i, index=idxn)
peace = pandas.Series(ones_p, index=idxp)
# 4 Resampling / bucketing
print '4'
per_minute_i = isis.resample('1Min', how='sum').fillna(0)
per_minute_p = peace.resample('1Min', how='sum').fillna(0)
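# For illustration: timestamps [12:00:05, 12:00:40, 12:01:10] with values
# [1, 1, 1] bucket to 12:00 -> 2 and 12:01 -> 1. (The how= keyword was the
# pandas API at the time; newer pandas spells this .resample('1Min').sum().)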
# 5 all the data together
print '5'
match_data = dict(isis=per_minute_i, peace=per_minute_p)
# 6 we need a DataFrame, to accommodate multiple series
print '6'
all_matches = pandas.DataFrame(data=match_data,
                               index=per_minute_i.index)
# 7 Resampling as above
print '7'
all_matches = all_matches.resample('1Min', how='sum').fillna(0)
# 8 and now the plotting
print '8'
time_chart = vincent.Line(all_matches[['isis', 'peace']])
time_chart.axis_titles(x='Time', y='Freq')
time_chart.legend(title='Matches')
time_chart.to_json('html/json/bruxelles_time_chart.json')
print "started at: " + str(start)
print ''
end = arrow.now()  # not 'stop', which already names the stopword list
print "stopped at: " + str(end)
print ''
print "total time: " + str(end - start)