Created
May 12, 2013 20:38
-
-
Save ryancatalani/5564842 to your computer and use it in GitHub Desktop.
Some methods for analyzing tweets (tuned to updates from #ec2013, the 2013 Emerson College commencement).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "cgi"
require "csv"
require "date"
require "json"
require "net/http"
require "time"
require "uri"
require "nokogiri"
require "open-uri"
# WARNING: globally disables SSL certificate verification by clobbering a
# library constant (Ruby emits an "already initialized constant" warning).
# Tolerable in a one-off analysis script; never do this in production code.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
times = []     # parsed Time of every tweet kept
tweets = []    # raw CSV rows from commencement day
mutchnick = [] # rows mentioning Max/Mutchnick (presumably the speaker — verify)
pelton = []    # rows mentioning Lee/Pelton (presumably the college president — verify)
# Load the tweet archive. Row layout inferred from usage below:
# row[1] = timestamp, row[2] = username, row[3] = tweet text.
CSV.foreach("2013/ec2013-2.csv") do |row|
  d = DateTime.parse(row[1]) rescue next # skips header and unparseable rows (swallows all errors)
  d = d.to_time + 3*3600 # + Time.zone_offset('EST')
  if d.day == 12 # keep only commencement day (May 12)
    times << d
    tweets << row
    mutchnick << row if row[3].downcase =~ /(max|mutchnick)/
    pelton << row if row[3].downcase =~ /(lee\W+|pelton)/
  end
end
puts tweets.count
puts mutchnick.count
puts pelton.count
puts ""
# via http://pullmonkey.com/2008/1/31/rounding-to-the-nearest-number-in-ruby/
# Rounds num up to the next multiple of nearest; returns num unchanged when
# it is already a multiple.
def roundup(num, nearest=15)
  remainder = num % nearest
  remainder.zero? ? num : num + (nearest - remainder)
end
# Rounds num down to the previous multiple of nearest (num itself when it is
# already a multiple). The subtraction covers both cases, so no branch needed.
def rounddown(num, nearest=15)
  num - (num % nearest)
end
# Rounds num to the closest multiple of nearest; exact ties round DOWN
# (up is chosen only when it is strictly closer).
def roundnearest(num, nearest=15)
  up = roundup(num, nearest)
  down = rounddown(num, nearest)
  (up - num) < (num - down) ? up : down
end
# Builds "H:MM" labels for every minutes_interval-wide slot from min_hour
# through max_hour inclusive (minutes zero-padded, hours not), e.g.
# time_intervals(30, 9, 10) => ["9:00", "9:30", "10:00", "10:30"].
# Raises ArgumentError for a non-Integer interval or min_hour > max_hour.
def time_intervals(minutes_interval=15, min_hour=7, max_hour=19)
  raise ArgumentError unless minutes_interval.is_a? Integer
  raise ArgumentError if min_hour > max_hour
  minute_marks = (0..59).select { |m| (m % minutes_interval).zero? }
  labels = []
  (min_hour..max_hour).each do |h|
    minute_marks.each { |m| labels << format("%d:%02d", h, m) }
  end
  labels
end
# Buckets rows (timestamp at index 1) into minutes_interval-wide slots between
# min_hour and max_hour; returns an array of counts positionally aligned with
# time_intervals(minutes_interval, min_hour, max_hour).
def group_by_time(arr, minutes_interval=15, min_hour=7, max_hour=19)
  intervals = time_intervals(minutes_interval, min_hour, max_hour)
  times_res = Array.new(intervals.count, 0)
  arr.reverse.each do |t|
    time = Time.parse(t[1]) + 3*3600 # + Time.zone_offset('EST')
    mmin = roundnearest(time.min.to_i, minutes_interval)
    mhour = time.hour
    next if mhour < min_hour || mhour > max_hour
    if mmin == 60 # minutes rounded up into the next hour
      mhour += 1
      mmin = 0
    end
    mtime = format("%d:%02d", mhour, mmin)
    slot = intervals.index(mtime)
    # BUG FIX: rows in the final hour whose minutes round up past max_hour
    # (e.g. 14:55 -> "15:00" with max_hour 14) used to yield a nil index and
    # crash on times_res[nil]. Skip them, consistent with the hour filter above.
    next if slot.nil?
    times_res[slot] += 1
  end
  times_res
end
# Quarter-hour tweet-volume histogram for 7:00 through 14:45.
p time_intervals(15, 7, 14)
p group_by_time(tweets, 15, 7, 14)
# Returns ["H:MM", count] pairs sorted by tweet volume, busiest slot first.
# BUG FIX: group_by_time returns a bare array of Integer counts, so the old
# comparator (b[1].count) raised NoMethodError on every call. The counts are
# now zipped with their interval labels before sorting. The optional interval
# parameters (defaulting to the old hard-coded values) keep the call
# backward-compatible while letting callers match a custom histogram.
def sort_by_time(arr, minutes_interval=15, min_hour=7, max_hour=19)
  counts = group_by_time(arr, minutes_interval, min_hour, max_hour)
  labels = time_intervals(minutes_interval, min_hour, max_hour)
  labels.zip(counts).sort { |a, b| b[1] <=> a[1] }
end
# Extracts every adjacent word pair ("bigram") from a tweet, after dropping
# URLs, @mentions, hashtags, the bare retweet marker "rt", and
# punctuation-only tokens. Pairs are produced by two offset slicing passes,
# so the even-index pairs come before the odd-index ones.
def find_bigrams(sentence)
  tokens = sentence.downcase.split(' ')
  # BUG FIX: "rt" is now anchored (\Art\z). The old unanchored alternative
  # also deleted ordinary words that merely contain "rt" (e.g. "shirt").
  tokens.delete_if { |w| w =~ /(http|@|\Art\z|#|\A[^a-zA-Z0-9]+\z)/ }
  pairs = tokens.each_slice(2).to_a
  pairs.pop if tokens.count.odd? # drop the trailing unpaired token
  tokens.shift # offset by one word for the overlapping pairs
  pairs += tokens.each_slice(2).to_a
  pairs.pop if tokens.count.odd?
  pairs.map { |pair| pair.join(' ') }
end
# -------
# Per-tweet token extraction: plain words, bigrams, URLs, hashtags, authors.
words = []
bigrams = []
urls = []
expanded_urls = [] # NOTE(review): never populated or read in this file
hashtags = []
users = []
# Stop-word list (common English words plus "rt"). It is compared against
# downcased tokens below, so the capitalized "I" entry can never match.
dont_count = ["rt","the","be","to","of","and","a","in","that","have","I","i","it","for","not","on","with","he","as","you","do","at","this","but","his","by","from","they","we","say","her","she","or","an","will","my","one","all","would","there","their","what","so","up","out","if","about","who","get","which","go","me","when","make","can","like","time","no","just","him","know","take","person","into","year","your","good","some","could","them","see","other","than","then","now","look","only","come","its","over","think","also","back","after","use","two","how","our","work","first","well","even","new","want","because","any","these","give","day","most","us","are","is","were","was","has","having","had","did","does","doing","done","said","says","saying","goes","going","went","gone","made","making","could","likes","liked","liking","knew","known","knowing","sees","seeing","saw","seen","looks","looked","looking","came","coming","thought","thinking","gave","given","giving","find","found","finding","finds","tell","told","tells","telling","ask","asks","asking","asked","works","working","worked","seem","seems","seemed","seeming","feel","felt","feels","feeling","try","tries","trying","tried","leave","left","leaves","leaving","call","calling","called","calls","last","long","great","little","own","old","right","big","high","different","small","large","next","early","young","important","few","public","bad","same","able","many","beneath","under","above"]
tweets.each do |t|
  t[3].split(' ').each do |w|
    # keep tokens that are not stop words and start with a word character
    words << w unless dont_count.include?(w.downcase) or w[0,1] =~ /\W/
    if w.include?("http://")
      # slice from "http" onward to shed any leading punctuation
      urls << w[w.index("http"),w.length]
    end
    hashtags << w.gsub(/[^#\w+]/,'').strip if w[0] == "#"
  end
  bigrams += find_bigrams(t[3])
  users << t[2]
end
# Groups identical words (downcased first unless case_sensitive) and returns
# [word, [occurrences, ...]] pairs ordered by frequency, most common first.
# When limit is given, only the top `limit` pairs are returned.
def word_freq_sort(arr,case_sensitive=false,limit=nil)
  key = case_sensitive ? ->(w) { w } : ->(w) { w.downcase }
  ranked = arr.group_by(&key).to_a
  ranked.sort_by! { |_, occurrences| -occurrences.count }
  limit ? ranked.first(limit) : ranked
end
# Prints word frequencies from word_freq_sort. With js: true, emits a single
# comma-joined line of JS object literals ({term:'…', count:N}); otherwise
# prints each word and its count on alternating lines.
def print_word_freq(arr,case_sensitive=false,limit=nil,js=false)
  ranked = word_freq_sort(arr, case_sensitive, limit)
  if js
    literals = ranked.map { |term, occurrences| "{term:'#{term}', count:#{occurrences.count}}" }
    puts literals.join(",")
  else
    ranked.each do |term, occurrences|
      puts term
      puts occurrences.count
    end
  end
end
# Prints the most-shared URLs. With only_images, resolves ow.ly / Instagram /
# Twitter photo pages to direct image URLs and emits JS object literals;
# otherwise prints each URL, its expansion, and its share count.
# Network-heavy: every URL goes through the expand_url web service.
def print_url_freq(arr,only_images=false,limit=20)
  count = 0
  grouped_arr = word_freq_sort(arr,true)
  grouped_arr.each do |w|
    if only_images
      image_urls = []
      url = w[0]
      expanded = expand_url url
      if expanded.include?("ow.ly") or expanded.include?("instagr") or expanded =~ /twitter.com\/.+\/photo/
        url = get_owly_photo(expanded) if expanded.include?("ow.ly")
        url = get_instagram_photo(expanded) if expanded.include?("instagr")
        url = get_twitter_photo(expanded) if expanded =~ /twitter.com\/.+\/photo/
        image_urls << "{url:'#{url}', count:#{w[1].count.to_i}}"
        puts image_urls.join(",")
        count += 1
        # limit counts only recognized image hosts; other URLs are skipped free
        break if count == limit
      end
    else
      # limit caps the rows printed here, but every URL is still expanded
      url = w[0]
      expanded = expand_url url
      puts url
      puts expanded
      puts get_owly_photo(expanded) if expanded.include?("ow.ly")
      puts get_instagram_photo(expanded) if expanded.include?("instagr")
      puts w[1].count
      count += 1
      break if count == limit
    end
  end
end
# Prints every row of source whose tweet text (index 3) contains q,
# case-insensitively. When position is given (an Integer character index),
# q must occur exactly at that index in the text.
# Raises RuntimeError ("Position must be a number") for a non-Integer position.
def search(source,q,position=nil)
  source.each do |t|
    if position
      # Integer, not Fixnum: Fixnum was removed in Ruby 3.2.
      raise "Position must be a number" unless position.is_a?(Integer)
      # BUG FIX: the slice length used to be position + q.length, which let q
      # match well past the requested position. to_s guards the nil slice
      # returned when position is beyond the end of the text.
      puts t if t[3][position, q.length].to_s.downcase.include?(q.downcase)
    else
      puts t if t[3].downcase.include?(q.downcase)
    end
  end
end
# Resolves a (possibly shortened) URL through the expandurl.appspot.com web
# service and returns its final destination ("end_url" in the JSON response).
def expand_url(base)
  escaped = CGI.escape(base)
  check = "http://expandurl.appspot.com/expand?url=" + escaped
  body = Net::HTTP.get_response(URI.parse(check)).body
  # SECURITY FIX: the response used to be rewritten with gsub and eval'd,
  # which executes arbitrary Ruby from a remote server. Parse it as JSON.
  JSON.parse(body)["end_url"]
end
# Maps an ow.ly photo-page URL (".../i/<id>") to its direct original-size
# image URL on static.ow.ly. Assumes the URL contains "/i/".
def get_owly_photo(url)
  id_start = url.index("/i/") + 3
  image_id = url[id_start..-1]
  "http://static.ow.ly/photos/original/#{image_id}.jpg"
end
# Fetches an Instagram photo page and returns the src of its img.photo
# element, or nil when the page has none.
def get_instagram_photo(url)
  # URI.open: Kernel#open no longer accepts URLs as of Ruby 3.0.
  doc = Nokogiri::HTML(URI.open(url))
  img = doc.at_css('img.photo')
  # BUG FIX: the old .each-with-return form returned an empty NodeSet (not
  # nil) when no image matched; callers interpolate this value into strings.
  img && img["src"]
end
# Fetches a Twitter photo page and returns the src of its slideshow image,
# or nil when the page has none.
def get_twitter_photo(url)
  # URI.open: Kernel#open no longer accepts URLs as of Ruby 3.0.
  doc = Nokogiri::HTML(URI.open(url))
  img = doc.at_css('img.media-slideshow-image')
  # BUG FIX: the old .each-with-return form returned an empty NodeSet (not
  # nil) when no image matched; callers interpolate this value into strings.
  img && img['src']
end
# def find_retweets(source) | |
# rts = [] | |
# source.each do |t| | |
# rts.each do |rt| | |
# if rt[3] == t[3] | |
# end | |
# end | |
# end | |
# Emit the top 20 words (case-insensitive) as JS object literals for a chart.
print_word_freq(words,false,20,true)
# print_url_freq(urls,true,10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment