Skip to content

Instantly share code, notes, and snippets.

@ryancatalani
Created May 12, 2013 20:38
Show Gist options
  • Save ryancatalani/5564842 to your computer and use it in GitHub Desktop.
Save ryancatalani/5564842 to your computer and use it in GitHub Desktop.
Some methods for analyzing tweets (tuned to updates from #ec2013, the 2013 Emerson College commencement).
require "cgi"
require "csv"
require "json"
require "net/http"
require 'open-uri'
require "uri"

require 'nokogiri'
# NOTE(review): reassigning this constant makes VERIFY_PEER equal to VERIFY_NONE for the
# whole process, i.e. code that asks for peer verification no longer gets it. Ruby also
# warns about the constant being reinitialized. Left as-is; confirm it is still wanted.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE

# Accumulators filled while scanning the CSV export.
times = []      # shifted Time of every kept row
tweets = []     # every kept row
mutchnick = []  # kept rows whose text matches /(max|mutchnick)/
pelton = []     # kept rows whose text matches /(lee\W+|pelton)/

CSV.foreach("2013/ec2013-2.csv") do |row|
  # Column 1 holds the timestamp; rows where it does not parse are skipped.
  stamp = begin
    DateTime.parse(row[1])
  rescue
    next
  end
  stamp = stamp.to_time + 3*3600 # + Time.zone_offset('EST')

  # Only rows falling on the 12th (after the 3-hour shift) are kept.
  next unless stamp.day == 12

  times << stamp
  tweets << row

  # Column 3 holds the tweet text.
  text = row[3].downcase
  mutchnick << row if text =~ /(max|mutchnick)/
  pelton << row if text =~ /(lee\W+|pelton)/
end

# Summary counts, followed by a blank line.
puts tweets.count, mutchnick.count, pelton.count, ""
# via http://pullmonkey.com/2008/1/31/rounding-to-the-nearest-number-in-ruby/
# Rounds +num+ up to the next multiple of +nearest+.
# A value that is already a multiple is returned unchanged.
def roundup(num, nearest=15)
  remainder = num % nearest
  return num if remainder == 0

  num + nearest - remainder
end
# Rounds +num+ down to the previous multiple of +nearest+.
# A value that is already a multiple is returned unchanged.
def rounddown(num, nearest=15)
  remainder = num % nearest
  if remainder == 0
    num
  else
    num - remainder
  end
end
# Rounds +num+ to whichever multiple of +nearest+ is closer.
# When both neighbours are equally far away the lower one wins.
def roundnearest(num, nearest=15)
  ceiling = roundup(num, nearest)
  floor = rounddown(num, nearest)
  (ceiling - num) < (num - floor) ? ceiling : floor
end
# Builds the list of "H:MM" labels for every interval start between
# +min_hour+ and +max_hour+ (both inclusive).
#
# minutes_interval - Integer step; minutes 0..59 divisible by it are used.
# min_hour         - first hour to include.
# max_hour         - last hour to include.
#
# Returns a flat Array of Strings such as "7:00", "7:15", ...
# Raises ArgumentError when minutes_interval is not an Integer or when
# min_hour is greater than max_hour.
def time_intervals(minutes_interval=15, min_hour=7, max_hour=19)
  raise ArgumentError unless minutes_interval.is_a? Integer
  raise ArgumentError if min_hour > max_hour

  marks = (0..59).select { |minute| minute % minutes_interval == 0 }

  labels = []
  min_hour.upto(max_hour) do |hour|
    marks.each do |minute|
      # Single-digit minutes are zero-padded; hours are not.
      labels << (minute < 10 ? "#{hour}:0#{minute}" : "#{hour}:#{minute}")
    end
  end
  labels
end
# Counts rows per time bucket.
#
# arr              - rows whose element 1 is a timestamp string readable by Time.parse.
# minutes_interval - bucket width in minutes (passed to time_intervals / roundnearest).
# min_hour         - first hour covered by the bucket table.
# max_hour         - last hour covered by the bucket table.
#
# Returns an Array of Integers, one count per label of
# time_intervals(minutes_interval, min_hour, max_hour), in the same order.
#
# Rows whose hour is outside min_hour..max_hour are ignored. Rows whose minute
# rounds past the last label of the table (e.g. 14:55 with max_hour 14 rounds to
# 15:00) are ignored as well; previously they hit a nil index and raised TypeError.
def group_by_time(arr, minutes_interval=15, min_hour=7, max_hour=19)
  intervals = time_intervals(minutes_interval, min_hour, max_hour)
  times_res = Array.new(intervals.count, 0)

  # Label -> position lookup, built once instead of scanning the labels per row.
  slot_for = {}
  intervals.each_with_index { |label, i| slot_for[label] = i }

  arr.each do |t|
    # Same fixed 3-hour shift the loader applies.
    # NOTE(review): presumably a time-zone correction for the export - confirm.
    time = Time.parse(t[1]) + 3*3600
    mhour = time.hour
    next if mhour < min_hour || mhour > max_hour

    mmin = roundnearest(time.min, minutes_interval)
    if mmin == 60
      # Rounded up into the next hour.
      mhour += 1
      mmin = 0
    end

    label = mmin < 10 ? "#{mhour}:0#{mmin}" : "#{mhour}:#{mmin}"
    slot = slot_for[label]
    next unless slot # rounded outside the table

    times_res[slot] += 1
  end
  times_res
end
# Print the 15-minute bucket labels for hours 7 through 14, then the number of
# loaded tweets counted into each of those buckets.
p time_intervals(15, 7, 14)
p group_by_time(tweets, 15, 7, 14)
# Returns the per-bucket counts of +arr+ (see group_by_time, default arguments)
# ordered from the busiest bucket to the quietest.
#
# group_by_time yields plain Integers, so the counts are compared directly; the
# previous comparator called `[1].count` on them and always raised NoMethodError.
def sort_by_time(arr)
  group_by_time(arr).sort { |a, b| b <=> a }
end
# Returns every pair of adjacent words in +sentence+ as "word1 word2" strings,
# lowercased and in reading order.
#
# Before pairing, these tokens are dropped:
#   * anything containing "http", "@" or "#" (links, mentions, hashtags),
#   * the retweet marker "rt" (optionally followed by punctuation, e.g. "rt:"),
#   * tokens made only of non-alphanumeric characters.
#
# The "rt" test is anchored to the whole token. It used to match anywhere, which
# silently removed words such as "party" or "starts" and paired up words that were
# not actually adjacent.
#
# Returns an empty Array when fewer than two words remain.
def find_bigrams(sentence)
  noise = /http|@|#|\Art\W*\z|\A[^a-zA-Z0-9]+\z/
  tokens = sentence.downcase.split(' ').reject { |w| w =~ noise }
  tokens.each_cons(2).map { |pair| pair.join(' ') }
end
# -------
# Accumulators filled by the tweet loop that follows.
words = []          # tokens that are not stop words and do not start with a non-word character
bigrams = []        # adjacent-word pairs returned by find_bigrams
urls = []           # tokens containing "http://", cut to start at "http"
expanded_urls = []  # NOTE(review): never written to in the code visible here
hashtags = []       # tokens starting with "#", stripped to "#", word characters and "+"
users = []          # column 2 of every tweet row
# Stop words excluded from +words+. Tokens are downcased before the lookup, so the
# capitalised "I" entry can never match.
dont_count = ["rt","the","be","to","of","and","a","in","that","have","I","i","it","for","not","on","with","he","as","you","do","at","this","but","his","by","from","they","we","say","her","she","or","an","will","my","one","all","would","there","their","what","so","up","out","if","about","who","get","which","go","me","when","make","can","like","time","no","just","him","know","take","person","into","year","your","good","some","could","them","see","other","than","then","now","look","only","come","its","over","think","also","back","after","use","two","how","our","work","first","well","even","new","want","because","any","these","give","day","most","us","are","is","were","was","has","having","had","did","does","doing","done","said","says","saying","goes","going","went","gone","made","making","could","likes","liked","liking","knew","known","knowing","sees","seeing","saw","seen","looks","looked","looking","came","coming","thought","thinking","gave","given","giving","find","found","finding","finds","tell","told","tells","telling","ask","asks","asking","asked","works","working","worked","seem","seems","seemed","seeming","feel","felt","feels","feeling","try","tries","trying","tried","leave","left","leaves","leaving","call","calling","called","calls","last","long","great","little","own","old","right","big","high","different","small","large","next","early","young","important","few","public","bad","same","able","many","beneath","under","above"]
# Walk every kept tweet once and fill the accumulators declared above.
tweets.each do |tweet|
  text = tweet[3]

  text.split(' ').each do |token|
    # Keep the token as a word unless it is a stop word or starts with a non-word character.
    skipped = dont_count.include?(token.downcase) || token[0,1] =~ /\W/
    words << token unless skipped

    # Links: keep everything from "http" to the end of the token.
    urls << token[token.index("http"),token.length] if token.include?("http://")

    # Hashtags: drop everything except "#", word characters and "+".
    hashtags << token.gsub(/[^#\w+]/,'').strip if token[0] == "#"
  end

  bigrams += find_bigrams(text)
  users << tweet[2]
end
# Groups equal entries of +arr+ and orders the groups by size, largest first.
#
# arr            - Array of Strings.
# case_sensitive - when false (default) entries are grouped by their downcased form.
# limit          - when given, only the first +limit+ groups are returned.
#
# Returns an Array of [key, occurrences] pairs, where occurrences is the Array of
# original entries that fell under that key.
def word_freq_sort(arr,case_sensitive=false,limit=nil)
  groups = arr.group_by { |word| case_sensitive ? word : word.downcase }
  ranked = groups.to_a.sort { |a,b| b[1].count <=> a[1].count }
  return ranked.first(limit) if limit

  ranked
end
# Prints the frequency table produced by word_freq_sort.
#
# arr            - Array of Strings to count.
# case_sensitive - passed through to word_freq_sort.
# limit          - passed through to word_freq_sort.
# js             - when true, prints one line of comma-separated JavaScript object
#                  literals ({term:'...', count:N}); otherwise prints each term and
#                  its count on separate lines.
#
# In JS mode single quotes and backslashes inside a term are backslash-escaped so
# that terms such as "don't" no longer terminate the string literal early.
def print_word_freq(arr,case_sensitive=false,limit=nil,js=false)
  grouped_arr = word_freq_sort(arr,case_sensitive,limit)
  if js
    entries = grouped_arr.map do |term, occurrences|
      # Block form of gsub avoids backreference handling in the replacement.
      escaped = term.to_s.gsub(/['\\]/) { |ch| "\\#{ch}" }
      "{term:'#{escaped}', count:#{occurrences.count}}"
    end
    puts entries.join(",")
  else
    grouped_arr.each do |term, occurrences|
      puts term
      puts occurrences.count
    end
  end
end
# Prints URLs from +arr+ ordered by how often they occur (case-sensitive grouping).
#
# arr         - Array of URL strings.
# only_images - when true, only URLs whose expanded form points at ow.ly, Instagram
#               or a Twitter photo page are printed, one {url:'...', count:N} literal
#               per line, with the URL replaced by the direct image address.
#               When false, each URL is printed followed by its expanded form, any
#               ow.ly / Instagram image address, and its count.
# limit       - stop after this many printed entries.
#
# Every URL visited is expanded through expand_url, which performs a network request.
def print_url_freq(arr,only_images=false,limit=20)
  shown = 0
  word_freq_sort(arr,true).each do |short_url, hits|
    expanded = expand_url short_url

    if only_images
      from_owly = expanded.include?("ow.ly")
      from_instagram = expanded.include?("instagr")
      from_twitter = expanded =~ /twitter.com\/.+\/photo/
      # Entries that are not images do not count towards the limit.
      next unless from_owly || from_instagram || from_twitter

      photo = short_url
      photo = get_owly_photo(expanded) if from_owly
      photo = get_instagram_photo(expanded) if from_instagram
      photo = get_twitter_photo(expanded) if from_twitter
      puts "{url:'#{photo}', count:#{hits.count.to_i}}"
    else
      puts short_url
      puts expanded
      puts get_owly_photo(expanded) if expanded.include?("ow.ly")
      puts get_instagram_photo(expanded) if expanded.include?("instagr")
      puts hits.count
    end

    shown += 1
    break if shown == limit
  end
end
# Prints every row of +source+ whose text (column 3) contains +q+, ignoring case.
#
# source   - Array of rows; element 3 of each row is the text searched.
# q        - String to look for.
# position - optional Integer; when given, +q+ must occur exactly at that character
#            offset of the text, not just anywhere in it.
#
# Returns +source+.
# Raises RuntimeError ("Position must be a number") when position is given but is
# not an Integer.
#
# Changes from the earlier version:
#   * the type check uses Integer; Fixnum no longer exists on current Rubies, so any
#     call with a position raised NameError;
#   * the window passed to String#[] is q.length characters long. It used to be
#     position + q.length, i.e. an end offset was passed where a length is expected;
#   * rows without text, or shorter than +position+, are skipped, not crashed on.
def search(source,q,position=nil)
  raise "Position must be a number" if position && !position.is_a?(Integer)

  needle = q.downcase
  source.each do |t|
    text = t[3]
    next unless text

    window = position ? text[position, q.length] : text
    next unless window # position lies beyond the end of the text

    puts t if window.downcase.include?(needle)
  end
end
# Resolves a shortened URL through the expandurl.appspot.com service.
#
# base - the short URL to expand.
#
# Returns the value of the "end_url" field of the service's JSON reply (nil when the
# field is absent).
# Raises JSON::ParserError when the reply is not valid JSON; network errors from
# Net::HTTP propagate unchanged.
#
# The reply is parsed with JSON.parse. It used to be rewritten into a Ruby hash
# literal and passed to eval, which executed whatever the remote server returned,
# failed on JSON `null`, and corrupted values containing `":`.
def expand_url(base)
  check = "http://expandurl.appspot.com/expand?url=" + CGI.escape(base)
  body = Net::HTTP.get_response(URI.parse(check)).body
  JSON.parse(body)["end_url"]
end
# Builds the direct image address for an ow.ly photo page.
#
# url - ow.ly page URL containing "/i/<id>"; everything after "/i/" is used as the id.
#
# Returns the "http://static.ow.ly/photos/original/<id>.jpg" String.
def get_owly_photo(url)
  id_start = url.index("/i/") + 3
  photo_id = url[id_start..-1]
  "http://static.ow.ly/photos/original/#{photo_id}.jpg"
end
# Fetches an Instagram page and returns the address of its photo.
#
# url - page URL to download.
#
# Returns the "src" attribute of the first <img class="photo"> element, or nil when
# the page has none (an empty node set used to be returned in that case).
#
# The page is fetched through URI#open, not Kernel#open: Kernel#open no longer
# fetches URLs on Ruby 3+, and it runs a shell command when handed a string starting
# with "|" - these URLs come from an external service. The block form closes the
# handle once the document has been parsed.
def get_instagram_photo(url)
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }
  img = doc.at_css('img.photo')
  img && img["src"]
end
# Fetches a Twitter photo page and returns the address of its image.
#
# url - page URL to download.
#
# Returns the "src" attribute of the first <img class="media-slideshow-image">
# element, or nil when the page has none (an empty node set used to be returned in
# that case).
#
# The page is fetched through URI#open, not Kernel#open: Kernel#open no longer
# fetches URLs on Ruby 3+, and it runs a shell command when handed a string starting
# with "|" - these URLs come from an external service. The block form closes the
# handle once the document has been parsed.
def get_twitter_photo(url)
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }
  img = doc.at_css('img.media-slideshow-image')
  img && img['src']
end
# def find_retweets(source)
# rts = []
# source.each do |t|
# rts.each do |rt|
# if rt[3] == t[3]
# end
# end
# end
# Print the 20 most frequent words (grouped case-insensitively) as one line of
# comma-separated {term:'...', count:N} JavaScript object literals.
print_word_freq(words,false,20,true)
# print_url_freq(urls,true,10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment