Created
May 12, 2013 20:38
-
-
Save ryancatalani/5564842 to your computer and use it in GitHub Desktop.
Some methods for analyzing tweets (tuned to updates from #ec2013, the 2013 Emerson College commencement).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "cgi"
require "csv"
require "date"
require "json"
require "net/http"
require "time"
require "uri"
require "nokogiri"
require "open-uri"
# WARNING: globally disables SSL certificate verification by clobbering a
# library constant (Ruby emits an "already initialized constant" warning).
# Tolerable in a one-off analysis script; never do this in production code.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
times = []     # parsed Time of every tweet kept
tweets = []    # raw CSV rows from commencement day
mutchnick = [] # rows mentioning Max/Mutchnick (presumably the speaker — verify)
pelton = []    # rows mentioning Lee/Pelton (presumably the college president — verify)
# Load the tweet archive. Row layout inferred from usage below:
# row[1] = timestamp, row[2] = username, row[3] = tweet text.
CSV.foreach("2013/ec2013-2.csv") do |row|
  d = DateTime.parse(row[1]) rescue next # skips header and unparseable rows (swallows all errors)
  d = d.to_time + 3*3600 # + Time.zone_offset('EST')
  if d.day == 12 # keep only commencement day (May 12)
    times << d
    tweets << row
    mutchnick << row if row[3].downcase =~ /(max|mutchnick)/
    pelton << row if row[3].downcase =~ /(lee\W+|pelton)/
  end
end
puts tweets.count
puts mutchnick.count
puts pelton.count
puts ""
# via http://pullmonkey.com/2008/1/31/rounding-to-the-nearest-number-in-ruby/
# Rounds num up to the next multiple of nearest; returns num unchanged when
# it is already a multiple.
def roundup(num, nearest=15)
  remainder = num % nearest
  remainder.zero? ? num : num + (nearest - remainder)
end
# Rounds num down to the previous multiple of nearest (num itself when it is
# already a multiple). The subtraction covers both cases, so no branch needed.
def rounddown(num, nearest=15)
  num - (num % nearest)
end
# Rounds num to the closest multiple of nearest; exact ties round DOWN
# (up is chosen only when it is strictly closer).
def roundnearest(num, nearest=15)
  up = roundup(num, nearest)
  down = rounddown(num, nearest)
  (up - num) < (num - down) ? up : down
end
# Builds "H:MM" labels for every minutes_interval-wide slot from min_hour
# through max_hour inclusive (minutes zero-padded, hours not), e.g.
# time_intervals(30, 9, 10) => ["9:00", "9:30", "10:00", "10:30"].
# Raises ArgumentError for a non-Integer interval or min_hour > max_hour.
def time_intervals(minutes_interval=15, min_hour=7, max_hour=19)
  raise ArgumentError unless minutes_interval.is_a? Integer
  raise ArgumentError if min_hour > max_hour
  minute_marks = (0..59).select { |m| (m % minutes_interval).zero? }
  labels = []
  (min_hour..max_hour).each do |h|
    minute_marks.each { |m| labels << format("%d:%02d", h, m) }
  end
  labels
end
# Buckets rows (timestamp at index 1) into minutes_interval-wide slots between
# min_hour and max_hour; returns an array of counts positionally aligned with
# time_intervals(minutes_interval, min_hour, max_hour).
def group_by_time(arr, minutes_interval=15, min_hour=7, max_hour=19)
  intervals = time_intervals(minutes_interval, min_hour, max_hour)
  times_res = Array.new(intervals.count, 0)
  arr.reverse.each do |t|
    time = Time.parse(t[1]) + 3*3600 # + Time.zone_offset('EST')
    mmin = roundnearest(time.min.to_i, minutes_interval)
    mhour = time.hour
    next if mhour < min_hour || mhour > max_hour
    if mmin == 60 # minutes rounded up into the next hour
      mhour += 1
      mmin = 0
    end
    mtime = format("%d:%02d", mhour, mmin)
    slot = intervals.index(mtime)
    # BUG FIX: rows in the final hour whose minutes round up past max_hour
    # (e.g. 14:55 -> "15:00" with max_hour 14) used to yield a nil index and
    # crash on times_res[nil]. Skip them, consistent with the hour filter above.
    next if slot.nil?
    times_res[slot] += 1
  end
  times_res
end
# Quarter-hour tweet-volume histogram for 7:00 through 14:45.
p time_intervals(15, 7, 14)
p group_by_time(tweets, 15, 7, 14)
# Returns ["H:MM", count] pairs sorted by tweet volume, busiest slot first.
# BUG FIX: group_by_time returns a bare array of Integer counts, so the old
# comparator (b[1].count) raised NoMethodError on every call. The counts are
# now zipped with their interval labels before sorting. The optional interval
# parameters (defaulting to the old hard-coded values) keep the call
# backward-compatible while letting callers match a custom histogram.
def sort_by_time(arr, minutes_interval=15, min_hour=7, max_hour=19)
  counts = group_by_time(arr, minutes_interval, min_hour, max_hour)
  labels = time_intervals(minutes_interval, min_hour, max_hour)
  labels.zip(counts).sort { |a, b| b[1] <=> a[1] }
end
# Extracts every adjacent word pair ("bigram") from a tweet, after dropping
# URLs, @mentions, hashtags, the bare retweet marker "rt", and
# punctuation-only tokens. Pairs are produced by two offset slicing passes,
# so the even-index pairs come before the odd-index ones.
def find_bigrams(sentence)
  tokens = sentence.downcase.split(' ')
  # BUG FIX: "rt" is now anchored (\Art\z). The old unanchored alternative
  # also deleted ordinary words that merely contain "rt" (e.g. "shirt").
  tokens.delete_if { |w| w =~ /(http|@|\Art\z|#|\A[^a-zA-Z0-9]+\z)/ }
  pairs = tokens.each_slice(2).to_a
  pairs.pop if tokens.count.odd? # drop the trailing unpaired token
  tokens.shift # offset by one word for the overlapping pairs
  pairs += tokens.each_slice(2).to_a
  pairs.pop if tokens.count.odd?
  pairs.map { |pair| pair.join(' ') }
end
# -------
# Per-tweet token extraction: plain words, bigrams, URLs, hashtags, authors.
words = []
bigrams = []
urls = []
expanded_urls = [] # NOTE(review): never populated or read in this file
hashtags = []
users = []
# Stop-word list (common English words plus "rt"). It is compared against
# downcased tokens below, so the capitalized "I" entry can never match.
dont_count = ["rt","the","be","to","of","and","a","in","that","have","I","i","it","for","not","on","with","he","as","you","do","at","this","but","his","by","from","they","we","say","her","she","or","an","will","my","one","all","would","there","their","what","so","up","out","if","about","who","get","which","go","me","when","make","can","like","time","no","just","him","know","take","person","into","year","your","good","some","could","them","see","other","than","then","now","look","only","come","its","over","think","also","back","after","use","two","how","our","work","first","well","even","new","want","because","any","these","give","day","most","us","are","is","were","was","has","having","had","did","does","doing","done","said","says","saying","goes","going","went","gone","made","making","could","likes","liked","liking","knew","known","knowing","sees","seeing","saw","seen","looks","looked","looking","came","coming","thought","thinking","gave","given","giving","find","found","finding","finds","tell","told","tells","telling","ask","asks","asking","asked","works","working","worked","seem","seems","seemed","seeming","feel","felt","feels","feeling","try","tries","trying","tried","leave","left","leaves","leaving","call","calling","called","calls","last","long","great","little","own","old","right","big","high","different","small","large","next","early","young","important","few","public","bad","same","able","many","beneath","under","above"]
tweets.each do |t|
  t[3].split(' ').each do |w|
    # keep tokens that are not stop words and start with a word character
    words << w unless dont_count.include?(w.downcase) or w[0,1] =~ /\W/
    if w.include?("http://")
      # slice from "http" onward to shed any leading punctuation
      urls << w[w.index("http"),w.length]
    end
    hashtags << w.gsub(/[^#\w+]/,'').strip if w[0] == "#"
  end
  bigrams += find_bigrams(t[3])
  users << t[2]
end
# Groups identical words (downcased first unless case_sensitive) and returns
# [word, [occurrences, ...]] pairs ordered by frequency, most common first.
# When limit is given, only the top `limit` pairs are returned.
def word_freq_sort(arr,case_sensitive=false,limit=nil)
  key = case_sensitive ? ->(w) { w } : ->(w) { w.downcase }
  ranked = arr.group_by(&key).to_a
  ranked.sort_by! { |_, occurrences| -occurrences.count }
  limit ? ranked.first(limit) : ranked
end
# Prints word frequencies from word_freq_sort. With js: true, emits a single
# comma-joined line of JS object literals ({term:'…', count:N}); otherwise
# prints each word and its count on alternating lines.
def print_word_freq(arr,case_sensitive=false,limit=nil,js=false)
  ranked = word_freq_sort(arr, case_sensitive, limit)
  if js
    literals = ranked.map { |term, occurrences| "{term:'#{term}', count:#{occurrences.count}}" }
    puts literals.join(",")
  else
    ranked.each do |term, occurrences|
      puts term
      puts occurrences.count
    end
  end
end
# Prints the most-shared URLs. With only_images, resolves ow.ly / Instagram /
# Twitter photo pages to direct image URLs and emits JS object literals;
# otherwise prints each URL, its expansion, and its share count.
# Network-heavy: every URL goes through the expand_url web service.
def print_url_freq(arr,only_images=false,limit=20)
  count = 0
  grouped_arr = word_freq_sort(arr,true)
  grouped_arr.each do |w|
    if only_images
      image_urls = []
      url = w[0]
      expanded = expand_url url
      if expanded.include?("ow.ly") or expanded.include?("instagr") or expanded =~ /twitter.com\/.+\/photo/
        url = get_owly_photo(expanded) if expanded.include?("ow.ly")
        url = get_instagram_photo(expanded) if expanded.include?("instagr")
        url = get_twitter_photo(expanded) if expanded =~ /twitter.com\/.+\/photo/
        image_urls << "{url:'#{url}', count:#{w[1].count.to_i}}"
        puts image_urls.join(",")
        count += 1
        # limit counts only recognized image hosts; other URLs are skipped free
        break if count == limit
      end
    else
      # limit caps the rows printed here, but every URL is still expanded
      url = w[0]
      expanded = expand_url url
      puts url
      puts expanded
      puts get_owly_photo(expanded) if expanded.include?("ow.ly")
      puts get_instagram_photo(expanded) if expanded.include?("instagr")
      puts w[1].count
      count += 1
      break if count == limit
    end
  end
end
# Prints every row of source whose tweet text (index 3) contains q,
# case-insensitively. When position is given (an Integer character index),
# q must occur exactly at that index in the text.
# Raises RuntimeError ("Position must be a number") for a non-Integer position.
def search(source,q,position=nil)
  source.each do |t|
    if position
      # Integer, not Fixnum: Fixnum was removed in Ruby 3.2.
      raise "Position must be a number" unless position.is_a?(Integer)
      # BUG FIX: the slice length used to be position + q.length, which let q
      # match well past the requested position. to_s guards the nil slice
      # returned when position is beyond the end of the text.
      puts t if t[3][position, q.length].to_s.downcase.include?(q.downcase)
    else
      puts t if t[3].downcase.include?(q.downcase)
    end
  end
end
# Resolves a (possibly shortened) URL through the expandurl.appspot.com web
# service and returns its final destination ("end_url" in the JSON response).
def expand_url(base)
  escaped = CGI.escape(base)
  check = "http://expandurl.appspot.com/expand?url=" + escaped
  body = Net::HTTP.get_response(URI.parse(check)).body
  # SECURITY FIX: the response used to be rewritten with gsub and eval'd,
  # which executes arbitrary Ruby from a remote server. Parse it as JSON.
  JSON.parse(body)["end_url"]
end
# Maps an ow.ly photo-page URL (".../i/<id>") to its direct original-size
# image URL on static.ow.ly. Assumes the URL contains "/i/".
def get_owly_photo(url)
  id_start = url.index("/i/") + 3
  image_id = url[id_start..-1]
  "http://static.ow.ly/photos/original/#{image_id}.jpg"
end
# Fetches an Instagram photo page and returns the src of its img.photo
# element, or nil when the page has none.
def get_instagram_photo(url)
  # URI.open: Kernel#open no longer accepts URLs as of Ruby 3.0.
  doc = Nokogiri::HTML(URI.open(url))
  img = doc.at_css('img.photo')
  # BUG FIX: the old .each-with-return form returned an empty NodeSet (not
  # nil) when no image matched; callers interpolate this value into strings.
  img && img["src"]
end
# Fetches a Twitter photo page and returns the src of its slideshow image,
# or nil when the page has none.
def get_twitter_photo(url)
  # URI.open: Kernel#open no longer accepts URLs as of Ruby 3.0.
  doc = Nokogiri::HTML(URI.open(url))
  img = doc.at_css('img.media-slideshow-image')
  # BUG FIX: the old .each-with-return form returned an empty NodeSet (not
  # nil) when no image matched; callers interpolate this value into strings.
  img && img['src']
end
# def find_retweets(source) | |
# rts = [] | |
# source.each do |t| | |
# rts.each do |rt| | |
# if rt[3] == t[3] | |
# end | |
# end | |
# end | |
# Emit the top 20 words (case-insensitive) as JS object literals for a chart.
print_word_freq(words,false,20,true)
# print_url_freq(urls,true,10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment