# markmcspadden/Twitter_Forecasting_1.rb

## Twitter_Forecasting_1.rb
# In this model, we'll use a Markov chain to calculate probabilities associated with "Tweeting"

# A few code helpers for our exercise
# Presentation helper on Float, used when printing probabilities below.
class Float
  # Scales the receiver by 100 so that a 0..1 probability reads as a
  # percentage (0.5 becomes 50.0). No rounding is applied.
  def to_percentage
    100.0 * self
  end
end

# Now let's define our states
# State 0 will represent a 'dormant' user with no tweets in a given 24 hour period
# State 1 will represent an 'active' user with between 1 and 10 tweets in a given 24 hour period
# State 2 will represent a 'super' user with over 10 tweets in a given 24 hour period

# We want to answer questions like:
# (Each question is kept as a string so it can be printed right next to its computed answer.)
q1 = "If a user is dormant today, what's the chance they'll be active tomorrow?"
q2 = "If a user is active today, what's the chance they'll be active for the next 3 days?"
q3 = "If a user is active today, what's the chance that they'll slip into being dormant for the next 5 days?"
q4 = "What is the average number of days a user stays at the 'super' user level?"

# We'll start with these questions focused on an individual user, but the implications of what we find could have application in other areas of twitter.

###
### Question 1
###

# The first question is one we are going to guess at for our model today.
# We may come back and actually calculate it based on some twitter sampling, but for now, let's guess

# We're going to setup a matrix of these transition possibilities as such:

# This is our first transition row.
# Every row lists its probabilities in state order [dormant, active, super] and sums to 1.
# Given a user is dormant today, what is the probability he will:
# 0) Be dormant tomorrow (0.7)
# 1) Be active tomorrow (0.2)
# 2) Be super tomorrow (0.1)
t0 = [ 0.7, 0.2, 0.1 ]

# Now for our second
# Given a user is active today, what is the probability he will:
# 0) Be dormant tomorrow (0.3)
# 1) Be active tomorrow (0.5)
# 2) Be super tomorrow (0.2)
t1 = [ 0.3, 0.5, 0.2 ]

# Our third and final transition
# Given a user is super today, what is the probability he will:
# 0) Be dormant tomorrow (0.2)
# 1) Be active tomorrow (0.6)
# 2) Be super tomorrow (0.2)
t2 = [ 0.2, 0.6, 0.2 ]

# Now we create our full transition matrix
# Using this matrix we can quickly access the probability of moving from one state (x1) to another (x2) with
#     transition[x1][x2]
# (the first index is today's state, the second is tomorrow's state)
#
# Ex. If we want to find the probability that a dormant user today will be an active user tomorrow, we do
#     transition[0][1]
transition = [ t0, t1, t2 ]

# So to answer question #1 we read the dormant -> active entry straight out of the matrix:
dormant_to_active = transition[0][1]
q1_answer = dormant_to_active.to_percentage
puts "#{q1} #{q1_answer}%"


###
### Question 2
###

# Question 2 is where we start the heavy lifting of the day.
# If we assume that our model is a Markov process, we assume that a user's history does not affect their future:
# tomorrow depends only on today.
# So when we want the probability of a given sequence of states, we simply multiply the
# individual transition probabilities together.

# active -> active, three days in a row
stay_active = transition[1][1]
q2_answer = (stay_active * stay_active * stay_active).to_percentage
puts "#{q2} #{q2_answer}%"


###
### Question 3
###

# The third question is very similar:
# one active -> dormant step, followed by four dormant -> dormant steps (five dormant days in total).

go_dormant = transition[1][0]
stay_dormant = transition[0][0]
q3_answer = (go_dormant * stay_dormant * stay_dormant * stay_dormant * stay_dormant).to_percentage
puts "#{q3} #{q3_answer}%"


###
### Question 4
###

# We use a variation for the fourth.
# If a user stays 'super' with probability p each day, the expected length of a stay is 1/(1-p) days.

stay_super = transition[2][2]
q4_answer = (1 / (1 - stay_super))
puts "#{q4} #{q4_answer} days"


###
### Conclusion
###

# Using an assumed Markov property, it is fairly easy to compute these simple probabilities.
# The challenges come in:
# 1) Actually creating a model from REAL data
# 2) Comparing our model's results to occurrences found in the real data set
# 3) Computing more complex and meaningful questions
#
# I hope to take a stab at at least one of these in Exercise #3.


### Exercise 3

# So we've looked at pulling transition numbers out of the air
# But what if we actually wanted to base those on something?

# Let's start with a sample size of 1...me
# I'll use HTTParty to pull down my last 6 months' worth of tweets
# Then we'll compare each day to the next to generate our transition numbers

require 'rubygems' # I know...I know
require 'active_support'
require 'httparty'
require 'json'
require 'yaml'

# Downloads the markmcspadden timeline one page at a time and caches every
# status' 'created_at' value (one per line) in a "my_statuses" file that sits
# next to this script.
#
# Paging continues until the last status seen is older than six months, or
# until a page comes back with no statuses at all.
def fetch_statuses
  all_statuses = []

  cutoff = Time.now.advance(:months => -6)
  last_updated_at = Time.now
  page = 1

  while last_updated_at > cutoff
    puts "------------------"
    puts "REQUEST FOR PAGE ##{page}"
    puts "------------------"

    my_timeline = "http://twitter.com/statuses/user_timeline/markmcspadden.json?page=#{page}"

    response = HTTParty.get(my_timeline, :format => :json)
    statuses = JSON.parse(response.body)

    # An empty page never moves last_updated_at, so without this check the
    # loop would keep requesting pages forever once the timeline runs out.
    break if statuses.empty?

    statuses.each do |status|
      # Kernel#y is only defined inside irb on current Rubies, so dump the YAML explicitly.
      puts status.to_yaml
      puts status['created_at']

      last_updated_at = status['created_at'].to_time

      all_statuses << status
    end

    page += 1
  end

  # Cache the dates into a file
  filename = File.expand_path(File.dirname(__FILE__) + "/my_statuses")
  File.open(filename, "w") do |f|
    all_statuses.each do |s|
      f.puts s['created_at']
    end
  end
end


# If I haven't already fetched my twitter statuses, go get them
# (File.exist? is used because File.exists? has been removed from current Rubies)
statuses_file = File.expand_path(File.dirname(__FILE__) + "/my_statuses")
fetch_statuses unless File.exist?(statuses_file)


# Get how many tweets per day on days where I tweeted.
# Keys are the string form of the start of each day; values are that day's tweet count.
dates = ActiveSupport::OrderedHash.new
File.open(statuses_file) do |file|
  # each_line replaces IO#lines, which has been removed from current Rubies
  file.each_line do |line|
    # A blank line in the cache carries no timestamp; skip it
    next if line.strip.empty?

    day = line.to_time.beginning_of_day
    dates["#{day}"] = (dates["#{day}"] || 0) + 1
  end
end

# y dates

# find_count reads the per-day counts through this instance variable
@dates = dates

# Now check for transitions over the period of the last 6 months
# Returns the number of tweets recorded in @dates for the day containing
# +date+. Days that are missing from @dates count as zero.
def find_count(date)
  day_key = date.to_time.beginning_of_day.to_s
  tweets = @dates[day_key]
  tweets ? tweets : 0
end
# Classifies the day containing +date+ by its tweet count (as reported by find_count):
#   "dormant" - no tweets
#   "active"  - 1 to 10 tweets
#   "super"   - more than 10 tweets
def find_status(date)
  case find_count(date)
  when 0 then "dormant"
  # The upper bound is 10 so that it agrees with the state definitions at the
  # top of this file ('active' is between 1 and 10 tweets).
  when 1..10 then "active"
  else "super"
  end
end
# Maps a status name to its state index in the transition matrix:
# "dormant" => 0, "active" => 1, "super" => 2. Any other value yields nil.
def find_status_code(status)
  # `when x then y` replaces the `when x: y` form, which is a syntax error since Ruby 1.9
  case status
  when "dormant" then 0
  when "active" then 1
  when "super" then 2
  end
end


now = Time.now.utc # The utc will get ya ;)
six_months_ago = now.advance(:months => -6)
# Length of the six month window, in (fractional) days
days = (now - six_months_ago) / (60 * 60 * 24)

# Each entry is a [yesterday_code, today_code] pair for two consecutive days
all_transitions = []

(1..days).each do |offset|
  later_day = now.advance(:days => -offset)
  earlier_day = now.advance(:days => -offset - 1)

  # puts "#{find_status(earlier_day)} -> #{find_status(later_day)}"

  all_transitions << [
    find_status_code(find_status(earlier_day)),
    find_status_code(find_status(later_day))
  ]
end

# Sort out the types of transitions: one bucket per "yesterday" state
dormant_transitions, active_transitions, super_transitions = (0..2).map do |from_state|
  all_transitions.select { |pair| pair[0] == from_state }
end

# Go through each bucket and figure out its probabilities of moving to each state.
# Row = yesterday's state, column = today's state.
# A bucket with no observations divides 0 by 0.0, so its whole row comes out as NaN.
transition = [dormant_transitions, active_transitions, super_transitions].map do |bucket|
  (0..2).map do |to_state|
    bucket.select { |pair| pair[1] == to_state }.size / bucket.size.to_f
  end
end

transition_string =<<-EOS
Transition probabilities

[ #{transition[0][0]} #{transition[0][1]} #{transition[0][2]} ]
[ #{transition[1][0]} #{transition[1][1]} #{transition[1][2]} ]
[ #{transition[2][0]} #{transition[2][1]} #{transition[2][2]} ]

EOS

puts transition_string

# NOTE(review): only transitions that start from an 'active' day are counted here;
# 'super' days are left out -- confirm that is what "tweeting" should mean.
puts "Simple Probabability of Tweeting on a Given Day: #{active_transitions.size/days.to_f}"

puts ""

# When using our new numbers with our previous exercise we get something like

# puts "=== Random Guessing"
# require '2'

puts ""

puts "=== Sample of Mark McSpadden"
# Presumably a snapshot of the rows produced by an earlier run of the sampling above.
# NOTE(review): t0/t1/t2 are assigned here but never folded back into `transition`, so the
# answers below are computed from the matrix estimated earlier in this file, not from
# these literals -- confirm which one was intended.
t0 = [ 0.460526315789474, 0.539473684210526, 0.0 ]
t1 = [ 0.403846153846154, 0.586538461538462, 0.00961538461538462 ]
t2 = [ 0.0, 1.0, 0.0 ]

# This is ugly...but I really don't want to mess with Classifying 2 right now
q1 = "If a user is dormant today, what's the chance they'll be active tomorrow?"
q2 = "If a user is active today, what's the chance they'll be active for the next 3 days?"
q3 = "If a user is active today, what's the chance that they'll slip into being dormant for the next 5 days?"
q4 = "What is the average number of days a user stays at the 'super' user level?"
# Q1: the dormant -> active entry of the matrix
q1_answer = transition[0][1].to_percentage
puts "#{q1} #{q1_answer}%"
# Q2: active -> active three times in a row
q2_answer = (transition[1][1] * transition[1][1] * transition[1][1]).to_percentage
puts "#{q2} #{q2_answer}%"
# Q3: one active -> dormant step followed by four dormant -> dormant steps
q3_answer = (transition[1][0] * transition[0][0] * transition[0][0] * transition[0][0] * transition[0][0]).to_percentage
puts "#{q3} #{q3_answer}%"
# Q4: expected length of a stay in 'super', 1/(1-p) with p the super -> super probability
q4_answer = (1/(1-transition[2][2]))
puts "#{q4} #{q4_answer} days"


# So as we see, a sample of 1 is not great, especially in evaluating the super category
# The next step will be to either do a sampling of our twitter friends or of the twitter population at large....
# But that appears to be a fight for another day.
	# In this model, we'll use a Markov chain to calculate probabilities associated with "Tweeting"

	# A few code helpers for our exercise
	# Presentation helper on Float, used when printing probabilities.
	class Float
	  # Returns the receiver multiplied by 100 (0.25 becomes 25.0); no rounding.
	  def to_percentage
	    100.0 * self
	  end
	end

	# Now let's define our states
	# State 0 will represent a 'dormant' user with no tweets in a given 24 hour period
	# State 1 will represent an 'active' user with between 1 and 10 tweets in a given 24 hour period
	# State 2 will represent a 'super' user with over 10 tweets in a given 24 hour period

	# We want to answer questions like:
	q1 = "If a user is dormant today, what's the chance they'll be active tomorrow?"
	q2 = "If a user is active today, what's the chance they'll be active for the next 3 days?"
	q3 = "If a user is active today, what's the chance that they'll slip into being dormant for the next 5 days?"
	q4 = "What is the average number of days a user stays at the 'super' user level?"

	# We'll start with these questions focused on an individual user, but the implications of what we find could have application in other areas of twitter.

	###
	### Question 1
	###

	# The first question is one we are going to guess at for our model today.
	# We may come back and actually calculate it based on some twitter sampling, but for now, let's guess

	# We're going to setup a matrix of these transition possibilities as such:

	# This is our first transition
	# Given a user is dormant today, what is the probability he will:
	# 0) Be dormant tomorrow (0.7)
	# 1) Be active tomorrow (0.2)
	# 2) Be super tomorrow (0.1)
	t0 = [ 0.7, 0.2, 0.1 ]

	# Now for our second
	# Given a user is active today, what is the probability he will:
	# 0) Be dormant tomorrow
	# 1) Be active tomorrow
	# 2) Be super tomorrow
	t1 = [ 0.3, 0.5, 0.2 ]

	# Our third and final transition
	# Given a user is super today, what is the probability he will:
	# 1) Be dormant tomorrow
	# 2) Be active tomorrow
	# 3) Be super tomorrow
	t2 = [ 0.2, 0.6, 0.2 ]

	# Now we create our full transition matrix
	# Using this matrix we can quickly access the probability of moving from one state (x1) to another (x2) with
	# transition[x1][x2]
	#
	# Ex. If we want to find the probability that a dormant user today will be an active user tomorrow, we do
	# transition[0][1]
	transition = [ t0, t1, t2 ]

	# So to answer question #1:
	q1_answer = transition[0][1].to_percentage
	puts "#{q1} #{q1_answer}%"


	###
	### Question 2
	###

	# Addressing question is where we start the heavy lifting of the day.
	# If we assume that our model is a Markov process, we assume that a user's history does not effect their future.
	# We want to know the probability of a given sequence, we simply multiple the probabilities together.

	q2_answer = (transition[1][1] * transition[1][1] * transition[1][1]).to_percentage
	puts "#{q2} #{q2_answer}%"


	###
	### Question 3
	###

	# The third question is very similar

	q3_answer = (transition[1][0] * transition[0][0] * transition[0][0] * transition[0][0] * transition[0][0]).to_percentage
	puts "#{q3} #{q3_answer}%"



	###
	### Question 4
	###

	# We use a variation for the fourth.

	q4_answer = (1/(1-transition[2][2]))
	puts "#{q4} #{q4_answer} days"


	###
	### Conclusion
	###

	# Using an assumed Markov property, it fairly easy to compute these simple probabilities.
	# The challenges come in:
	# 1) Actually creating a model from REAL data
	# 2) Comparing our model's results to occurances found in the real data set
	# 3) Computing more complex and meaningful questions
	#
	# I hope to take a stab at least one of these in Exercise #3.


	### Exercise 3

	# So we've looked at pulling transition numbers out of the air
	# But what if we wanted to actually to base those on something

	# Let's start with a sample size of 1...me
	# I'll use the twitter gem to pull down my last 6 months worth of tweets
	# Then we'll compare each day to the next to generate our transition numbers

	require 'rubygems' # I know...I know
	require 'active_support'
	require 'httparty'
	require 'json'

	# Downloads the markmcspadden timeline one page at a time and caches every
	# status' 'created_at' value (one per line) in a "my_statuses" file that sits
	# next to this script.
	#
	# Paging continues until the last status seen is older than six months, or
	# until a page comes back with no statuses at all.
	def fetch_statuses
	  all_statuses = []

	  cutoff = Time.now.advance(:months => -6)
	  last_updated_at = Time.now
	  page = 1

	  while last_updated_at > cutoff
	    puts "------------------"
	    puts "REQUEST FOR PAGE ##{page}"
	    puts "------------------"

	    my_timeline = "http://twitter.com/statuses/user_timeline/markmcspadden.json?page=#{page}"

	    response = HTTParty.get(my_timeline, :format => :json)
	    statuses = JSON.parse(response.body)

	    # An empty page never moves last_updated_at, so without this check the
	    # loop would keep requesting pages forever once the timeline runs out.
	    break if statuses.empty?

	    # Block parameters use plain pipes; the backslash-escaped form was not valid Ruby.
	    statuses.each do |status|
	      # Kernel#y is only defined inside irb on current Rubies, so dump the YAML explicitly.
	      puts status.to_yaml
	      puts status['created_at']

	      last_updated_at = status['created_at'].to_time

	      all_statuses << status
	    end

	    page += 1
	  end

	  # Cache the dates into a file
	  filename = File.expand_path(File.dirname(__FILE__) + "/my_statuses")
	  File.open(filename, "w") do |f|
	    all_statuses.each do |s|
	      f.puts s['created_at']
	    end
	  end
	end


	# If I haven't already fetched my twitter statuses, go get them
	# (File.exist? is used because File.exists? has been removed from current Rubies)
	statuses_file = File.expand_path(File.dirname(__FILE__) + "/my_statuses")
	fetch_statuses unless File.exist?(statuses_file)


	# Get how many tweets per day on days where I tweeted.
	# Keys are the string form of the start of each day; values are that day's tweet count.
	dates = ActiveSupport::OrderedHash.new
	File.open(statuses_file) do |file|
	  # each_line replaces IO#lines, which has been removed from current Rubies
	  file.each_line do |line|
	    # A blank line in the cache carries no timestamp; skip it
	    next if line.strip.empty?

	    day = line.to_time.beginning_of_day
	    dates["#{day}"] = (dates["#{day}"] || 0) + 1
	  end
	end

	# y dates

	# find_count reads the per-day counts through this instance variable
	@dates = dates

	# Now check for transitions over the period of the last 6 months
	# Returns the number of tweets recorded in @dates for the day containing
	# +date+. Days that are missing from @dates count as zero.
	def find_count(date)
	  date = date.to_time.beginning_of_day
	  # Plain `||` here; the backslash-escaped form was not valid Ruby.
	  @dates["#{date}"] || 0
	end
	# Classifies the day containing +date+ by its tweet count (as reported by find_count):
	#   "dormant" - no tweets
	#   "active"  - 1 to 10 tweets
	#   "super"   - more than 10 tweets
	def find_status(date)
	  case find_count(date)
	  when 0 then "dormant"
	  # The upper bound is 10 so that it agrees with the documented state
	  # definitions ('active' is between 1 and 10 tweets).
	  when 1..10 then "active"
	  else "super"
	  end
	end
	# Maps a status name to its state index in the transition matrix:
	# "dormant" => 0, "active" => 1, "super" => 2. Any other value yields nil.
	def find_status_code(status)
	  # `when x then y` replaces the `when x: y` form, which is a syntax error since Ruby 1.9
	  case status
	  when "dormant" then 0
	  when "active" then 1
	  when "super" then 2
	  end
	end


	now = Time.now.utc # The utc will get ya ;)
	# Seconds in the six month window divided by the seconds in one day
	# (60*60*24; the multiplication signs had been lost, leaving 606024).
	days = (now - now.advance(:months => -6))/(60*60*24)

	# Each entry is a [yesterday_code, today_code] pair for two consecutive days
	all_transitions = []

	(1..days).each do |i|
	  today_status = find_status(now.advance(:days => -i))
	  yesterday_status = find_status(now.advance(:days => -i-1))

	  # puts "#{yesterday_status} -> #{today_status}"

	  all_transitions << [find_status_code(yesterday_status), find_status_code(today_status)]
	end

	# Sort out the types of transitions: one bucket per "yesterday" state
	# (block parameters use plain pipes; the backslash-escaped form was not valid Ruby)
	dormant_transitions = all_transitions.select { |t| t[0] == 0 }
	active_transitions = all_transitions.select { |t| t[0] == 1 }
	super_transitions = all_transitions.select { |t| t[0] == 2 }

	# Go through each bucket and figure out its probabilities of moving to each state.
	# Row = yesterday's state, column = today's state.
	# A bucket with no observations divides 0 by 0.0, so its whole row comes out as NaN.
	transition = [dormant_transitions, active_transitions, super_transitions].map do |bucket|
	  (0..2).map do |i|
	    bucket.select { |t| t[1] == i }.size/bucket.size.to_f
	  end
	end

	# The heredoc body is kept at column 0 so the printed matrix carries no stray tabs
	transition_string =<<-EOS
Transition probabilities

[ #{transition[0][0]} #{transition[0][1]} #{transition[0][2]} ]
[ #{transition[1][0]} #{transition[1][1]} #{transition[1][2]} ]
[ #{transition[2][0]} #{transition[2][1]} #{transition[2][2]} ]

	EOS

	puts transition_string

	puts "Simple Probabability of Tweeting on a Given Day: #{active_transitions.size/days.to_f}"

	puts ""

	# When using our new numbers with our previous exercise we get something like

	# puts "=== Random Guessing"
	# require '2'

	puts ""

	puts "=== Sample of Mark McSpadden"
	t0 = [ 0.460526315789474, 0.539473684210526, 0.0 ]
	t1 = [ 0.403846153846154, 0.586538461538462, 0.00961538461538462 ]
	t2 = [ 0.0, 1.0, 0.0 ]

	# This is ugly...but I really don't want to mess with Classifying 2 right now
	q1 = "If a user is dormant today, what's the chance they'll be active tomorrow?"
	q2 = "If a user is active today, what's the chance they'll be active for the next 3 days?"
	q3 = "If a user is active today, what's the chance that they'll slip into being dormant for the next 5 days?"
	q4 = "What is the average number of days a user stays at the 'super' user level?"
	q1_answer = transition[0][1].to_percentage
	puts "#{q1} #{q1_answer}%"
	q2_answer = (transition[1][1] * transition[1][1] * transition[1][1]).to_percentage
	puts "#{q2} #{q2_answer}%"
	q3_answer = (transition[1][0] * transition[0][0] * transition[0][0] * transition[0][0] * transition[0][0]).to_percentage
	puts "#{q3} #{q3_answer}%"
	q4_answer = (1/(1-transition[2][2]))
	puts "#{q4} #{q4_answer} days"


	# So as we see, a sample of 1 is not great, especially in evaluating the super category
	# The next step will be to either do a sampling of our twitter friends or of the twitter population at large....
	# But that appears to be a fight for another day.