@mmower
Created June 3, 2013 12:11
#!/usr/bin/env ruby -w
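#
# tweetcons.rb: pull pages of Twitter search results for a hashtag and write
# selected fields to a CSV file. An example invocation (the tag, page count,
# and delay here are illustrative):
#
#   ruby tweetcons.rb -t opentech -n 3 -d 10
#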
require 'csv'
require 'json'
require 'ostruct'
require 'open-uri'
require 'optparse'

# Parse command-line options into an OpenStruct. A tag is required; the
# output filename defaults to TAGNAME.csv.
def setup_options
  options = OpenStruct.new
  options.tag = nil
  options.pages = 1
  options.file = nil
  options.delay = 5

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: tweetcons.rb [options]"
    opts.separator ""
    opts.separator "Specific options:"

    opts.on( "-t", "--tag TAGNAME", "Tag to cons" ) do |tag|
      options.tag = tag
    end

    opts.on( "-n", "--numpages [NUMBER]", Integer, "Number of pages to retrieve (default: 1)" ) do |pages|
      options.pages = pages
    end

    opts.on( "-o", "--outfile [FILENAME]", "Output filename to write (default: TAGNAME.csv)" ) do |file|
      options.file = file
    end

    opts.on( "-d", "--delay [SECONDS]", Integer, "Number of seconds to pause between pages (default: 5)" ) do |delay|
      options.delay = delay
    end
  end

  parser.parse!

  if options.tag.nil?
    puts parser.to_s
    puts "Error: No TAGNAME specified"
    exit
  end

  if options.file.nil?
    options.file = "#{options.tag}.csv"
  end

  options
end

OPTIONS = setup_options

# Fields copied directly from each tweet record into the CSV output.
FIELDS = [
  "from_user_name",
  "created_at",
  "profile_image_url",
  "from_user_id",
  "text",
  "from_user",
  "iso_language_code",
]

# Build the Twitter search API URL for the given page of results.
def page_url( page )
  "http://search.twitter.com/search.json?q=#{OPTIONS.tag}&include_entities=true&with_twitter_user_id=true&result_type=mixed&rpp=100&page=#{page}"
end

# Comma-separated list of hashtags attached to a tweet.
def extract_tags( tweet )
  tweet['entities']['hashtags'].map { |tagging| tagging['text'] }.join(",")
end

# Comma-separated list of expanded URLs attached to a tweet.
def extract_links( tweet )
  tweet['entities']['urls'].map { |link| link['expanded_url'] }.join(",")
end

# Keep only the FIELDS keys from a tweet and append derived tags/links columns.
def process_tweet( tweet )
  tweet.select { |k,v| FIELDS.include?( k ) }.
    merge( { "tags" => extract_tags( tweet ) } ).
    merge( { "links" => extract_links( tweet ) } )
end
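
# For illustration, an invented search-result record (field values are
# hypothetical) and the flattened hash process_tweet builds from it:
#
#   tweet = {
#     "from_user" => "example_user",
#     "text"      => "Testing #ruby http://t.co/abc123",
#     "entities"  => {
#       "hashtags" => [ { "text" => "ruby" } ],
#       "urls"     => [ { "expanded_url" => "http://example.com/" } ]
#     }
#   }
#
#   process_tweet( tweet )
#   # => { "from_user" => "example_user",
#   #      "text"      => "Testing #ruby http://t.co/abc123",
#   #      "tags"      => "ruby",
#   #      "links"     => "http://example.com/" }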

# Append every processed tweet from a page of results to the accumulator.
def process_page( tweets, number, json )
  json['results'].inject( tweets ) do |acc,tweet|
    acc << process_tweet( tweet )
  end
end

tweets = []

# Fetch each page in turn, pausing between requests.
puts "Grabbing #{OPTIONS.pages} pages of the hashtag #{OPTIONS.tag}"
(OPTIONS.pages).times do |page_number|
  puts "Getting page #{page_number+1}"
  url = page_url( page_number+1 )
  puts "\tURL=#{url}"
  open( url ) do |f|
    puts "\tProcessing tweets"
    tweets = process_page( tweets, page_number, JSON.parse( f.read ) )
  end
  puts "\tPausing..."
  sleep OPTIONS.delay
end

# Write one fully quoted CSV row per tweet.
puts "Writing to #{OPTIONS.file}"
CSV.open( "#{OPTIONS.file}", "wb", { :force_quotes => true } ) do |csv|
  tweets.each do |tweet|
    csv << tweet.values
  end
end

puts "Done"