Created
September 17, 2012 18:46
-
-
Save rob-murray/3739021 to your computer and use it in GitHub Desktop.
Basic script to scrape Twitter for tweets matching a custom filter.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "twitter"
require "logger"
require "csv"
require "geocoder"

# Scrapes one page of the "mr_correcter" timeline and writes it to a CSV file.
#
# For every tweet the script records the tweet itself and, when the tweet is a
# reply, the tweet that was replied to together with a best-effort location
# for it (the replied-to tweet's place, falling back to the profile location
# of the user that was replied to).
#
# CSV columns, in order:
#   mr_c tweet id, mr_c tweet text, reply_to tweet id, reply_to user id,
#   reply_to username, reply_to tweet text, reply lon, reply lat,
#   reply place ("city, country"), reply user's profile location
#
# Rows for tweets that are not replies only carry the first five columns.
# When a lookup fails, an "Error: ..." cell is appended in place of the
# columns that could not be filled in.
#
# The page number to fetch is read from a small state file and the next page
# number is written back at the end of the run, so each invocation fetches
# the following page.

# Formats a geocoder result as "city, country".
# Returns nil when the geocoder produced no result.
def place_label(geo)
  "#{geo.city}, #{geo.country}" if geo
end

# Builds the four location columns for a replied-to tweet.
#
# reply         - the status object returned by Twitter.status
# reply_user_id - id of the user that was replied to
#
# Returns [lon, lat, place, profile_location]; entries that could not be
# determined are nil, so every reply row has the same column layout.
def reply_location_columns(reply, reply_user_id)
  if reply.place
    # First vertex of the place's bounding box, read as [lon, lat].
    lon, lat = reply.place.bounding_box.coordinates[0][0]
    geo = Geocoder.search("#{lat},#{lon}").first
    return [lon, lat, place_label(geo), nil]
  end

  profile_location = Twitter.user(reply_user_id).location
  return [nil, nil, nil, nil] unless profile_location

  geo = Geocoder.search(profile_location.to_s).first
  [geo && geo.longitude, geo && geo.latitude, place_label(geo), profile_location.to_s]
end

# Get the page to fetch; File.binread closes the handle for us.
file_path = '/root/scripts/mr-correcter/page.tmp'
PAGE = Integer(File.binread(file_path))

options = {
  :page => PAGE.to_s,
  :include_entities => true,
  :include_rts => true,
  :exclude_replies => false,
  :count => 25
}

puts "Init with page #{PAGE}"

# Roughly 246 tweets so 3 pages of 100, 6 of 50, 12 of 25, etc.
# The page actually requested comes from options[:page] (the state file);
# this list only controls how many requests are made per run.
pages = [1]
tweets = pages.flat_map { |_page| Twitter.user_timeline("mr_correcter", options) }

puts "Downladed #{tweets.count} tweets"
puts "Processing..."

header = [
  "mr_c-tweet_id", "mr_c-tweet_text", "reply_to-tweet id", "reply_to-user_id",
  "reply_to-username", "reply_to-tweet_text", "reply_long", "reply_lat",
  "reply_geo_loc", "reply_user_location"
]

# Options are passed as keywords; a positional options hash is rejected by
# CSV.open on Ruby 3.
CSV.open("tw_#{PAGE}.csv", "wb", :col_sep => ",") do |csv|
  csv << header

  tweets.each do |t|
    row = [
      t.id.to_s,
      t.text.to_s,
      t.in_reply_to_status_id.to_s,
      t.in_reply_to_user_id.to_s,
      t.in_reply_to_screen_name.to_s
    ]

    if t.in_reply_to_status_id
      begin
        reply = Twitter.status(t.in_reply_to_status_id)
        row.push(reply.text.to_s)
        row.concat(reply_location_columns(reply, t.in_reply_to_user_id))
      rescue StandardError => e
        # Best effort: a failed lookup (deleted tweet, rate limit, geocoder
        # error) is recorded in the row instead of aborting the whole run.
        # StandardError keeps Ctrl-C and exit working.
        row.push("Error: #{e.message}")
        puts "Error: #{e.message}"
      end
    end

    csv << row
  end
end

#puts Twitter.status(227376948400230400).place.bounding_box.coordinates
puts "done."

# Remember which page the next run should fetch.
next_page = PAGE + 1
File.write(file_path, next_page.to_s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
That's not pretty ;)