Created
September 15, 2012 20:24
-
-
Save zQueal/3729595 to your computer and use it in GitHub Desktop.
Twitter Words of Interest Scrape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# This script takes a list of twitter usernames or search terms and generates a
# word list based on them. For usernames it requests the last 500 tweets from
# that user, for a search term it requests 500 tweets including that term.
#
# The script is based on an original idea from the
# "7 Habits of Highly Effective Hackers" blog
# http://7habitsofhighlyeffectivehackers.blogspot.com.au/2012/05/using-twitter-to-build-password.html
#
# Author:: Robin Wood (robin@digininja.org)
# Copyright:: Copyright (c) Robin Wood 2012
# Licence:: Creative Commons Attribution-Share Alike 2.0
#
require 'rubygems'
require 'json'
require 'net/http'
require 'uri'
require 'getoptlong'
# Command-line options accepted by twofi. Each entry is
# [long name, short alias, argument requirement].
opts = GetoptLong.new(
  ['--help', '-h', GetoptLong::NO_ARGUMENT],
  ['--count', '-c', GetoptLong::NO_ARGUMENT],
  ['--min_word_length', '-m', GetoptLong::REQUIRED_ARGUMENT],
  ['--term_file', '-T', GetoptLong::REQUIRED_ARGUMENT],
  ['--terms', '-t', GetoptLong::REQUIRED_ARGUMENT],
  ['--user_file', '-U', GetoptLong::REQUIRED_ARGUMENT],
  ['--users', '-u', GetoptLong::REQUIRED_ARGUMENT],
  ['--verbose', '-v', GetoptLong::NO_ARGUMENT]
)
# Prints the twofi help text to standard output and then terminates the
# script.
#
# This method never returns normally: it always finishes by calling +exit+.
def usage
  # The --terms / --users descriptions were previously swapped; --terms takes
  # search terms and --users takes usernames.
  puts 'twofi 1.0 Robin Wood (robin@digininja.org) (www.digininja.org)
twofi - Twitter Words Of Interest
Usage: twofi [OPTIONS]
--help, -h: show help
--count, -c: include the count with the words
--min_word_length, -m: minimum word length
--term_file, -T file: a file containing a list of terms
--terms, -t: comma separated search terms
quote words containing spaces, no space after commas
--user_file, -U file: a file containing a list of users
--users, -u: comma separated usernames
quote words containing spaces, no space after commas
--verbose, -v: verbose
'
  exit
end
# Runs a query against the Twitter search endpoint and returns the decoded
# JSON response.
#
# query   - the search string, sent as the +q+ parameter.
# results - the value sent as the +rpp+ parameter (defaults to 500).
#
# Returns whatever JSON.parse produces for the response body. The HTTP status
# is not inspected, so a body that is not valid JSON makes JSON.parse raise.
def twitter_search(query, results = 500)
  # encode_www_form escapes every reserved character (including "&", "=" and
  # "+"), so a search term can no longer leak into or truncate the query
  # string the way it could with URI.encode.
  params = URI.encode_www_form([['q', query], ['rpp', results]])
  uri = URI.parse("http://search.twitter.com/search.json?#{params}")
  resp = Net::HTTP.get_response(uri)
  JSON.parse(resp.body)
end
# Settings with their defaults; the option loop below overrides them.
terms = []            # search terms, including "from:<user>" entries
min_word_length = 3   # words shorter than this are dropped from the list
show_count = false    # when true, print each word's count next to it

# Turns a username (with or without a leading "@") into a "from:" search term.
user_term = lambda { |name| 'from:' + name.chomp.sub(/\A@/, '') }

# Reads a list file and returns its non-blank lines with line endings removed.
# File.foreach closes the file when iteration ends, so no handle is leaked.
# If the file cannot be read, reports which kind of file failed and exits.
read_list = lambda do |path, label|
  begin
    File.foreach(path).map(&:chomp).reject { |line| line.strip.empty? }
  rescue SystemCallError, IOError
    puts "Unable to read the #{label} file\n"
    exit
  end
end

begin
  opts.each do |opt, arg|
    case opt
    when '--count'
      show_count = true
    when '--help'
      usage
    when '--user_file'
      terms.concat(read_list.call(arg, 'users').map(&user_term))
    when '--term_file'
      terms.concat(read_list.call(arg, 'terms'))
    when '--terms'
      terms.concat(arg.split(','))
    when '--users'
      terms.concat(arg.split(',').map(&user_term))
    when '--min_word_length'
      min_word_length = arg.to_i
      usage if min_word_length < 1
    when '--verbose'
      # Accepted so that -v is not rejected as an unknown option; nothing in
      # this script currently produces verbose output.
    end
  end
rescue StandardError
  # Any failure while processing the options (for example an unknown option)
  # falls back to printing the help text. `exit` raises SystemExit, which is
  # not a StandardError, so deliberate exits above are not caught here.
  usage
end
# Nothing to search for: explain why and show the help text (usage exits).
if terms.empty?
  puts 'You must specify at least one search term or username'
  puts
  usage
end

# Collect the tweets for every term. A response without a "results" key
# contributes nothing instead of aborting the run with a TypeError.
results = []
terms.each do |term|
  data = twitter_search(term, 500)
  results.concat(data['results'] || [])
end

if results.empty?
  puts 'No search results'
else
  # word => number of occurrences across all collected tweets
  wordlist = Hash.new(0)

  results.each do |result|
    # Replace every character that is neither a word character nor whitespace
    # with a space (digits and the plain space are already covered by \w and
    # \s). to_s tolerates a result with no text, and the non-destructive gsub
    # leaves the parsed response untouched.
    text = result['text'].to_s.gsub(/[^\w\s]/, ' ')

    text.split(/\s/).each do |word|
      # Skip empty fragments and words shorter than the requested minimum.
      next if word.empty? || word.length < min_word_length

      wordlist[word] += 1
    end
  end

  # Most frequent words first.
  wordlist.sort_by { |_word, count| -count }.each do |word, count|
    puts(show_count ? "#{word}, #{count}" : word)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment