Created
January 6, 2012 09:45
-
-
Save plindberg/1569898 to your computer and use it in GitHub Desktop.
The script that powers fleecelabs.se/jaikuslurp/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See this Gist for how to run this: https://gist.github.com/1562129

# The Jaiku username is required as the first positional argument.
# The test uses double quotes on purpose: inside single quotes, $1 is a
# literal (and therefore never empty) string, so the check could never fire.
if [ -z "$1" ]; then
  echo 'Run me again, but with -s followed by your Jaiku username at the end.'
  echo 'Like so: bash < <(curl -s fleecelabs.se/jaikuslurp/) -s plindberg'
  # A missing argument is a usage error, so report failure to the caller.
  exit 1
fi

USERNAME=$1
echo "Hey @$USERNAME, I am jaikuslurp. Let's get your jaiks archived."
echo 'Should things go wrong, let @plindberg know on Twitter.'
# Exit/signal handler: puts GEM_HOME back to the value that was saved before
# it was pointed at the throwaway gem directory, then exits with the status
# the script was finishing with.
teardown() {
  # Capture the status first; every later command overwrites $?.
  local status=$?
  # Disarm the traps so the `exit` below does not re-enter this handler.
  trap - EXIT INT TERM
  # Only restore when a previous value was actually saved.
  if [ -n "$SAVED_GEM_HOME" ]; then
    export GEM_HOME="$SAVED_GEM_HOME"
  fi
  exit "$status"
}
trap teardown EXIT INT TERM

# Make sure the mechanize gem is available. If it is not listed among the
# currently visible gems, remember the current GEM_HOME, switch to a throwaway
# gem directory under /tmp, and install mechanize there unless it is already
# listed in that directory.
if ! gem li | grep -q mechanize; then
  # Double quotes so the VALUE of GEM_HOME is saved, not the literal text.
  SAVED_GEM_HOME="$GEM_HOME"
  export GEM_HOME=/tmp/jaikuslurp_gems
  if ! gem li | grep -q mechanize; then
    echo 'First, I need to install some libraries, hang on for a bit...'
    echo '(No worries, I install them under /tmp.)'
    gem i mechanize
  fi
fi
# Crawl the user's Jaiku pages with Mechanize. For every page, each
# "presence" link is rewritten to the matching *.jaikuarchive.com host and
# fetched once; then the "Older" pagination link is followed. The URI of the
# page being processed is written to a state file under /tmp so that a later
# run starts from that page instead of from the beginning.
#
# The username is handed over through the environment and the heredoc is
# quoted, so the shell never splices user input into the Ruby source.
JAIKUSLURP_USERNAME="$USERNAME" ruby <<'RUBY'
require 'rubygems'
require 'mechanize'

username = ENV.fetch('JAIKUSLURP_USERNAME')
statefile = "/tmp/jaikuslurp-#{username}-state.txt"

# Resume from the saved page when a state file exists, else start at the
# user's front page. (File.exist? -- File.exists? was removed in Ruby 3.2.)
start_uri = if File.exist?(statefile)
  File.read(statefile)
else
  "http://#{username}.jaiku.com"
end

agent = Mechanize.new
agent.read_timeout = 5
page = agent.get(start_uri)

# Stop as soon as a fetch yields something that is not an HTML page.
while Mechanize::Page === page
  # Record progress before doing any work on this page.
  File.open(statefile, 'w') { |f| f.write(page.uri) }
  puts "--- On #{page.uri}"

  # Seconds already spent sleeping between fetches on this page.
  delay = 0

  # Presence links, except those whose path continues with "last".
  page.links_with(:href => /\.jaiku\.com\/presence\/(?!last)/).each do |link|
    uri = link.uri
    # foo.jaiku.com -> foo.jaikuarchive.com
    uri.host = uri.host.gsub(/(\.jaiku)(?=\.com$)/, '\1archive')
    uri.fragment = nil
    next if agent.visited?(uri)

    puts uri
    begin
      agent.get(uri)
    rescue Mechanize::ResponseCodeError => e
      # A failing item is reported and skipped; the crawl continues.
      $stderr.puts "Error #{e.response_code} on #{uri}"
    end
    sleep 0.5
    delay += 0.5
  end

  next_page_link = page.link_with(:text => /Older/, :href => /offset/)
  break unless next_page_link

  # Wait out the rest of a 10 second window between listing pages
  # (is what jaiku.com/robots.txt says).
  sleep(10 - delay) if delay < 10
  page = next_page_link.click
end

puts 'Looks like I am done. Thanks!'
RUBY
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment