Created
December 4, 2011 16:10
-
-
Save dmdeller/1430558 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# frozen_string_literal: true

# CGI proxy for bostonglobe.com: fetches a page while identifying as
# Googlebot, rewrites asset and article links in the returned HTML, and
# serves the result.

# Equivalent of the `-w` switch. The switch cannot live on the shebang line
# portably: Linux hands everything after the interpreter path to `env` as a
# single argument, so `env "ruby -w"` fails to find the interpreter.
$VERBOSE = true

require 'cgi'
require 'net/http'
require 'uri'

# Method 1:
# Rewrite article links to Google 'I'm Feeling Lucky' search.
# Site will show article content if referer(sic) is Google.
# Downside: breaks navigation (other links on site lead back to original bostonglobe.com)
REWRITE_METHOD = 'google'

# Method 2:
# Rewrite all links back to this proxy.
# Site will show article content if user-agent is Googlebot.
# Downside: breaks some JavaScript, maybe some layout.
#REWRITE_METHOD = 'proxy'

BASE_URL = 'http://www.bostonglobe.com'
IM_FEELING_LUCKY_URL = 'http://www.google.com/search?q=%s+site:bostonglobe.com&btnI=3564'
GOOGLEBOT_USER_AGENT = 'Googlebot/2.1 (+http://www.googlebot.com/bot.html)'

# Add back fonts CSS... JavaScript is supposed to do it, but it fails due to
# relative URLs that we didn't catch.
FONTS_CSS_URL = 'http://www.bostonglobe.com/css/globe-fonts.css,globe-comments.css'
FONTS_CSS_TAG = format('<link rel="stylesheet" type="text/css" href="%s" />', FONTS_CSS_URL).freeze

# Capture 1: everything up to and including the opening quote of a
# <link href>, <img src> or <script src> attribute. Capture 2: the URL.
# The lazy `[^>]*?\s` picks the real attribute rather than a later
# look-alike such as `data-src`.
ASSET_URL_PATTERN = /(<(?:link\b[^>]*?\shref|(?:img|script)\b[^>]*?\ssrc)\s*=\s*['"])([^'"]+)/i

# Article links of the form .../<slug>/<id>/story.html; capture 3 is <slug>.
STORY_LINK_PATTERN = %r{(<a\b[^>]+href=["'])([^'"]+/)([^/'"]+)(/[^/'"]+/story\.html)}

# Root-relative links ("/foo"). Protocol-relative links ("//host/foo") are
# excluded because they point at another host, not at a path on this site.
PROXY_LINK_PATTERN = %r{(<a\b[^>]+href=["'])(/(?!/)[^'"]*)}

# Builds the upstream URL for a site-relative path.
#
# @param path [String, nil] requested path; leading slashes are ignored
# @return [String] BASE_URL followed by "/" and the path
def upstream_url(path)
  "#{BASE_URL}/#{path.to_s.sub(%r{\A/+}, '')}"
end

# Turns one URL reference found in the page into an absolute URL.
#
# @param ref [String] attribute value as found in the HTML
# @param base_url [String] site root, without a trailing slash
# @return [String] absolute URL
def absolutize(ref, base_url = BASE_URL)
  case ref
  when /\A[a-z][a-z0-9+.-]*:/i then ref # already has a scheme (http:, https:, data:, ...)
  when %r{\A//} then "http:#{ref}"      # protocol-relative: only the scheme is missing
  when %r{\A/} then base_url + ref
  else "#{base_url}/#{ref}"             # approximation: resolved against the site root, not the page directory
  end
end

# Rewrites stylesheet, image and script URLs to absolute URLs.
#
# @param html [String] page markup
# @param base_url [String] site root used for relative references
# @return [String] new string with rewritten URLs
def absolutize_asset_urls(html, base_url = BASE_URL)
  html.gsub(ASSET_URL_PATTERN) do
    Regexp.last_match(1) + absolutize(Regexp.last_match(2), base_url)
  end
end

# Inserts the fonts stylesheet tag just before the first </head>.
#
# @param html [String] page markup
# @return [String] new string; unchanged content if there is no </head>
def add_fonts_stylesheet(html)
  html.sub(%r{</head>}i) { FONTS_CSS_TAG + Regexp.last_match(0) }
end

# Method 1: points story links at a Google "I'm Feeling Lucky" search for
# the article slug.
#
# @param html [String] page markup
# @return [String] new string with rewritten story links
def rewrite_links_for_google(html)
  html.gsub(STORY_LINK_PATTERN) do
    Regexp.last_match(1) + format(IM_FEELING_LUCKY_URL, Regexp.last_match(3))
  end
end

# Method 2: points root-relative links back at this proxy.
#
# @param html [String] page markup
# @param my_url [String] URL of this script, without a query string
# @return [String] new string with rewritten links
def rewrite_links_for_proxy(html, my_url)
  html.gsub(PROXY_LINK_PATTERN) do
    # Drop the leading "/" so the value matches what upstream_url expects.
    "#{Regexp.last_match(1)}#{my_url}?path=#{CGI.escape(Regexp.last_match(2)[1..-1])}"
  end
end

# Applies the link rewriting selected by REWRITE_METHOD.
#
# @param html [String] page markup
# @param my_url [String] URL of this script (used by the 'proxy' method)
# @return [String] rewritten markup
# @raise [RuntimeError] if REWRITE_METHOD is neither 'google' nor 'proxy'
def rewrite_article_links(html, my_url)
  case REWRITE_METHOD
  when 'google' then rewrite_links_for_google(html)
  when 'proxy' then rewrite_links_for_proxy(html, my_url)
  else raise 'unknown REWRITE_METHOD'
  end
end

# Fetches a page from the upstream site with the Googlebot user agent.
#
# @param url [URI::HTTP] page to fetch
# @return [Net::HTTPResponse]
def fetch_page(url)
  Net::HTTP.start(url.host, url.port) do |http|
    # request_uri keeps the query string, which url.path would drop.
    http.get(url.request_uri, { 'User-Agent' => GOOGLEBOT_USER_AGENT })
  end
end

# Handles one CGI request: fetch, rewrite, respond.
#
# @param cgi [CGI] current request; its 'path' parameter selects the page
# @return [void]
def serve_request(cgi)
  my_url = "http://#{ENV.fetch('SERVER_NAME')}#{ENV.fetch('REQUEST_URI').split('?').first}"

  begin
    url = URI.parse(upstream_url(cgi['path']))
  rescue URI::InvalidURIError
    # 'path' comes straight from the client; answer malformed input instead of crashing.
    cgi.out('status' => 'BAD_REQUEST', 'type' => 'text/plain') { 'Invalid path' }
    return
  end

  response = fetch_page(url)
  unless response.code == '200'
    cgi.out { format('Response %s received from server', response.code) }
    return
  end

  # HACKITY HACK HACK!
  html = add_fonts_stylesheet(absolutize_asset_urls(response.body))
  html = rewrite_article_links(html, my_url)
  cgi.out('charset' => 'utf-8') { html }
end

if ENV['SERVER_NAME'] && ENV['REQUEST_URI']
  serve_request(CGI.new)
else
  warn 'This script must be run as a CGI program (SERVER_NAME and REQUEST_URI are not set).'
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment