Skip to content

Instantly share code, notes, and snippets.

@mschueler
Created September 20, 2011 09:46
Show Gist options
  • Save mschueler/1228753 to your computer and use it in GitHub Desktop.
Save mschueler/1228753 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'rubygems'
require 'fileutils'
require 'open-uri'
require 'hpricot'
# A Trac repository scraper. Starting from the URL of a directory listing in
# a Trac source browser (normally the root of a repo), it saves every file
# linked from the listing and recurses into every linked sub-directory,
# mirroring the tree under a local directory.
#
# Example:
#   TracRepoScraper.new('https://trac.example.org/browser/trunk', 'out').start
class TracRepoScraper
  # Captures the "scheme://host[:port]" prefix of an http or https URL.
  # The hrefs in a listing are server-absolute paths, so they are appended
  # to this prefix rather than to the full listing URL.
  SERVER_PATTERN = %r{(https?://[^/]+)}i

  # trac_url::   URL of the directory listing to start from.
  # local_path:: directory to write into; defaults to the current directory.
  #
  # Raises ArgumentError when no http(s) server prefix can be found in
  # trac_url, because every later request is built from that prefix.
  def initialize(trac_url, local_path = '.')
    @trac_url = trac_url
    @trac_server = trac_url.to_s[SERVER_PATTERN, 1]
    raise ArgumentError, "not an http(s) URL: #{trac_url.inspect}" unless @trac_server

    @local_path = local_path
  end

  # Mirrors the listing page at +url+ into the directory +cur_localpath+:
  # first every anchor with class "file" is downloaded, then every anchor
  # with class "dir" is followed recursively.
  def getallfiles(url, cur_localpath)
    # mkdir_p tolerates an existing directory (re-runs) and missing parents.
    FileUtils.mkdir_p(cur_localpath)
    doc = Hpricot(fetch(url))

    doc.search("//tbody//tr//td//a[@class='file']").each do |file_anchor|
      name = entry_name(file_anchor)
      next unless name

      actual_file_url = raw_url(file_anchor['href'])
      target = File.join(cur_localpath, name)
      puts "Saving #{actual_file_url} to #{target}"
      # Binary mode plus a single write copies the bytes untouched; copying
      # line by line with puts would append a newline to files lacking one.
      File.open(target, 'wb') { |f| f.write(fetch(actual_file_url)) }
    end

    doc.search("//tbody//tr//td//a[@class='dir']").each do |dir_anchor|
      # The anchor text is the name of the sub-directory (relative).
      name = entry_name(dir_anchor)
      next unless name

      dir_url = @trac_server + dir_anchor['href']
      puts "*** stepping into #{dir_url}"
      getallfiles(dir_url, File.join(cur_localpath, name))
    end
  end

  # Scrapes the whole tree below the URL given to the constructor.
  def start
    getallfiles(@trac_url, @local_path)
  end

  private

  # Returns the body of +url+ as a string, closing the connection afterwards.
  # Goes through URI#open (provided by open-uri) instead of Kernel#open,
  # which no longer accepts URLs on Ruby 3.0+.
  def fetch(url)
    URI.parse(url).open { |remote| remote.read }
  end

  # Builds the raw-download URL for a server-absolute +href+, using "&" when
  # the href already carries a query string.
  def raw_url(href)
    separator = href.include?('?') ? '&' : '?'
    "#{@trac_server}#{href}#{separator}format=raw"
  end

  # Returns the anchor's text as a local file/directory name, or nil (with a
  # warning) when it is empty, ".", ".." or contains a path separator, so a
  # listing cannot make the scraper write outside the target directory.
  # NOTE(review): inner_html is used as-is; names containing HTML entities
  # (e.g. "&amp;") are presumably saved in their escaped form - confirm.
  def entry_name(anchor)
    name = anchor.inner_html.to_s.strip
    separators = [File::SEPARATOR, File::ALT_SEPARATOR].compact
    unsafe = name.empty? || name == '.' || name == '..' ||
             separators.any? { |sep| name.include?(sep) }
    return name unless unsafe

    warn "Skipping entry with unusable name: #{name.inspect}"
    nil
  end
end
#### main
# Command-line entry point: the first argument is the Trac URL to scrape,
# the optional second argument is the local directory to write into.
trac_url = ARGV[0].to_s.strip
# Without a URL there is nothing to scrape; print usage and exit non-zero
# instead of failing later with an unrelated error.
abort "usage: #{File.basename($PROGRAM_NAME)} TRAC_URL [LOCAL_PATH]" if trac_url.empty?

# A missing or blank second argument falls back to the current directory.
localpath = ARGV[1].to_s.strip
localpath = '.' if localpath.empty?

TracRepoScraper.new(trac_url, localpath).start
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment