Skip to content

Instantly share code, notes, and snippets.

@rampion
Created December 8, 2012 01:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rampion/4238080 to your computer and use it in GitHub Desktop.
Save rampion/4238080 to your computer and use it in GitHub Desktop.
quick and dirty ruby solution
#!/usr/bin/env ruby -I .
require 'file_status'
# Inverts a scraped "category => ranked entries" listing into
# "entry => categories it was listed under".
#
# ARGV[0] - input file: a category line followed by lines of the form
#           "N. url"
# ARGV[1] - output file: each url followed by one "(index/max) category"
#           line for every listing it appeared in
#
# url => [[category, zero-based index in the listing, listing length], ...]
per_url = Hash.new { |h, k| h[k] = [] }

File.open(ARGV[0], 'r:UTF-8') do |f|
  FileStatus.status("processing scrapings", :file => f) do |update|
    update[]
    line = f.gets
    while line
      # Any line that is not a ranked entry is taken as a category heading.
      cat = line.chomp
      urls = []
      # Collect the ranked entries that follow the heading. The dot after the
      # rank is escaped so that a heading starting with digits and a space
      # (e.g. "2012 Olympics") is not mistaken for an entry. When the loop
      # ends, `line` holds the next heading, or nil at end of file.
      while (line = f.gets) && (entry = /^\d+\. (.*)$/.match(line))
        urls << entry[1]
      end
      max = urls.length
      urls.each_with_index do |url, index|
        per_url[url] << [cat, index, max]
      end
      update[]
    end
  end
end

File.open(ARGV[1], 'w:UTF-8') do |f|
  per_url.each do |url, tuples|
    f.puts url
    tuples.each do |(cat, index, max)|
      f.puts "(#{index}/#{max}) #{cat}"
    end
  end
end
require 'rubygems'
require 'ruby-progressbar'
module FileStatus
  STATUS_FORMAT = "%t: [%B] (%a, %E)"

  # Simple status display for processing input files.
  #
  # While the block runs, CTRL-C interrupts the processing (the next progress
  # check aborts the block) but lets the script continue; a second CTRL-C
  # exits the program.
  #
  # prefix  - title displayed on the progress bar
  # options
  #   :file  - file object; progress is its current position (tell) relative
  #            to its size
  #   :total - number of progress checks to make; used when :file is absent,
  #            each check advances the bar by one
  #
  # yields
  #
  #   progress - proc to call at each progress check point
  #
  # returns boolean
  #   true iff processing was interrupted
  def self.status(prefix, options)
    interrupted = false
    prev = Signal.trap("INT") do
      exit if interrupted # double CTRL-C to quit full program
      interrupted = true
    end

    begin
      # Declared before the lambdas so they close over the bar created below.
      prog = nil

      if options[:file]
        # if using a file, check the location relative to the total size
        file = options[:file]
        total = file.stat.size
        callback = lambda do
          prog.progress = file.tell
          throw :interrupted if interrupted
        end
      else
        # if given an absolute total, just increment
        total = options[:total]
        callback = lambda do
          # `progress` is a plain Integer; the bar itself is what increments.
          prog.increment
          throw :interrupted if interrupted
        end
      end

      prog = ProgressBar.create(title: prefix, format: STATUS_FORMAT, total: total)
      catch(:interrupted) { yield callback }
      prog.stop
    ensure
      # restore previous handler, even when the block raises
      Signal.trap("INT", prev)
    end

    # return value
    interrupted
  end
end
% /alexa_invert.rb alexa_scrape.txt alexa_invert.txt
processing scrapings: [=======================================================================================================================================================] (Time: 00:11:02, Time: 00:11:02)
%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment