public
Created

Quick and dirty Ruby solution

  • Download Gist
alexa_invert.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
#!/usr/bin/env ruby -I .
require 'file_status'
 
# Invert a category scrape.
#
# The input file (ARGV[0]) is a sequence of records: one category line
# followed by zero or more ranked lines of the form "<digits>. <url>".
# The output file (ARGV[1]) lists each URL once, followed by one line per
# category it appeared in: "(<index>/<count>) <category>", where <index> is
# the URL's zero-based position within that category and <count> is the
# number of URLs listed for the category.
#
# usage: alexa_invert.rb SCRAPE_FILE OUTPUT_FILE
if ARGV.length < 2
  abort "usage: #{File.basename($PROGRAM_NAME)} SCRAPE_FILE OUTPUT_FILE"
end

# A ranked entry such as "12. example.com". The dot is escaped so that only
# a literal "." after the rank is accepted (an unescaped dot would match any
# character).
ranked_url = /^\d+\. (.*)$/

# url => [[category, index, count], ...] in order of appearance
per_url = Hash.new { |h, k| h[k] = [] }

File.open(ARGV[0], 'r:UTF-8') do |f|
  FileStatus.status("processing scrapings", :file => f) do |update|
    update.call
    line = f.gets
    while line
      cat = line.chomp

      # Consume ranked lines; the first line that is not a ranked entry is
      # kept in `line` as the next category (or nil at end of file).
      urls = []
      while (line = f.gets) && (entry = ranked_url.match(line))
        urls << entry[1]
      end

      max = urls.length
      urls.each_with_index do |url, index|
        per_url[url] << [cat, index, max]
      end
      update.call
    end
  end
end

File.open(ARGV[1], 'w:UTF-8') do |f|
  per_url.each do |url, tuples|
    f.puts url
    tuples.each do |(cat, index, max)|
      f.puts "(#{index}/#{max}) #{cat}"
    end
  end
end
file_status.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
require 'rubygems'
require 'ruby-progressbar'
 
module FileStatus
  # ruby-progressbar format string: title, bar, elapsed time, estimated time.
  STATUS_FORMAT = "%t: [%B] (%a, %E)".freeze

  # Simple status display for processing input files.
  #
  # While the block runs, CTRL-C interrupts input processing (the block is
  # abandoned at its next progress check, but the script continues). A second
  # CTRL-C exits the whole program.
  #
  # prefix  - title to display in front of the progress bar
  # options
  #   :file  - file object to use for progress checks; progress is the
  #            file's current position relative to its size
  #   :total - number of progress checks to make; used only when :file is
  #            not given, in which case each check advances the bar by one
  #
  # yields
  #
  #   progress - proc to call at each progress check point
  #
  # returns boolean
  #   true iff file processing was interrupted
  def self.status(prefix, options)
    interrupted = false
    prev = Signal.trap("INT") do
      exit if interrupted # double CTRL-C to quit full program
      interrupted = true
    end

    prog = nil
    begin
      if options[:file]
        # if using a file, check the location relative to the total
        file = options[:file]
        total = file.stat.size
        callback = lambda do
          # Clamp to the size sampled above: if the file grew in the meantime
          # the position could exceed the total, which the progress bar is
          # expected to reject.
          prog.progress = [file.tell, total].min
          throw :interrupted if interrupted
        end
      else
        # if given an absolute total, just increment
        total = options[:total]
        callback = lambda do
          prog.increment
          throw :interrupted if interrupted
        end
      end

      prog = ProgressBar.create(title: prefix, format: STATUS_FORMAT, total: total)
      catch(:interrupted) { yield callback }

      # return value
      interrupted
    ensure
      # Runs even when the block raises, so the bar is always stopped and the
      # caller's previous INT handler is always restored.
      prog.stop if prog
      Signal.trap("INT", prev)
    end
  end
end
run
1 2 3
% ./alexa_invert.rb alexa_scrape.txt alexa_invert.txt
processing scrapings: [=======================================================================================================================================================] (Time: 00:11:02, Time: 00:11:02)
%

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.