Skip to content

Instantly share code, notes, and snippets.

@dgutov
Created April 20, 2015 14:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dgutov/addc25762453c5a205ea to your computer and use it in GitHub Desktop.
Save dgutov/addc25762453c5a205ea to your computer and use it in GitHub Desktop.
Tabs or Spaces
# Gemfile for the tabs-vs-spaces survey script.
#
# The gem source is spelled out as an HTTPS URL: the :rubygems symbol
# shorthand is deprecated by Bundler and resolves to plain-HTTP rubygems.org.
source 'https://rubygems.org'

# Only needed for the interactive debugging hook in the script.
group :development do
  gem 'pry'
end

gem 'yajl-ruby'
gem 'em-synchrony'
gem 'em-http-request'
GEM
remote: http://rubygems.org/
specs:
addressable (2.3.7)
coderay (1.1.0)
cookiejar (0.3.2)
em-http-request (1.1.2)
addressable (>= 2.3.4)
cookiejar
em-socksify (>= 0.3)
eventmachine (>= 1.0.3)
http_parser.rb (>= 0.6.0)
em-socksify (0.3.0)
eventmachine (>= 1.0.0.beta.4)
em-synchrony (1.0.4)
eventmachine (>= 1.0.0.beta.1)
eventmachine (1.0.7)
http_parser.rb (0.6.0)
method_source (0.8.2)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
slop (3.6.0)
yajl-ruby (1.2.1)
PLATFORMS
ruby
DEPENDENCIES
em-http-request
em-synchrony
pry
yajl-ruby
require 'open-uri'
require 'zlib'
require 'yajl'
require 'pry'
require 'ostruct'
require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/fiber_iterator'
# Running totals keyed by file extension. For each extension the script
# accumulates how many added lines started with a tab and how many started
# with eight spaces.
$results = {}
%w(c cpp js java el).each do |ext|
  $results[ext] = {tabs: 0, spaces: 0}
end

# Matches a path that ends in one of the tracked extensions and captures
# the extension (without the dot) in group 1.
$ext_re = /\.(#{$results.keys.join("|")})\z/
# Adds one file's indentation counts to the per-extension totals in $results.
#
# path      - path of the file inside the repository (may be nil)
# lines     - the file's added lines, already stripped of the leading "+"
# repo_name - "owner/repo"; only used to build the logged URL
# sha       - commit SHA; only used to build the logged URL
#
# Returns early (nil) when there is no path, no added lines, or the path's
# extension is not matched by $ext_re. Otherwise it prints one line with the
# file's raw URL and its counts, then bumps the totals for that extension.
def process_lines(path, lines, repo_name, sha)
  return if !path || lines.none?

  ext = path[$ext_re, 1]
  return unless ext

  # Classify each line once; the eight-space check takes precedence over
  # the tab check, and lines matching neither are ignored.
  kinds = lines.map do |line|
    if line =~ /^ {8}/
      :spaces
    elsif line =~ /^\t/
      :tabs
    end
  end
  space_count = kinds.count(:spaces)
  tab_count = kinds.count(:tabs)

  puts "https://raw.githubusercontent.com/#{repo_name}/#{sha}/#{path}: spaces #{space_count}, tabs #{tab_count}"

  totals = $results[ext]
  totals[:spaces] += space_count
  totals[:tabs] += tab_count
end
# Prints a 60-character text progress bar followed by "current/total".
#
# current - number of items finished so far
# total   - number of items overall; may be 0
#
# A total of 0 (nothing to do) is drawn as a full bar instead of raising
# ZeroDivisionError. The filled width is clamped to the bar size so that an
# out-of-range current can never produce a negative padding width.
def output_progress(current, total)
  size = 60
  ss = total.zero? ? size : current * size / total
  ss = [[ss, 0].max, size].min
  print "[#{'=' * ss}#{' ' * (size - ss)}] #{current}/#{total}\n"
end
# Tallies indentation style for every commit pushed in one archive file.
#
# name - path of a gzipped dump of JSON events
#
# Steps:
#   1. Decompress the file and collect a (repo_name, sha) pair for every
#      commit listed in a "PushEvent".
#   2. Download "<repo>/commit/<sha>.diff" from github.com for each pair,
#      with up to 30 requests in flight, printing timing and progress.
#   3. Hand each diff to tally_diff, which updates $results via process_lines.
#
# The running totals in $results are printed when the method exits, whether
# it completed or raised.
#
# Uses em-synchrony's fiber iterator and HTTP client, so it is meant to be
# called from inside an EM.synchrony block.
def process_file(name)
  puts "Processing #{name}..."
  # Block form closes both the gzip stream and the underlying file as soon as
  # the contents are read, instead of leaking a handle per archive.
  js = Zlib::GzipReader.open(name) { |gz| gz.read }
  diffs = []
  Yajl::Parser.parse(js) do |event|
    next unless event["type"] == "PushEvent"
    repo_name = event["repo"]["name"]
    event["payload"]["commits"].each do |commit|
      diffs << OpenStruct.new(repo_name: repo_name, sha: commit["sha"])
    end
  end
  counter = 0
  # An archive without any pushed commits has no progress to show; skipping
  # the call also keeps a zero total away from the bar's division.
  output_progress(counter, diffs.size) unless diffs.empty?
  EM::Synchrony::FiberIterator.new(diffs, 30).each do |diff|
    url = "https://github.com/#{diff.repo_name}/commit/#{diff.sha}.diff"
    begin
      # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
      start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      http = EM::HttpRequest.new(url).get
      text = http.response.to_s
      print "read #{url} (#{Process.clock_gettime(Process::CLOCK_MONOTONIC) - start} s)\n"
      counter += 1
      output_progress(counter, diffs.size)
      tally_diff(text, diff.repo_name, diff.sha)
    rescue OpenURI::HTTPError => e
      # NOTE(review): the request above goes through EM::HttpRequest, which
      # presumably never raises this open-uri error - confirm before removing.
      print "#{e.message}\n"
    rescue ArgumentError => e
      # Regex matching against a diff that is not valid UTF-8 raises
      # ArgumentError; such diffs are skipped, anything else is re-raised.
      if e.message =~ /invalid byte sequence/
        puts "Invalid byte sequence in UTF-8, skipping..."
      else
        raise
      end
    rescue => e
      # Deliberate debugging hook: drop into a pry session with `e` in scope.
      binding.pry
    end
  end
ensure
  puts "Intermediate results:"
  puts $results
end

# Walks a unified diff and passes each file's added lines to process_lines.
#
# text      - the diff text
# repo_name - "owner/repo", forwarded to process_lines
# sha       - commit SHA, forwarded to process_lines
#
# A "+++ b/<path>" header starts a new file: the lines gathered for the
# previous file are flushed first. Every other line beginning with "+" is
# collected with that marker removed.
def tally_diff(text, repo_name, sha)
  path = nil
  added_lines = []
  text.each_line do |l|
    case l
    when /^\+\+\+ b\/(.*)/
      next_path = $~[1]
      process_lines(path, added_lines, repo_name, sha)
      path = next_path
      added_lines = []
    when /^\+/
      added_lines << l[1..-1]
    end
  end
  # Flush the last file of the diff.
  process_lines(path, added_lines, repo_name, sha)
end
# Script entry point: inside the EventMachine reactor, process the hourly
# archive files for days "01" through "30" of 2015-01, day by day and hour by
# hour (0..23), then stop the reactor.
# NOTE(review): day "31" is not visited - confirm whether that is intentional.
EM.synchrony do
  days = ("01".."30").to_a
  hours = (0..23).to_a
  days.product(hours).each do |day, hour|
    process_file("archives/2015-01-#{day}-#{hour}.json.gz")
  end
  EM.stop
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment