Github archive client
# Wanted to do this with just the stdlib, and do it in a firehose-type
# fashion. Could have used their big-data interface, but hey.
#
# A lot more could be done: validation of input dates, a retry queue,
# logging, etc.
require 'date'
require 'open-uri'
require 'zlib'
require 'json'

class GithubRepo
  # Could have just used a Struct here, but maybe I'll
  # give it some behavior later.
  # Add an increment method so you could make it thread safe?
  # (A sketch of that idea follows this class.)
  attr_accessor :owner, :name, :event_count
end
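# A minimal sketch of the thread-safe increment idea above: a hypothetical
# subclass that guards the counter with a Mutex. Nothing below uses it;
# the client as written is single-threaded.
class ThreadSafeGithubRepo < GithubRepo
  def initialize
    @mutex = Mutex.new
    self.event_count = 0
  end

  # Serialize updates so concurrent fetcher threads can't lose increments.
  def increment!
    @mutex.synchronize { self.event_count += 1 }
  end
end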
class GitHubArchiveClient
  attr_accessor :github_repos

  BASE_URL = "http://data.githubarchive.org/"
  GH_DT_FORMAT = "%Y-%m-%d-%k"

  def initialize
    self.github_repos = {}
    @url_queue = []
    @options = { dt_after: nil, dt_before: nil, event: nil, n: nil }
  end

  def start
    parse_options
    validate_argv
    read_n_parse
  end

  def read_n_parse
    # This could be abstracted more cleanly.
    # Could add a switch for local files:
    # Dir["*.gz"].each do |f|
    github_file_fetcher.each do |f|
      puts "Fetching #{f}"
      begin
        gz_file = URI.open(f)
        Zlib::GzipReader.new(gz_file).readlines.each do |line|
          parse_line(line)
        end
      rescue
        # Could add a failed queue here for retries
        # (a sketch of that idea follows this method).
        puts "Can't open #{f}"
      end
    end
  end
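  # A hedged sketch of the retry-queue idea noted above: collect URLs that
  # fail, then make one more pass over them. The fetch_with_retry and
  # fetch_one names are hypothetical; nothing else in this class calls them.
  def fetch_with_retry
    failed_queue = []
    github_file_fetcher.each { |f| fetch_one(f) { failed_queue << f } }
    # Single retry pass; a real client might back off and loop until empty.
    failed_queue.each { |f| fetch_one(f) { puts "Giving up on #{f}" } }
  end

  # Fetch and parse one archive file, yielding to the block on failure.
  def fetch_one(f)
    gz_file = URI.open(f)
    Zlib::GzipReader.new(gz_file).readlines.each { |line| parse_line(line) }
  rescue
    yield
  end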
  def parse_line(line)
    if valid_json? line
      json_hash = JSON.parse(line)
      # Could check against valid event types, but what if they added one?
      # Add a force switch?
      if json_hash["type"] == @options[:event]
        repo = json_hash["repository"] || {}
        add_or_increment(repo)
      end
    end
  end

  def add_or_increment(repo)
    # Could have done this many ways, but keeping references in memory
    # probably makes this pretty fast.
    repo_owner = repo["owner"]
    repo_name = repo["name"]
    # Key this so we can look it up quick-style with a symbol.
    # Can't just use the repo name; there might be multiple repos with the
    # same name. The separator keeps distinct owner/name pairs from
    # colliding when concatenated.
    if repo_owner && repo_name
      repo_key = "#{repo_owner}/#{repo_name}".to_sym
    else
      # Edge case: key it with junk if we don't have both.
      repo_key = :junk
    end
    # Create or update.
    if github_repos.has_key?(repo_key)
      update_github_repo(repo_key)
    else
      create_github_repo(repo_key, repo_name, repo_owner)
    end
  end
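  # An alternative sketch for the "many ways" note above: a counts hash
  # with a default value drops the create-or-update branch entirely, at
  # the cost of losing the GithubRepo objects:
  #   counts = Hash.new(0)
  #   counts[repo_key] += 1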
  def create_github_repo(key, name, owner)
    github_repo = GithubRepo.new
    github_repo.name = name
    github_repo.owner = owner
    github_repo.event_count = 1
    self.github_repos[key] = github_repo
  end

  def update_github_repo(key)
    github_repo = github_repos[key]
    github_repo.event_count += 1
  end

  def github_file_fetcher
    # Use the validator and bail on bad input.
    begin
      dt_after = DateTime.rfc3339(@options[:dt_after])
      dt_before = DateTime.rfc3339(@options[:dt_before])
    rescue
      abort "Invalid date"
    end
    generate_file_queue(dt_after, dt_before)
    @url_queue
  end

  def generate_file_queue(from_dt, to_dt)
    # Good candidate for recursion, so let's use it
    # (an iterative version is sketched after this method).
    # Increment by an hour (60/1440 of a day) until we reach the target
    # date (to_dt).
    from_dt = (from_dt + Rational(60, 1440))
    a = from_dt.strftime GH_DT_FORMAT
    b = to_dt.strftime GH_DT_FORMAT
    # %k gives a space-padded hour; strip it.
    @url_queue << "#{BASE_URL}#{a}.json.gz".gsub(/\s+/, "")
    # Base case: the from-datetime format string reaches the to-datetime one.
    generate_file_queue(from_dt, to_dt) unless a == b
  end
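  # A hedged iterative equivalent of generate_file_queue: the same
  # hour-by-hour walk without recursion, so a very long date range can't
  # blow the stack. Not called anywhere; the recursive version above runs.
  def generate_file_queue_iteratively(from_dt, to_dt)
    until from_dt.strftime(GH_DT_FORMAT) == to_dt.strftime(GH_DT_FORMAT)
      from_dt += Rational(60, 1440) # advance one hour
      a = from_dt.strftime(GH_DT_FORMAT)
      @url_queue << "#{BASE_URL}#{a}.json.gz".gsub(/\s+/, "")
    end
  end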
  def parse_options
    # OptionParser is too much sometimes
    # (a sketch using it anyway follows this method).
    ARGV.each_with_index do |arg, i|
      case arg
      when "--after"
        @options[:dt_after] = ARGV[i + 1]
      when "--before"
        @options[:dt_before] = ARGV[i + 1]
      when "--event"
        @options[:event] = ARGV[i + 1]
      when "-n"
        @options[:n] = ARGV[i + 1]
      end
    end
  end
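  # For comparison, a hedged sketch of the same flags via stdlib
  # OptionParser; not called anywhere, parse_options above is what runs.
  def parse_options_with_optparse
    require 'optparse'
    OptionParser.new do |opts|
      opts.on("--after DATETIME") { |v| @options[:dt_after] = v }
      opts.on("--before DATETIME") { |v| @options[:dt_before] = v }
      opts.on("--event TYPE") { |v| @options[:event] = v }
      opts.on("-n COUNT") { |v| @options[:n] = v }
    end.parse!(ARGV)
  end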
  def validate_argv
    # add stuff
    true
  end

  def valid_json?(json)
    !!JSON.parse(json)
  rescue
    false
  end

  # True only if the string parses as RFC 3339.
  def valid_date?(dt)
    !!DateTime.rfc3339(dt)
  rescue
    false
  end
end
time_start = Time.now
@gh = GitHubArchiveClient.new
@gh.start
time_end = Time.now
et = time_end - time_start

@gh.github_repos.each_value do |repo|
  event_string = repo.event_count > 1 ? "events" : "event"
  puts "#{repo.owner}/#{repo.name} - #{repo.event_count} #{event_string}"
end
puts "ELAPSED TIME: #{et}"

# Run time could be reduced by using multiple threads to fetch files and
# update in parallel.
# Could write all the data to a fast key-value store.
# Could make a class for writing different output formats such as CSV,
# JSON, XML (a sketch of that follows).
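# A hedged sketch of that output-writer idea, using stdlib CSV; the
# RepoReportWriter name is hypothetical and nothing above invokes it.
# JSON and XML variants would follow the same shape.
require 'csv'

class RepoReportWriter
  def initialize(github_repos)
    @github_repos = github_repos
  end

  # Write one row per repo: owner, name, event count.
  def write_csv(path)
    CSV.open(path, "w") do |csv|
      csv << ["owner", "name", "event_count"]
      @github_repos.each_value do |repo|
        csv << [repo.owner, repo.name, repo.event_count]
      end
    end
  end
end
# e.g. RepoReportWriter.new(@gh.github_repos).write_csv("report.csv")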
@patbenatar The sample I was running took around a minute total to pull down 14 roughly 1 MB gzipped files of around 5K lines...