@pgdaniel
Created April 25, 2014 01:24
GitHub Archive client
# Wanted to do this with just the stdlib and do it in a firehose
# type fashion
# Could have used their big-data interface but hey
#
# A lot more could be done: validation of input dates, a retry queue,
# logging, etc...
require 'date'
require 'open-uri'
require 'zlib'
require 'json'
class GithubRepo
  # Could have just used a Struct here but maybe I'll
  # give it some behavior later.
  # Add an increment method so you could make it thread safe?
  attr_accessor :owner, :name, :event_count
end
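# A minimal sketch of the thread-safe increment mentioned above, assuming
# parsing ever moves onto multiple threads: guard the counter with a Mutex.
# The class name ThreadSafeGithubRepo and #increment! are illustrative and
# unused by the script below.
class ThreadSafeGithubRepo
  attr_accessor :owner, :name
  attr_reader :event_count

  def initialize
    @event_count = 0
    @mutex = Mutex.new
  end

  # Bump the event counter atomically so concurrent parsers don't race
  def increment!
    @mutex.synchronize { @event_count += 1 }
  end
end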
class GitHubArchiveClient
  attr_accessor :github_repos

  BASE_URL = "http://data.githubarchive.org/"
  GH_DT_FORMAT = "%Y-%m-%d-%k"

  def initialize
    self.github_repos = {}
    @url_queue = []
    @options = {dt_after: nil, dt_before: nil, event: nil, n: nil}
  end
  def start
    parse_options
    validate_argv
    read_n_parse
  end
  def read_n_parse
    # This could be abstracted more cleanly
    # Could add a switch for local files:
    # Dir["*.gz"].each do |f|
    begin
      github_file_fetcher.each do |f|
        puts "Fetching #{f}"
        gz_file = open(f)
        Zlib::GzipReader.new(gz_file).readlines.each do |line|
          parsa_line(line)
        end
      end
    rescue
      # Could add a failed queue here for retries
      puts "Can't open file"
    end
  end
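  # A minimal sketch of the "failed queue" idea noted above: remember URLs
  # that blow up and give them one more pass. The names fetch_and_parse,
  # retry_failed, and @failed_queue are illustrative and unused elsewhere.
  def fetch_and_parse(url)
    gz_file = open(url)
    Zlib::GzipReader.new(gz_file).readlines.each { |line| parsa_line(line) }
  rescue
    # Stash the URL so a later pass can try it again
    (@failed_queue ||= []) << url
  end

  def retry_failed
    to_retry = @failed_queue.to_a
    @failed_queue = []
    # One retry pass; anything that fails again lands back in @failed_queue
    # but isn't retried here
    to_retry.each { |url| fetch_and_parse(url) }
  end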
  def parsa_line(line)
    if valid_json? line
      json_hash = JSON.parse(line)
      # Could check against valid event types but what if they added one?
      # Add a force switch?
      if json_hash["type"] == @options[:event]
        repo = json_hash["repository"] || {}
        add_or_increment(repo)
      end
    end
  end
  def add_or_increment(repo)
    # Could have done this many ways but keeping references in memory
    # probably makes this pretty fast
    repo_owner = repo["owner"]
    repo_name = repo["name"]
    # Key this so we can look it up quick style with a symbol
    # Can't just use the repo name, there might be multiple repos with the same name
    if repo_owner && repo_name
      repo_key = (repo_owner + repo_name).to_sym
    else
      # Edge case
      # Hash it with junk if we don't have both
      repo_key = :junk
    end
    # Call create or update
    github_repos.has_key?(repo_key) ? update_github_repo(repo_key) : create_github_repo(repo_key, repo_name, repo_owner)
  end
  def create_github_repo(key, name, owner)
    github_repo = GithubRepo.new
    github_repo.name = name
    github_repo.owner = owner
    github_repo.event_count = 1
    self.github_repos[key] = github_repo
  end
  def update_github_repo(key)
    github_repo = github_repos[key]
    github_repo.event_count += 1
  end
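  # A sketch of one of the "many ways" mentioned above: skip the GithubRepo
  # objects and keep a plain counter hash with a default of 0. The method
  # name count_event_for and @event_counts are illustrative and unused.
  def count_event_for(repo_key)
    @event_counts ||= Hash.new(0)
    @event_counts[repo_key] += 1
  end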
  def github_file_fetcher
    # Bail with an empty queue if the dates won't parse
    # (valid_date? below could be used up front instead)
    begin
      dt_after = DateTime.rfc3339(@options[:dt_after])
      dt_before = DateTime.rfc3339(@options[:dt_before])
    rescue
      puts "Invalid date"
      return @url_queue
    end
    generate_file_queue(dt_after, dt_before)
    @url_queue
  end
  def generate_file_queue(from_dt, to_dt)
    # Good candidate for recursion so let's use it
    # Increment by an hour until we reach the target date (to_dt)
    from_dt = (from_dt + Rational(60, 1440))
    a = from_dt.strftime GH_DT_FORMAT
    b = to_dt.strftime GH_DT_FORMAT
    # %k gives a space-padded hour, so strip the padding
    @url_queue << "#{BASE_URL}#{a}.json.gz".gsub(/\s+/, "")
    # Base case: the from datetime string reaches the to datetime string
    generate_file_queue(from_dt, to_dt) unless a == b
  end
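  # A sketch of the same hour-by-hour walk done iteratively, which avoids
  # deep recursion when the date range spans many days. The method name
  # generate_file_queue_iterative is illustrative and unused.
  def generate_file_queue_iterative(from_dt, to_dt)
    until from_dt >= to_dt
      from_dt += Rational(60, 1440)   # step forward one hour
      name = from_dt.strftime(GH_DT_FORMAT).gsub(/\s+/, "")
      @url_queue << "#{BASE_URL}#{name}.json.gz"
    end
  end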
  def parse_options
    # optparse is too much sometimes
    ARGV.each_with_index do |arg, i|
      case arg
      when "--after"
        @options[:dt_after] = ARGV[i + 1]
      when "--before"
        @options[:dt_before] = ARGV[i + 1]
      when "--event"
        @options[:event] = ARGV[i + 1]
      when "-n"
        @options[:n] = ARGV[i + 1]
      end
    end
  end
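  # For comparison, a sketch of the same flags handled by stdlib optparse.
  # The method name parse_options_with_optparse is illustrative and unused.
  def parse_options_with_optparse
    require 'optparse'
    OptionParser.new do |o|
      o.on("--after DATE")  { |v| @options[:dt_after]  = v }
      o.on("--before DATE") { |v| @options[:dt_before] = v }
      o.on("--event TYPE")  { |v| @options[:event]     = v }
      o.on("-n COUNT")      { |v| @options[:n]         = v }
    end.parse!(ARGV)
  end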
  def validate_argv
    # add stuff
    true
  end
  def valid_json?(json)
    begin
      !!JSON.parse(json)
    rescue
      false
    end
  end
  def valid_date?(dt)
    # Same RFC 3339 format github_file_fetcher expects
    begin
      !!DateTime.rfc3339(dt)
    rescue
      false
    end
  end
end
time_start = Time.now
@gh = GitHubArchiveClient.new
@gh.start
time_end = Time.now
et = time_end - time_start
@gh.github_repos.each do |k, v|
  event_string = v.event_count > 1 ? "events" : "event"
  puts "#{v.owner}/#{v.name} - #{v.event_count} #{event_string}"
end
puts "ELAPSED TIME: #{et}"
# Runtime could be reduced by running multiple threads to fetch files and update counts in parallel
# Could write all the data to a fast key-value store
# Could make a class for writing different output formats such as CSV, JSON, XML
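# A minimal sketch of the threaded-fetch idea above: a few worker threads
# pull URLs off a Queue and hand each decompressed line to a block, with a
# Mutex guarding whatever shared state the block updates. The method name
# fetch_in_threads and the worker count are illustrative, not part of the
# client class.
def fetch_in_threads(urls, workers = 4)
  queue = Queue.new
  urls.each { |u| queue << u }
  mutex = Mutex.new

  threads = workers.times.map do
    Thread.new do
      loop do
        url = queue.pop(true) rescue break   # non-blocking pop; stop when the queue is drained
        gz_file = open(url)
        Zlib::GzipReader.new(gz_file).readlines.each do |line|
          mutex.synchronize { yield(line) }
        end
      end
    end
  end
  threads.each(&:join)
end
# e.g. fetch_in_threads(urls) { |line| @gh.parsa_line(line) }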
@pgdaniel (Author):
@patbenatar The sample I was running takes around a minute in total to pull down 14 ~1M gzipped files of around 5K lines...
