Github archive client
# Wanted to do this with just the stdlib, and do it in a firehose-type
# fashion. Could have used their big-data interface, but hey.
#
# A lot more could be done: validation of input dates, a retry queue,
# logging, etc.
require 'date'
require 'open-uri'
require 'zlib'
require 'json'

class GithubRepo
  # Could have just used a Struct here, but maybe I'll
  # give it some behavior later.
  # Add an increment method so you could make it thread safe?
  # (A sketch of that idea follows this class.)
  attr_accessor :owner, :name, :event_count
end
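# A minimal sketch of the thread-safe increment idea above: a hypothetical
# subclass that guards the counter with a Mutex. Nothing below uses it;
# the client as written is single-threaded.
class ThreadSafeGithubRepo < GithubRepo
  def initialize
    @mutex = Mutex.new
    self.event_count = 0
  end

  # Serialize updates so concurrent fetcher threads can't lose increments.
  def increment!
    @mutex.synchronize { self.event_count += 1 }
  end
end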
class GitHubArchiveClient
  attr_accessor :github_repos

  BASE_URL = "http://data.githubarchive.org/"
  GH_DT_FORMAT = "%Y-%m-%d-%k"

  def initialize
    self.github_repos = {}
    @url_queue = []
    @options = { dt_after: nil, dt_before: nil, event: nil, n: nil }
  end

  def start
    parse_options
    validate_argv
    read_n_parse
  end

  def read_n_parse
    # This could be abstracted more cleanly.
    # Could add a switch for local files:
    # Dir["*.gz"].each do |f|
    github_file_fetcher.each do |f|
      puts "Fetching #{f}"
      begin
        gz_file = URI.open(f)
        Zlib::GzipReader.new(gz_file).readlines.each do |line|
          parse_line(line)
        end
      rescue
        # Could add a failed queue here for retries
        # (a sketch of that idea follows this method).
        puts "Can't open #{f}"
      end
    end
  end
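  # A hedged sketch of the retry-queue idea noted above: collect URLs that
  # fail, then make one more pass over them. The fetch_with_retry and
  # fetch_one names are hypothetical; nothing else in this class calls them.
  def fetch_with_retry
    failed_queue = []
    github_file_fetcher.each { |f| fetch_one(f) { failed_queue << f } }
    # Single retry pass; a real client might back off and loop until empty.
    failed_queue.each { |f| fetch_one(f) { puts "Giving up on #{f}" } }
  end

  # Fetch and parse one archive file, yielding to the block on failure.
  def fetch_one(f)
    gz_file = URI.open(f)
    Zlib::GzipReader.new(gz_file).readlines.each { |line| parse_line(line) }
  rescue
    yield
  end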
  def parse_line(line)
    if valid_json? line
      json_hash = JSON.parse(line)
      # Could check against valid event types, but what if they added one?
      # Add a force switch?
      if json_hash["type"] == @options[:event]
        repo = json_hash["repository"] || {}
        add_or_increment(repo)
      end
    end
  end

  def add_or_increment(repo)
    # Could have done this many ways, but keeping references in memory
    # probably makes this pretty fast.
    repo_owner = repo["owner"]
    repo_name = repo["name"]
    # Key this so we can look it up quick-style with a symbol.
    # Can't just use the repo name; there might be multiple repos with the
    # same name. The separator keeps distinct owner/name pairs from
    # colliding when concatenated.
    if repo_owner && repo_name
      repo_key = "#{repo_owner}/#{repo_name}".to_sym
    else
      # Edge case: key it with junk if we don't have both.
      repo_key = :junk
    end
    # Create or update.
    if github_repos.has_key?(repo_key)
      update_github_repo(repo_key)
    else
      create_github_repo(repo_key, repo_name, repo_owner)
    end
  end
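  # An alternative sketch for the "many ways" note above: a counts hash
  # with a default value drops the create-or-update branch entirely, at
  # the cost of losing the GithubRepo objects:
  #   counts = Hash.new(0)
  #   counts[repo_key] += 1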
  def create_github_repo(key, name, owner)
    github_repo = GithubRepo.new
    github_repo.name = name
    github_repo.owner = owner
    github_repo.event_count = 1
    self.github_repos[key] = github_repo
  end

  def update_github_repo(key)
    github_repo = github_repos[key]
    github_repo.event_count += 1
  end

  def github_file_fetcher
    # Use the validator and bail on bad input.
    begin
      dt_after = DateTime.rfc3339(@options[:dt_after])
      dt_before = DateTime.rfc3339(@options[:dt_before])
    rescue
      abort "Invalid date"
    end
    generate_file_queue(dt_after, dt_before)
    @url_queue
  end

  def generate_file_queue(from_dt, to_dt)
    # Good candidate for recursion, so let's use it
    # (an iterative version is sketched after this method).
    # Increment by an hour (60/1440 of a day) until we reach the target
    # date (to_dt).
    from_dt = (from_dt + Rational(60, 1440))
    a = from_dt.strftime GH_DT_FORMAT
    b = to_dt.strftime GH_DT_FORMAT
    # %k gives a space-padded hour; strip it.
    @url_queue << "#{BASE_URL}#{a}.json.gz".gsub(/\s+/, "")
    # Base case: the from-datetime format string reaches the to-datetime one.
    generate_file_queue(from_dt, to_dt) unless a == b
  end
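  # A hedged iterative equivalent of generate_file_queue: the same
  # hour-by-hour walk without recursion, so a very long date range can't
  # blow the stack. Not called anywhere; the recursive version above runs.
  def generate_file_queue_iteratively(from_dt, to_dt)
    until from_dt.strftime(GH_DT_FORMAT) == to_dt.strftime(GH_DT_FORMAT)
      from_dt += Rational(60, 1440) # advance one hour
      a = from_dt.strftime(GH_DT_FORMAT)
      @url_queue << "#{BASE_URL}#{a}.json.gz".gsub(/\s+/, "")
    end
  end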
  def parse_options
    # OptionParser is too much sometimes
    # (a sketch using it anyway follows this method).
    ARGV.each_with_index do |arg, i|
      case arg
      when "--after"
        @options[:dt_after] = ARGV[i + 1]
      when "--before"
        @options[:dt_before] = ARGV[i + 1]
      when "--event"
        @options[:event] = ARGV[i + 1]
      when "-n"
        @options[:n] = ARGV[i + 1]
      end
    end
  end
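  # For comparison, a hedged sketch of the same flags via stdlib
  # OptionParser; not called anywhere, parse_options above is what runs.
  def parse_options_with_optparse
    require 'optparse'
    OptionParser.new do |opts|
      opts.on("--after DATETIME") { |v| @options[:dt_after] = v }
      opts.on("--before DATETIME") { |v| @options[:dt_before] = v }
      opts.on("--event TYPE") { |v| @options[:event] = v }
      opts.on("-n COUNT") { |v| @options[:n] = v }
    end.parse!(ARGV)
  end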
  def validate_argv
    # add stuff
    true
  end

  def valid_json?(json)
    !!JSON.parse(json)
  rescue
    false
  end

  # True only if the string parses as RFC 3339.
  def valid_date?(dt)
    !!DateTime.rfc3339(dt)
  rescue
    false
  end
end
time_start = Time.now
@gh = GitHubArchiveClient.new
@gh.start
time_end = Time.now
et = time_end - time_start

@gh.github_repos.each_value do |repo|
  event_string = repo.event_count > 1 ? "events" : "event"
  puts "#{repo.owner}/#{repo.name} - #{repo.event_count} #{event_string}"
end
puts "ELAPSED TIME: #{et}"

# Run time could be reduced by using multiple threads to fetch files and
# update in parallel.
# Could write all the data to a fast key-value store.
# Could make a class for writing different output formats such as CSV,
# JSON, XML (a sketch of that follows).
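# A hedged sketch of that output-writer idea, using stdlib CSV; the
# RepoReportWriter name is hypothetical and nothing above invokes it.
# JSON and XML variants would follow the same shape.
require 'csv'

class RepoReportWriter
  def initialize(github_repos)
    @github_repos = github_repos
  end

  # Write one row per repo: owner, name, event count.
  def write_csv(path)
    CSV.open(path, "w") do |csv|
      csv << ["owner", "name", "event_count"]
      @github_repos.each_value do |repo|
        csv << [repo.owner, repo.name, repo.event_count]
      end
    end
  end
end
# e.g. RepoReportWriter.new(@gh.github_repos).write_csv("report.csv")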
@patbenatar The sample I was running took around a minute total to pull down 14 roughly 1 MB gzipped files of around 5K lines...