Skip to content

Instantly share code, notes, and snippets.

@arfon
Last active Jun 2, 2016
Embed
What would you like to do?
require 'digest'
require 'time'
# Error raised by EventTransform when an event's payload does not have a
# shape it knows how to process.
class EventParseError < StandardError
end
# Transforms a raw event (a Hash with String keys, e.g. parsed from JSON) into
# the flat structure returned by #parsed_event, hashing commit author emails
# found in PushEvent payloads along the way.
#
# Usage:
#   transform = EventTransform.new(event_hash)
#   transform.process
#   transform.parsed_event
class EventTransform
  # Entries kept for the 'repo' field.
  # https://github.com/igrigorik/githubarchive.org/blob/c9ae11426e5bcc30fe15617d009dfc602697ecde/bigquery/schema.js#L17-L38
  REPO_ENTRIES = %w[id url name].freeze

  # Entries kept for the 'actor' and 'org' fields.
  # https://github.com/igrigorik/githubarchive.org/blob/c9ae11426e5bcc30fe15617d009dfc602697ecde/bigquery/schema.js#L39-L102
  USER_ENTRIES = %w[id login gravatar_id avatar_url url].freeze

  attr_accessor :actor, :created_at, :raw_event, :id, :org, :other, :payload, :is_public, :repo, :type

  # event_json - the raw event as a Hash with String keys.
  def initialize(event_json)
    @raw_event = event_json
    @other = {}
  end

  # Populate the accessors from the raw event, then scrub sensitive data.
  def process
    extract_and_set_fields
    sanitize
  end

  # Return the event hash as described in
  # https://github.com/igrigorik/githubarchive.org/blob/master/bigquery/schema.js
  def parsed_event
    {
      'actor' => actor,
      'created_at' => created_at,
      'id' => id,
      'org' => org,
      'other' => other,
      'payload' => payload,
      'public' => is_public,
      'repo' => repo,
      'type' => type
    }
  end

  # Scrub emails from push events. Could include further logic in future
  def sanitize
    scrub_payload_emails if type == 'PushEvent'
  end

  # Extract the top-level schema fields from the raw event body and do any
  # necessary processing of the fields
  def extract_and_set_fields
    @type = raw_event['type']
    @is_public = raw_event['public']
    @payload = raw_event['payload']
    @id = raw_event['id']
    @repo = parse_field('repo', REPO_ENTRIES)
    @actor = parse_field('actor', USER_ENTRIES)
    @org = parse_field('org', USER_ENTRIES)
    @created_at = parse_created_at
  end

  # Read a nested field from the raw event and keep only the expected entries;
  # any extraneous entries are stored under the same name in 'other'.
  #
  # The raw event is read, never modified, so calling #process more than once
  # yields the same repo/actor/org values each time.
  #
  # field_name       - top-level key to read from the raw event.
  # expected_entries - Array of keys to copy out of that field.
  #
  # Returns a Hash containing every expected entry (nil when absent), or nil
  # when the field itself is missing (e.g. for anonymous Gists actor is nil).
  def parse_field(field_name, expected_entries)
    event_field = raw_event[field_name]
    return nil if event_field.nil?

    parsed = expected_entries.each_with_object({}) do |entry, fields|
      fields[entry] = event_field[entry]
    end

    extras = event_field.reject { |key, _value| expected_entries.include?(key) }
    @other[field_name] = extras unless extras.empty?

    parsed
  end

  # Returns the raw event's 'created_at' converted to UTC and formatted as
  # 'YYYY-MM-DD HH:MM:SS'. Time.parse needs the stdlib 'time' library.
  def parse_created_at
    Time.parse(raw_event['created_at']).utc.strftime('%Y-%m-%d %T')
  end

  # Replace commit author emails inside the payload with sanitized versions.
  # The payload is modified in place, and it is the same object as
  # raw_event['payload'], so the raw event is scrubbed as well.
  #
  # Raises EventParseError when the payload is missing or has neither a
  # 'shas' nor a 'commits' key.
  def scrub_payload_emails
    unless payload.respond_to?(:key?)
      raise EventParseError, 'PushEvent payload is missing or is not a Hash'
    end

    if payload.key?('shas')
      # Older format PushEvents have commits described as 'shas'. These
      # have format ['git sha', 'author email', 'commit message', 'author name']
      (payload['shas'] || []).each do |commit|
        commit[1] = sanitize_email(commit[1])
      end
    elsif payload.key?('commits')
      # Newer PushEvents have a 'commits' key with nested attributes:
      # "commits": [
      #   {
      #     "sha": "5636aa2f6f249f22e76b20e5caeb84096b7302ce",
      #     "author" : {
      #       "email": "email@example.com",
      #       "name": "commiter_login"
      #     },
      #     "message": "Commit message",
      #     "distinct": true,
      #     "url": "API commit URL"
      #   }
      # ]
      (payload['commits'] || []).each do |commit|
        author = commit['author']
        # A commit without author details has no email to scrub.
        next if author.nil?

        author['email'] = sanitize_email(author['email'])
      end
    else
      raise EventParseError, "PushEvent payload has neither 'shas' nor 'commits'"
    end
  end

  # Hash the identifying part of an email address.
  #
  # If the email doesn't look to be valid, a SHA1 of the whole thing is
  # returned. This happens when:
  # - The email is empty (nil)
  # - The email is shorter than three characters once stripped
  # - The email doesn't include an '@' symbol
  #
  # Otherwise everything before the LAST '@' is replaced by its SHA1 and the
  # domain after it is kept, so an address containing several '@' characters
  # never exposes part of its local part.
  def sanitize_email(email)
    address = email.to_s.strip
    if address.length < 3 || !address.include?('@')
      return Digest::SHA1.hexdigest(email.to_s)
    end

    prefix, _separator, domain = address.rpartition('@')
    "#{Digest::SHA1.hexdigest(prefix)}@#{domain}"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment