Skip to content

Instantly share code, notes, and snippets.

@omalley
Last active September 26, 2018 08:16
Show Gist options
  • Save omalley/5125691 to your computer and use it in GitHub Desktop.
Save omalley/5125691 to your computer and use it in GitHub Desktop.
Auto-discovered Apache Hive schema for githubarchive.org's JSON logs
create table github (
actor: string,
actor_attributes: struct <
blog: string,
company: string,
email: string,
gravatar_id: binary,
location: string,
login: string,
name: string,
type: string>,
created_at: timestamp,
payload: struct <
action: string,
comment: struct <
_links: struct <
html: struct <
href: string>,
pull_request: struct <
href: string>,
self: struct <
href: string>>,
body: string,
commit_id: binary,
created_at: timestamp,
id: int,
original_commit_id: binary,
original_position: tinyint,
path: string,
position: tinyint,
updated_at: timestamp,
url: string,
user: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: smallint,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>>,
comment_id: int,
commit: binary,
desc: string,
description: string,
head: binary,
id: int,
issue: int,
issue_id: int,
master_branch: string,
member: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>,
name: string,
number: smallint,
pages: array <struct <
action: string,
html_url: string,
page_name: string,
sha: binary,
summary: string,
title: string>>,
pull_request: struct <
_links: struct <
comments: struct <
href: string>,
html: struct <
href: string>,
issue: struct <
href: string>,
review_comments: struct <
href: string>,
self: struct <
href: string>>,
additions: tinyint,
assignee: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>,
base: struct <
label: string,
ref: string,
repo: struct <
archive_url: string,
assignees_url: string,
blobs_url: string,
branches_url: string,
clone_url: string,
collaborators_url: string,
comments_url: string,
commits_url: string,
compare_url: string,
contents_url: string,
contributors_url: string,
created_at: timestamp,
default_branch: string,
description: string,
downloads_url: string,
events_url: string,
fork: boolean,
forks: tinyint,
forks_count: tinyint,
forks_url: string,
full_name: string,
git_commits_url: string,
git_refs_url: string,
git_tags_url: string,
git_url: string,
has_downloads: boolean,
has_issues: boolean,
has_wiki: boolean,
homepage: string,
hooks_url: string,
html_url: string,
id: int,
issue_comment_url: string,
issue_events_url: string,
issues_url: string,
keys_url: string,
labels_url: string,
language: string,
languages_url: string,
master_branch: string,
merges_url: string,
milestones_url: string,
mirror_url: string,
name: string,
notifications_url: string,
open_issues: tinyint,
open_issues_count: tinyint,
owner: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>,
private: boolean,
pulls_url: string,
pushed_at: timestamp,
size: smallint,
ssh_url: string,
stargazers_url: string,
statuses_url: string,
subscribers_url: string,
subscription_url: string,
svn_url: string,
tags_url: string,
teams_url: string,
trees_url: string,
updated_at: timestamp,
url: string,
watchers: tinyint,
watchers_count: tinyint>,
sha: binary,
user: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>>,
body: string,
changed_files: tinyint,
closed_at: timestamp,
comments: tinyint,
comments_url: string,
commits: tinyint,
commits_url: string,
created_at: timestamp,
deletions: tinyint,
diff_url: string,
head: struct <
label: string,
ref: string,
repo: struct <
archive_url: string,
assignees_url: string,
blobs_url: string,
branches_url: string,
clone_url: string,
collaborators_url: string,
comments_url: string,
commits_url: string,
compare_url: string,
contents_url: string,
contributors_url: string,
created_at: timestamp,
default_branch: string,
description: string,
downloads_url: string,
events_url: string,
fork: boolean,
forks: tinyint,
forks_count: tinyint,
forks_url: string,
full_name: string,
git_commits_url: string,
git_refs_url: string,
git_tags_url: string,
git_url: string,
has_downloads: boolean,
has_issues: boolean,
has_wiki: boolean,
homepage: string,
hooks_url: string,
html_url: string,
id: int,
issue_comment_url: string,
issue_events_url: string,
issues_url: string,
keys_url: string,
labels_url: string,
language: string,
languages_url: string,
master_branch: string,
merges_url: string,
milestones_url: string,
mirror_url: string,
name: string,
notifications_url: string,
open_issues: tinyint,
open_issues_count: tinyint,
owner: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>,
private: boolean,
pulls_url: string,
pushed_at: timestamp,
size: smallint,
ssh_url: string,
stargazers_url: string,
statuses_url: string,
subscribers_url: string,
subscription_url: string,
svn_url: string,
tags_url: string,
teams_url: string,
trees_url: string,
updated_at: timestamp,
url: string,
watchers: tinyint,
watchers_count: tinyint>,
sha: binary,
user: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>>,
html_url: string,
id: int,
issue_url: string,
merge_commit_sha: binary,
mergeable: boolean,
mergeable_state: string,
merged: boolean,
merged_at: timestamp,
merged_by: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>,
milestone: struct <
closed_issues: tinyint,
created_at: timestamp,
creator: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>,
description: string,
due_on: timestamp,
id: int,
labels_url: string,
number: tinyint,
open_issues: tinyint,
state: string,
title: string,
updated_at: timestamp,
url: string>,
number: tinyint,
patch_url: string,
review_comment_url: string,
review_comments: tinyint,
review_comments_url: string,
state: string,
title: string,
updated_at: timestamp,
url: string,
user: struct <
avatar_url: string,
events_url: string,
followers_url: string,
following_url: string,
gists_url: string,
gravatar_id: binary,
id: int,
login: string,
organizations_url: string,
received_events_url: string,
repos_url: string,
starred_url: string,
subscriptions_url: string,
type: string,
url: string>>,
ref: string,
ref_type: string,
shas: array <array <uniontype <boolean,string,timestamp>>>,
size: tinyint,
target: struct <
followers: tinyint,
gravatar_id: binary,
id: int,
login: string,
repos: tinyint>,
url: string>,
public: boolean,
repository: struct <
created_at: timestamp,
description: string,
fork: boolean,
forks: tinyint,
has_downloads: boolean,
has_issues: boolean,
has_wiki: boolean,
homepage: string,
id: int,
integrate_branch: string,
language: string,
master_branch: string,
name: string,
open_issues: tinyint,
organization: string,
owner: string,
private: boolean,
pushed_at: timestamp,
size: smallint,
stargazers: tinyint,
url: string,
watchers: tinyint>,
type: string,
url: string
)
@omalley
Copy link
Author

omalley commented Mar 9, 2013

This was discovered by looking at the Jan 2013 logs.

events: 5365969
gzip json: 1.6GB
json: 8.3GB

@omalley
Copy link
Author

omalley commented Mar 13, 2013

The code I used is posted here: https://github.com/hortonworks/hive-json

@omalley
Copy link
Author

omalley commented Apr 18, 2018

Ok, the code has been incorporated into the ORC tools.

JSON Schema discovery tool documentation
JSON Schema discovery tool code

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment