Last active
September 26, 2018 08:16
-
-
Save omalley/5125691 to your computer and use it in GitHub Desktop.
Auto-discovered Apache Hive schema for githubarchive.org's JSON logs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create table github ( | |
actor: string, | |
actor_attributes: struct < | |
blog: string, | |
company: string, | |
email: string, | |
gravatar_id: binary, | |
location: string, | |
login: string, | |
name: string, | |
type: string>, | |
created_at: timestamp, | |
payload: struct < | |
action: string, | |
comment: struct < | |
_links: struct < | |
html: struct < | |
href: string>, | |
pull_request: struct < | |
href: string>, | |
self: struct < | |
href: string>>, | |
body: string, | |
commit_id: binary, | |
created_at: timestamp, | |
id: int, | |
original_commit_id: binary, | |
original_position: tinyint, | |
path: string, | |
position: tinyint, | |
updated_at: timestamp, | |
url: string, | |
user: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: smallint, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>>, | |
comment_id: int, | |
commit: binary, | |
desc: string, | |
description: string, | |
head: binary, | |
id: int, | |
issue: int, | |
issue_id: int, | |
master_branch: string, | |
member: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>, | |
name: string, | |
number: smallint, | |
pages: array <struct < | |
action: string, | |
html_url: string, | |
page_name: string, | |
sha: binary, | |
summary: string, | |
title: string>>, | |
pull_request: struct < | |
_links: struct < | |
comments: struct < | |
href: string>, | |
html: struct < | |
href: string>, | |
issue: struct < | |
href: string>, | |
review_comments: struct < | |
href: string>, | |
self: struct < | |
href: string>>, | |
additions: tinyint, | |
assignee: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>, | |
base: struct < | |
label: string, | |
ref: string, | |
repo: struct < | |
archive_url: string, | |
assignees_url: string, | |
blobs_url: string, | |
branches_url: string, | |
clone_url: string, | |
collaborators_url: string, | |
comments_url: string, | |
commits_url: string, | |
compare_url: string, | |
contents_url: string, | |
contributors_url: string, | |
created_at: timestamp, | |
default_branch: string, | |
description: string, | |
downloads_url: string, | |
events_url: string, | |
fork: boolean, | |
forks: tinyint, | |
forks_count: tinyint, | |
forks_url: string, | |
full_name: string, | |
git_commits_url: string, | |
git_refs_url: string, | |
git_tags_url: string, | |
git_url: string, | |
has_downloads: boolean, | |
has_issues: boolean, | |
has_wiki: boolean, | |
homepage: string, | |
hooks_url: string, | |
html_url: string, | |
id: int, | |
issue_comment_url: string, | |
issue_events_url: string, | |
issues_url: string, | |
keys_url: string, | |
labels_url: string, | |
language: string, | |
languages_url: string, | |
master_branch: string, | |
merges_url: string, | |
milestones_url: string, | |
mirror_url: string, | |
name: string, | |
notifications_url: string, | |
open_issues: tinyint, | |
open_issues_count: tinyint, | |
owner: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>, | |
private: boolean, | |
pulls_url: string, | |
pushed_at: timestamp, | |
size: smallint, | |
ssh_url: string, | |
stargazers_url: string, | |
statuses_url: string, | |
subscribers_url: string, | |
subscription_url: string, | |
svn_url: string, | |
tags_url: string, | |
teams_url: string, | |
trees_url: string, | |
updated_at: timestamp, | |
url: string, | |
watchers: tinyint, | |
watchers_count: tinyint>, | |
sha: binary, | |
user: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>>, | |
body: string, | |
changed_files: tinyint, | |
closed_at: timestamp, | |
comments: tinyint, | |
comments_url: string, | |
commits: tinyint, | |
commits_url: string, | |
created_at: timestamp, | |
deletions: tinyint, | |
diff_url: string, | |
head: struct < | |
label: string, | |
ref: string, | |
repo: struct < | |
archive_url: string, | |
assignees_url: string, | |
blobs_url: string, | |
branches_url: string, | |
clone_url: string, | |
collaborators_url: string, | |
comments_url: string, | |
commits_url: string, | |
compare_url: string, | |
contents_url: string, | |
contributors_url: string, | |
created_at: timestamp, | |
default_branch: string, | |
description: string, | |
downloads_url: string, | |
events_url: string, | |
fork: boolean, | |
forks: tinyint, | |
forks_count: tinyint, | |
forks_url: string, | |
full_name: string, | |
git_commits_url: string, | |
git_refs_url: string, | |
git_tags_url: string, | |
git_url: string, | |
has_downloads: boolean, | |
has_issues: boolean, | |
has_wiki: boolean, | |
homepage: string, | |
hooks_url: string, | |
html_url: string, | |
id: int, | |
issue_comment_url: string, | |
issue_events_url: string, | |
issues_url: string, | |
keys_url: string, | |
labels_url: string, | |
language: string, | |
languages_url: string, | |
master_branch: string, | |
merges_url: string, | |
milestones_url: string, | |
mirror_url: string, | |
name: string, | |
notifications_url: string, | |
open_issues: tinyint, | |
open_issues_count: tinyint, | |
owner: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>, | |
private: boolean, | |
pulls_url: string, | |
pushed_at: timestamp, | |
size: smallint, | |
ssh_url: string, | |
stargazers_url: string, | |
statuses_url: string, | |
subscribers_url: string, | |
subscription_url: string, | |
svn_url: string, | |
tags_url: string, | |
teams_url: string, | |
trees_url: string, | |
updated_at: timestamp, | |
url: string, | |
watchers: tinyint, | |
watchers_count: tinyint>, | |
sha: binary, | |
user: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>>, | |
html_url: string, | |
id: int, | |
issue_url: string, | |
merge_commit_sha: binary, | |
mergeable: boolean, | |
mergeable_state: string, | |
merged: boolean, | |
merged_at: timestamp, | |
merged_by: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>, | |
milestone: struct < | |
closed_issues: tinyint, | |
created_at: timestamp, | |
creator: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>, | |
description: string, | |
due_on: timestamp, | |
id: int, | |
labels_url: string, | |
number: tinyint, | |
open_issues: tinyint, | |
state: string, | |
title: string, | |
updated_at: timestamp, | |
url: string>, | |
number: tinyint, | |
patch_url: string, | |
review_comment_url: string, | |
review_comments: tinyint, | |
review_comments_url: string, | |
state: string, | |
title: string, | |
updated_at: timestamp, | |
url: string, | |
user: struct < | |
avatar_url: string, | |
events_url: string, | |
followers_url: string, | |
following_url: string, | |
gists_url: string, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
organizations_url: string, | |
received_events_url: string, | |
repos_url: string, | |
starred_url: string, | |
subscriptions_url: string, | |
type: string, | |
url: string>>, | |
ref: string, | |
ref_type: string, | |
shas: array <array <uniontype <boolean,string,timestamp>>>, | |
size: tinyint, | |
target: struct < | |
followers: tinyint, | |
gravatar_id: binary, | |
id: int, | |
login: string, | |
repos: tinyint>, | |
url: string>, | |
public: boolean, | |
repository: struct < | |
created_at: timestamp, | |
description: string, | |
fork: boolean, | |
forks: tinyint, | |
has_downloads: boolean, | |
has_issues: boolean, | |
has_wiki: boolean, | |
homepage: string, | |
id: int, | |
integrate_branch: string, | |
language: string, | |
master_branch: string, | |
name: string, | |
open_issues: tinyint, | |
organization: string, | |
owner: string, | |
private: boolean, | |
pushed_at: timestamp, | |
size: smallint, | |
stargazers: tinyint, | |
url: string, | |
watchers: tinyint>, | |
type: string, | |
url: string | |
) |
The code I used is posted here: https://github.com/hortonworks/hive-json
Ok, the code has been incorporated into the ORC tools.
JSON Schema discovery tool documentation
JSON Schema discovery tool code
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This was discovered by looking at the Jan 2013 logs.
events: 5365969
gzip json: 1.6GB
json: 8.3GB