Skip to content

Instantly share code, notes, and snippets.

@hrz6976
Created May 9, 2023 20:25
Show Gist options
  • Save hrz6976/de31f0549c4e18c543f30b4f36662e22 to your computer and use it in GitHub Desktop.
Save hrz6976/de31f0549c4e18c543f30b4f36662e22 to your computer and use it in GitHub Desktop.
Import ghtorrent dataset to clickhouse (clickhouse-client --password <PASSWORD> --send-timeout=3600 --receive-timeout=600 < clickhouse_ghtorrent.sql
SET input_format_null_as_default = 1;
SET input_format_allow_errors_num = 100000;
SET input_format_allow_errors_ratio = 0.1;
SET date_time_input_format = 'best_effort';
CREATE DATABASE IF NOT EXISTS `ghtorrent`;
DROP TABLE IF EXISTS `ghtorrent`.`organization_members` ;
CREATE TABLE `ghtorrent`.`organization_members` (
`org_id` Int32,
`user_id` Int32,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `org_id` ORDER BY (`org_id`, `user_id`) SETTINGS allow_nullable_key =1;
-- import from organization_members format CSV
INSERT INTO `ghtorrent`.`organization_members` FROM INFILE 'organization_members.csv' FORMAT CSV;
-- create primary key org_id, ORDER BY (org_id, user_id)
OPTIMIZE TABLE `ghtorrent`.`organization_members` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`users` ;
CREATE TABLE `ghtorrent`.`users` (
`id` Int32,
`login` String,
`company` Nullable(String),
`created_at` DateTime,
`type` String,
`fake` Int8,
`deleted` Int8,
`long` Nullable(String),
`lat` Nullable(String),
`country_code` Nullable(String),
`state` Nullable(String),
`city` Nullable(String),
`location` Nullable(String)
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `login`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from users format CSV
INSERT INTO `ghtorrent`.`users` FROM INFILE 'users.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, login, created_at)
OPTIMIZE TABLE `ghtorrent`.`users` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`project_languages` ;
CREATE TABLE `ghtorrent`.`project_languages` (
`project_id` Int32,
`language` Nullable(String),
`bytes` Nullable(Int32),
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `project_id` ORDER BY (`project_id`, `language`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from project_languages format CSV
INSERT INTO `ghtorrent`.`project_languages` FROM INFILE 'project_languages.csv' FORMAT CSV;
-- create primary key project_id, ORDER BY (project_id, language, created_at)
OPTIMIZE TABLE `ghtorrent`.`project_languages` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`project_topics` ;
CREATE TABLE `ghtorrent`.`project_topics` (
`project_id` Int32,
`topic_name` String,
`created_at` DateTime,
`deleted` Int8
)
ENGINE = MergeTree() PRIMARY KEY `project_id` ORDER BY (`project_id`, `topic_name`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from project_topics format CSV
INSERT INTO `ghtorrent`.`project_topics` FROM INFILE 'project_topics.csv' FORMAT CSV;
-- create primary key project_id, ORDER BY (project_id, topic_name, created_at)
OPTIMIZE TABLE `ghtorrent`.`project_topics` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`issue_comments` ;
CREATE TABLE `ghtorrent`.`issue_comments` (
`issue_id` Int32,
`user_id` Int32,
`comment_id` String,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `issue_id` ORDER BY (`issue_id`, `user_id`, `comment_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from issue_comments format CSV
INSERT INTO `ghtorrent`.`issue_comments` FROM INFILE 'issue_comments.csv' FORMAT CSV;
-- create primary key issue_id, ORDER BY (issue_id, user_id, comment_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`issue_comments` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`project_commits` ;
CREATE TABLE `ghtorrent`.`project_commits` (
`project_id` Int32,
`commit_id` Int32
)
ENGINE = MergeTree() PRIMARY KEY `project_id` ORDER BY (`project_id`, `commit_id`) SETTINGS allow_nullable_key = 1
;
-- import from project_commits format CSV
INSERT INTO `ghtorrent`.`project_commits` FROM INFILE 'project_commits.csv' FORMAT CSV;
-- create primary key project_id, ORDER BY (project_id, commit_id)
OPTIMIZE TABLE `ghtorrent`.`project_commits` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`projects` ;
CREATE TABLE IF NOT EXISTS `ghtorrent`.`projects` (
`id` Int32,
`url` Nullable(String),
`owner_id` Nullable(Int32),
`name` String,
`description` Nullable(String),
`language` Nullable(String),
`created_at` DateTime,
`forked_from` Nullable(Int32),
`deleted` Int8,
`updated_at` DateTime,
`forked_commit_id` Nullable(Int32)
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `name`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from projects format CSV
INSERT INTO `ghtorrent`.`projects` FROM INFILE 'projects.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, name, created_at)
OPTIMIZE TABLE `ghtorrent`.`projects` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`commit_parents` ;
CREATE TABLE `ghtorrent`.`commit_parents` (
`commit_id` Int32,
`parent_id` Int32
)
ENGINE = MergeTree() PRIMARY KEY `commit_id` ORDER BY (`commit_id`, `parent_id`) SETTINGS allow_nullable_key = 1
;
-- import from commit_parents format CSV
INSERT INTO `ghtorrent`.`commit_parents` FROM INFILE 'commit_parents.csv' FORMAT CSV;
-- create primary key commit_id, ORDER BY (commit_id, parent_id)
OPTIMIZE TABLE `ghtorrent`.`commit_parents` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`commits` ;
CREATE TABLE `ghtorrent`.`commits` (
`id` Int32,
`sha` Nullable(String),
`author_id` Nullable(Int32),
`committer_id` Nullable(Int32),
`project_id` Nullable(Int32),
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `sha`, `created_at`) SETTINGS allow_nullable_key = 1;
-- import from commits format CSV
INSERT INTO `ghtorrent`.`commits` FROM INFILE 'commits.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, sha, created_at)
OPTIMIZE TABLE `ghtorrent`.`commits` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`pull_request_history` ;
CREATE TABLE `ghtorrent`.`pull_request_history` (
`id` Int32,
`pull_request_id` Int32,
`created_at` DateTime,
`action` String,
`actor_id` Nullable(Int32)
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `pull_request_id`, `created_at`) SETTINGS allow_nullable_key = 1;
;
-- import from pull_request_history format CSV
INSERT INTO `ghtorrent`.`pull_request_history` FROM INFILE 'pull_request_history.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, pull_request_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`pull_request_history` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`followers` ;
CREATE TABLE `ghtorrent`.`followers` (
`follower_id` Int32,
`user_id` Int32,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `follower_id` ORDER BY (`follower_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from followers format CSV
INSERT INTO `ghtorrent`.`followers` FROM INFILE 'followers.csv' FORMAT CSV;
-- create primary key follower_id, ORDER BY (follower_id, user_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`followers` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`commit_comments` ;
CREATE TABLE `ghtorrent`.`commit_comments` (
`id` Int32,
`commit_id` Int32,
`user_id` Int32,
`body` Nullable(String),
`line` Nullable(Int32),
`position` Nullable(Int32),
`comment_id` Int32,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `commit_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from commit_comments format CSV
INSERT INTO `ghtorrent`.`commit_comments` FROM INFILE 'commit_comments.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, commit_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`commit_comments` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`issue_labels` ;
CREATE TABLE `ghtorrent`.`issue_labels` (
`label_id` Int32,
`issue_id` Int32
)
ENGINE = MergeTree() PRIMARY KEY `label_id` ORDER BY (`label_id`, `issue_id`) SETTINGS allow_nullable_key = 1
;
-- import from issue_labels format CSV
INSERT INTO `ghtorrent`.`issue_labels` FROM INFILE 'issue_labels.csv' FORMAT CSV;
-- create primary key label_id, ORDER BY (label_id, issue_id)
OPTIMIZE TABLE `ghtorrent`.`issue_labels` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`project_members` ;
CREATE TABLE `ghtorrent`.`project_members` (
`repo_id` Int32,
`user_id` Int32,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `repo_id` ORDER BY (`repo_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from project_members format CSV
INSERT INTO `ghtorrent`.`project_members` FROM INFILE 'project_members.csv' FORMAT CSV;
-- create primary key repo_id, ORDER BY (repo_id, user_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`project_members` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`pull_request_commits` ;
CREATE TABLE `ghtorrent`.`pull_request_commits` (
`pull_request_id` Int32,
`commit_id` Int32
)
ENGINE = MergeTree() PRIMARY KEY `pull_request_id` ORDER BY (`pull_request_id`, `commit_id`) SETTINGS allow_nullable_key = 1
;
-- import from pull_request_commits format CSV
INSERT INTO `ghtorrent`.`pull_request_commits` FROM INFILE 'pull_request_commits.csv' FORMAT CSV;
-- create primary key pull_request_id, ORDER BY (pull_request_id, commit_id)
OPTIMIZE TABLE `ghtorrent`.`pull_request_commits` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`repo_milestones` ;
CREATE TABLE `ghtorrent`.`repo_milestones` (
`id` Int32,
`repo_id` Nullable(Int32),
`name` String
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `repo_id`) SETTINGS allow_nullable_key = 1
;
-- import from repo_milestones format CSV
INSERT INTO `ghtorrent`.`repo_milestones` FROM INFILE 'repo_milestones.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, repo_id)
OPTIMIZE TABLE `ghtorrent`.`repo_milestones` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`pull_request_comments` ;
CREATE TABLE `ghtorrent`.`pull_request_comments` (
`pull_request_id` Int32,
`user_id` Int32,
`comment_id` String,
`position` Nullable(Int32),
`body` Nullable(String),
`commit_id` Int32,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `pull_request_id` ORDER BY (`pull_request_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from pull_request_comments format CSV
INSERT INTO `ghtorrent`.`pull_request_comments` FROM INFILE 'pull_request_comments.csv' FORMAT CSV;
-- create primary key pull_request_id, ORDER BY (pull_request_id, user_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`pull_request_comments` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`watchers` ;
CREATE TABLE `ghtorrent`.`watchers` (
`repo_id` Int32,
`user_id` Int32,
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `repo_id` ORDER BY (`repo_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from watchers format CSV
INSERT INTO `ghtorrent`.`watchers` FROM INFILE 'watchers.csv' FORMAT CSV;
-- create primary key repo_id, ORDER BY (repo_id, user_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`watchers` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`repo_labels` ;
CREATE TABLE `ghtorrent`.`repo_labels` (
`id` Int32,
`repo_id` Nullable(Int32),
`name` String
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `repo_id`) SETTINGS allow_nullable_key = 1
;
-- import from repo_labels format CSV
INSERT INTO `ghtorrent`.`repo_labels` FROM INFILE 'repo_labels.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, repo_id)
OPTIMIZE TABLE `ghtorrent`.`repo_labels` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`issue_events` ;
CREATE TABLE `ghtorrent`.`issue_events` (
`event_id` String,
`issue_id` Int32,
`actor_id` Int32,
`action` String,
`action_specific` Nullable(String),
`created_at` DateTime
)
ENGINE = MergeTree() PRIMARY KEY `event_id` ORDER BY (`event_id`, `issue_id`, `actor_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from issue_events format CSV
INSERT INTO `ghtorrent`.`issue_events` FROM INFILE 'issue_events.csv' FORMAT CSV;
-- create primary key event_id, ORDER BY (event_id, issue_id, actor_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`issue_events` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`pull_requests` ;
CREATE TABLE `ghtorrent`.`pull_requests` (
`id` Int32,
`head_repo_id` Nullable(Int32),
`base_repo_id` Int32,
`head_commit_id` Nullable(Int32),
`base_commit_id` Int32,
`pullreq_id` Int32,
`intra_branch` Int8
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `head_repo_id`, `base_repo_id`) SETTINGS allow_nullable_key = 1
;
-- import from pull_requests format CSV
INSERT INTO `ghtorrent`.`pull_requests` FROM INFILE 'pull_requests.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, head_repo_id, base_repo_id)
OPTIMIZE TABLE `ghtorrent`.`pull_requests` FINAL;
DROP TABLE IF EXISTS `ghtorrent`.`issues` ;
CREATE TABLE `ghtorrent`.`issues` (
`id` Int32,
`repo_id` Nullable(Int32),
`reporter_id` Nullable(Int32),
`assignee_id` Nullable(Int32),
`pull_request` Int8,
`pull_request_id` Nullable(Int32),
`created_at` DateTime,
`issue_id` Int32
)
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `repo_id`, `reporter_id`, `assignee_id`, `created_at`) SETTINGS allow_nullable_key = 1
;
-- import from issues format CSV
INSERT INTO `ghtorrent`.`issues` FROM INFILE 'issues.csv' FORMAT CSV;
-- create primary key id, ORDER BY (id, repo_id, reporter_id, assignee_id, created_at)
OPTIMIZE TABLE `ghtorrent`.`issues` FINAL;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment