Created
May 9, 2023 20:25
-
-
Save hrz6976/de31f0549c4e18c543f30b4f36662e22 to your computer and use it in GitHub Desktop.
Import ghtorrent dataset to clickhouse (clickhouse-client --password <PASSWORD> --send-timeout=3600 --receive-timeout=600 < clickhouse_ghtorrent.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET input_format_null_as_default = 1; | |
SET input_format_allow_errors_num = 100000; | |
SET input_format_allow_errors_ratio = 0.1; | |
SET date_time_input_format = 'best_effort'; | |
CREATE DATABASE IF NOT EXISTS `ghtorrent`; | |
DROP TABLE IF EXISTS `ghtorrent`.`organization_members` ; | |
CREATE TABLE `ghtorrent`.`organization_members` ( | |
`org_id` Int32, | |
`user_id` Int32, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `org_id` ORDER BY (`org_id`, `user_id`) SETTINGS allow_nullable_key =1; | |
-- import from organization_members format CSV | |
INSERT INTO `ghtorrent`.`organization_members` FROM INFILE 'organization_members.csv' FORMAT CSV; | |
-- create primary key org_id, ORDER BY (org_id, user_id) | |
OPTIMIZE TABLE `ghtorrent`.`organization_members` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`users` ; | |
CREATE TABLE `ghtorrent`.`users` ( | |
`id` Int32, | |
`login` String, | |
`company` Nullable(String), | |
`created_at` DateTime, | |
`type` String, | |
`fake` Int8, | |
`deleted` Int8, | |
`long` Nullable(String), | |
`lat` Nullable(String), | |
`country_code` Nullable(String), | |
`state` Nullable(String), | |
`city` Nullable(String), | |
`location` Nullable(String) | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `login`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from users format CSV | |
INSERT INTO `ghtorrent`.`users` FROM INFILE 'users.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, login, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`users` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`project_languages` ; | |
CREATE TABLE `ghtorrent`.`project_languages` ( | |
`project_id` Int32, | |
`language` Nullable(String), | |
`bytes` Nullable(Int32), | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `project_id` ORDER BY (`project_id`, `language`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from project_languages format CSV | |
INSERT INTO `ghtorrent`.`project_languages` FROM INFILE 'project_languages.csv' FORMAT CSV; | |
-- create primary key project_id, ORDER BY (project_id, language, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`project_languages` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`project_topics` ; | |
CREATE TABLE `ghtorrent`.`project_topics` ( | |
`project_id` Int32, | |
`topic_name` String, | |
`created_at` DateTime, | |
`deleted` Int8 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `project_id` ORDER BY (`project_id`, `topic_name`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from project_topics format CSV | |
INSERT INTO `ghtorrent`.`project_topics` FROM INFILE 'project_topics.csv' FORMAT CSV; | |
-- create primary key project_id, ORDER BY (project_id, topic_name, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`project_topics` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`issue_comments` ; | |
CREATE TABLE `ghtorrent`.`issue_comments` ( | |
`issue_id` Int32, | |
`user_id` Int32, | |
`comment_id` String, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `issue_id` ORDER BY (`issue_id`, `user_id`, `comment_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from issue_comments format CSV | |
INSERT INTO `ghtorrent`.`issue_comments` FROM INFILE 'issue_comments.csv' FORMAT CSV; | |
-- create primary key issue_id, ORDER BY (issue_id, user_id, comment_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`issue_comments` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`project_commits` ; | |
CREATE TABLE `ghtorrent`.`project_commits` ( | |
`project_id` Int32, | |
`commit_id` Int32 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `project_id` ORDER BY (`project_id`, `commit_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from project_commits format CSV | |
INSERT INTO `ghtorrent`.`project_commits` FROM INFILE 'project_commits.csv' FORMAT CSV; | |
-- create primary key project_id, ORDER BY (project_id, commit_id) | |
OPTIMIZE TABLE `ghtorrent`.`project_commits` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`projects` ; | |
CREATE TABLE IF NOT EXISTS `ghtorrent`.`projects` ( | |
`id` Int32, | |
`url` Nullable(String), | |
`owner_id` Nullable(Int32), | |
`name` String, | |
`description` Nullable(String), | |
`language` Nullable(String), | |
`created_at` DateTime, | |
`forked_from` Nullable(Int32), | |
`deleted` Int8, | |
`updated_at` DateTime, | |
`forked_commit_id` Nullable(Int32) | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `name`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from projects format CSV | |
INSERT INTO `ghtorrent`.`projects` FROM INFILE 'projects.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, name, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`projects` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`commit_parents` ; | |
CREATE TABLE `ghtorrent`.`commit_parents` ( | |
`commit_id` Int32, | |
`parent_id` Int32 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `commit_id` ORDER BY (`commit_id`, `parent_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from commit_parents format CSV | |
INSERT INTO `ghtorrent`.`commit_parents` FROM INFILE 'commit_parents.csv' FORMAT CSV; | |
-- create primary key commit_id, ORDER BY (commit_id, parent_id) | |
OPTIMIZE TABLE `ghtorrent`.`commit_parents` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`commits` ; | |
CREATE TABLE `ghtorrent`.`commits` ( | |
`id` Int32, | |
`sha` Nullable(String), | |
`author_id` Nullable(Int32), | |
`committer_id` Nullable(Int32), | |
`project_id` Nullable(Int32), | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `sha`, `created_at`) SETTINGS allow_nullable_key = 1; | |
-- import from commits format CSV | |
INSERT INTO `ghtorrent`.`commits` FROM INFILE 'commits.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, sha, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`commits` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`pull_request_history` ; | |
CREATE TABLE `ghtorrent`.`pull_request_history` ( | |
`id` Int32, | |
`pull_request_id` Int32, | |
`created_at` DateTime, | |
`action` String, | |
`actor_id` Nullable(Int32) | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `pull_request_id`, `created_at`) SETTINGS allow_nullable_key = 1; | |
; | |
-- import from pull_request_history format CSV | |
INSERT INTO `ghtorrent`.`pull_request_history` FROM INFILE 'pull_request_history.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, pull_request_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`pull_request_history` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`followers` ; | |
CREATE TABLE `ghtorrent`.`followers` ( | |
`follower_id` Int32, | |
`user_id` Int32, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `follower_id` ORDER BY (`follower_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from followers format CSV | |
INSERT INTO `ghtorrent`.`followers` FROM INFILE 'followers.csv' FORMAT CSV; | |
-- create primary key follower_id, ORDER BY (follower_id, user_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`followers` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`commit_comments` ; | |
CREATE TABLE `ghtorrent`.`commit_comments` ( | |
`id` Int32, | |
`commit_id` Int32, | |
`user_id` Int32, | |
`body` Nullable(String), | |
`line` Nullable(Int32), | |
`position` Nullable(Int32), | |
`comment_id` Int32, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `commit_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from commit_comments format CSV | |
INSERT INTO `ghtorrent`.`commit_comments` FROM INFILE 'commit_comments.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, commit_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`commit_comments` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`issue_labels` ; | |
CREATE TABLE `ghtorrent`.`issue_labels` ( | |
`label_id` Int32, | |
`issue_id` Int32 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `label_id` ORDER BY (`label_id`, `issue_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from issue_labels format CSV | |
INSERT INTO `ghtorrent`.`issue_labels` FROM INFILE 'issue_labels.csv' FORMAT CSV; | |
-- create primary key label_id, ORDER BY (label_id, issue_id) | |
OPTIMIZE TABLE `ghtorrent`.`issue_labels` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`project_members` ; | |
CREATE TABLE `ghtorrent`.`project_members` ( | |
`repo_id` Int32, | |
`user_id` Int32, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `repo_id` ORDER BY (`repo_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from project_members format CSV | |
INSERT INTO `ghtorrent`.`project_members` FROM INFILE 'project_members.csv' FORMAT CSV; | |
-- create primary key repo_id, ORDER BY (repo_id, user_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`project_members` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`pull_request_commits` ; | |
CREATE TABLE `ghtorrent`.`pull_request_commits` ( | |
`pull_request_id` Int32, | |
`commit_id` Int32 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `pull_request_id` ORDER BY (`pull_request_id`, `commit_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from pull_request_commits format CSV | |
INSERT INTO `ghtorrent`.`pull_request_commits` FROM INFILE 'pull_request_commits.csv' FORMAT CSV; | |
-- create primary key pull_request_id, ORDER BY (pull_request_id, commit_id) | |
OPTIMIZE TABLE `ghtorrent`.`pull_request_commits` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`repo_milestones` ; | |
CREATE TABLE `ghtorrent`.`repo_milestones` ( | |
`id` Int32, | |
`repo_id` Nullable(Int32), | |
`name` String | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `repo_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from repo_milestones format CSV | |
INSERT INTO `ghtorrent`.`repo_milestones` FROM INFILE 'repo_milestones.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, repo_id) | |
OPTIMIZE TABLE `ghtorrent`.`repo_milestones` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`pull_request_comments` ; | |
CREATE TABLE `ghtorrent`.`pull_request_comments` ( | |
`pull_request_id` Int32, | |
`user_id` Int32, | |
`comment_id` String, | |
`position` Nullable(Int32), | |
`body` Nullable(String), | |
`commit_id` Int32, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `pull_request_id` ORDER BY (`pull_request_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from pull_request_comments format CSV | |
INSERT INTO `ghtorrent`.`pull_request_comments` FROM INFILE 'pull_request_comments.csv' FORMAT CSV; | |
-- create primary key pull_request_id, ORDER BY (pull_request_id, user_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`pull_request_comments` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`watchers` ; | |
CREATE TABLE `ghtorrent`.`watchers` ( | |
`repo_id` Int32, | |
`user_id` Int32, | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `repo_id` ORDER BY (`repo_id`, `user_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from watchers format CSV | |
INSERT INTO `ghtorrent`.`watchers` FROM INFILE 'watchers.csv' FORMAT CSV; | |
-- create primary key repo_id, ORDER BY (repo_id, user_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`watchers` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`repo_labels` ; | |
CREATE TABLE `ghtorrent`.`repo_labels` ( | |
`id` Int32, | |
`repo_id` Nullable(Int32), | |
`name` String | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `repo_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from repo_labels format CSV | |
INSERT INTO `ghtorrent`.`repo_labels` FROM INFILE 'repo_labels.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, repo_id) | |
OPTIMIZE TABLE `ghtorrent`.`repo_labels` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`issue_events` ; | |
CREATE TABLE `ghtorrent`.`issue_events` ( | |
`event_id` String, | |
`issue_id` Int32, | |
`actor_id` Int32, | |
`action` String, | |
`action_specific` Nullable(String), | |
`created_at` DateTime | |
) | |
ENGINE = MergeTree() PRIMARY KEY `event_id` ORDER BY (`event_id`, `issue_id`, `actor_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from issue_events format CSV | |
INSERT INTO `ghtorrent`.`issue_events` FROM INFILE 'issue_events.csv' FORMAT CSV; | |
-- create primary key event_id, ORDER BY (event_id, issue_id, actor_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`issue_events` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`pull_requests` ; | |
CREATE TABLE `ghtorrent`.`pull_requests` ( | |
`id` Int32, | |
`head_repo_id` Nullable(Int32), | |
`base_repo_id` Int32, | |
`head_commit_id` Nullable(Int32), | |
`base_commit_id` Int32, | |
`pullreq_id` Int32, | |
`intra_branch` Int8 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `head_repo_id`, `base_repo_id`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from pull_requests format CSV | |
INSERT INTO `ghtorrent`.`pull_requests` FROM INFILE 'pull_requests.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, head_repo_id, base_repo_id) | |
OPTIMIZE TABLE `ghtorrent`.`pull_requests` FINAL; | |
DROP TABLE IF EXISTS `ghtorrent`.`issues` ; | |
CREATE TABLE `ghtorrent`.`issues` ( | |
`id` Int32, | |
`repo_id` Nullable(Int32), | |
`reporter_id` Nullable(Int32), | |
`assignee_id` Nullable(Int32), | |
`pull_request` Int8, | |
`pull_request_id` Nullable(Int32), | |
`created_at` DateTime, | |
`issue_id` Int32 | |
) | |
ENGINE = MergeTree() PRIMARY KEY `id` ORDER BY (`id`, `repo_id`, `reporter_id`, `assignee_id`, `created_at`) SETTINGS allow_nullable_key = 1 | |
; | |
-- import from issues format CSV | |
INSERT INTO `ghtorrent`.`issues` FROM INFILE 'issues.csv' FORMAT CSV; | |
-- create primary key id, ORDER BY (id, repo_id, reporter_id, assignee_id, created_at) | |
OPTIMIZE TABLE `ghtorrent`.`issues` FINAL; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment