Created
June 16, 2024 20:23
-
-
Save Jong-Sig/b82f3a7a9001e10c93d1bbc6c72a8bd8 to your computer and use it in GitHub Desktop.
Create SQL Table with a given programming languages (via BigQuery)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**************************** | |
1. CREATE FULL SAMPLES | |
IN GIVEN LANGUAGES: | |
JavaScript, Python, Java, PHP, Ruby, Go, C++, TypeScript, C#, C | |
****************************/ | |
-- ** Create table of repo IDs of repositories from given languages | |
CREATE OR REPLACE TABLE `github-416320.github_full.github_sample` | |
OPTIONS (description = "Unique Samples from GitHub Archive of Top 10 Programming Languages") | |
AS | |
SELECT DISTINCT repo.id as RepoID, | |
repo.name as RepoName, | |
JSON_VALUE(payload, '$.pull_request.base.repo.language') as PRRepoLanguage | |
FROM `githubarchive.year.20*` | |
WHERE | |
-- ** years between 2015 and 2023 | |
_TABLE_SUFFIX IN ('15', '16', '17', '18', '19', '20', '21', '22', '23') | |
-- ** Identify languages from PR event | |
AND type = 'PullRequestEvent' | |
-- ** Languages in top 10 (source: https://madnight.github.io/githut/#/pull_requests/2023/1) | |
AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.base.repo.language') IN ('JavaScript', 'Python', 'Java', 'PHP', 'Ruby', 'Go', 'C++', 'TypeScript', 'C#', 'C') | |
ORDER BY PRRepoLanguage, RepoID; | |
/**************************** | |
2. CREATE MAIN TABLES WITH RAW DATA | |
BASED ON TREATED SAMPLES | |
****************************/ | |
-- ** Create table of raw data of trending repositories | |
CREATE OR REPLACE TABLE `github-416320.github_full.github_sample_data` | |
OPTIONS (description = "Raw Data from GitHub Archive of Top 10 Programming Languages") | |
AS | |
-- ** Remove duplicated rows | |
SELECT k.* | |
FROM ( | |
SELECT ARRAY_AGG(x LIMIT 1)[OFFSET(0)] k | |
FROM `githubarchive.year.20*` x | |
WHERE | |
-- ** years between 2015 and 2023 | |
_TABLE_SUFFIX IN ('15', '16', '17', '18', '19', '20', '21', '22', '23') | |
-- ** All repos in top 10 languages | |
AND repo.id IN | |
(SELECT DISTINCT RepoID | |
FROM `github-416320.github_full.github_sample`) | |
GROUP BY id | |
) | |
ORDER BY created_at, repo.id; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment