Skip to content

Instantly share code, notes, and snippets.

@Jong-Sig
Created June 16, 2024 20:23
Show Gist options
  • Save Jong-Sig/b82f3a7a9001e10c93d1bbc6c72a8bd8 to your computer and use it in GitHub Desktop.
Save Jong-Sig/b82f3a7a9001e10c93d1bbc6c72a8bd8 to your computer and use it in GitHub Desktop.
Create SQL Table with a given programming languages (via BigQuery)
/****************************
1. CREATE FULL SAMPLES
IN GIVEN LANGUAGES:
JavaScript, Python, Java, PHP, Ruby, Go, C++, TypeScript, C#, C
****************************/
-- ** Create table of repo IDs of repositories from given languages
CREATE OR REPLACE TABLE `github-416320.github_full.github_sample`
OPTIONS (description = "Unique Samples from GitHub Archive of Top 10 Programming Languages")
AS
SELECT DISTINCT repo.id as RepoID,
repo.name as RepoName,
JSON_VALUE(payload, '$.pull_request.base.repo.language') as PRRepoLanguage
FROM `githubarchive.year.20*`
WHERE
-- ** years between 2015 and 2023
_TABLE_SUFFIX IN ('15', '16', '17', '18', '19', '20', '21', '22', '23')
-- ** Identify languages from PR event
AND type = 'PullRequestEvent'
-- ** Languages in top 10 (source: https://madnight.github.io/githut/#/pull_requests/2023/1)
AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.base.repo.language') IN ('JavaScript', 'Python', 'Java', 'PHP', 'Ruby', 'Go', 'C++', 'TypeScript', 'C#', 'C')
ORDER BY PRRepoLanguage, RepoID;
/****************************
2. CREATE MAIN TABLES WITH RAW DATA
BASED ON TREATED SAMPLES
****************************/
-- ** Create table of raw data of trending repositories
CREATE OR REPLACE TABLE `github-416320.github_full.github_sample_data`
OPTIONS (description = "Raw Data from GitHub Archive of Top 10 Programming Languages")
AS
-- ** Remove duplicated rows
SELECT k.*
FROM (
SELECT ARRAY_AGG(x LIMIT 1)[OFFSET(0)] k
FROM `githubarchive.year.20*` x
WHERE
-- ** years between 2015 and 2023
_TABLE_SUFFIX IN ('15', '16', '17', '18', '19', '20', '21', '22', '23')
-- ** All repos in top 10 languages
AND repo.id IN
(SELECT DISTINCT RepoID
FROM `github-416320.github_full.github_sample`)
GROUP BY id
)
ORDER BY created_at, repo.id;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment