Last active
July 23, 2022 16:31
-
-
Save davidlaprade/cda5775d39c4f6b42b149d1d01111c74 to your computer and use it in GitHub Desktop.
Google BigQuery of Open Source Github Repos to Determine Average Indentation Per line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Original query can be found here: | |
-- https://console.cloud.google.com/bigquery?sq=226172199733:c6cfab4e7c6b49e791513d3229f9ddf4 | |
-- CAVEATS: | |
-- * This query is quite expensive to run on the full public dataset, ~$13. If | |
-- you are just interested in trying it out, I recommend commenting out the | |
-- first subquery and replacing with the version below, which uses a much | |
-- smaller sample table. | |
-- * This includes indentation on code comments, which should be fine as comments | |
-- usually inherit the indentation of the code context | |
WITH cleaned_contents as ( | |
SELECT | |
c.id as id, | |
-- cleaned_content == content with standardized indentation and line break chars | |
REGEXP_REPLACE( | |
REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation | |
r'(\r|\v)', -- standardize all line breaks | |
'\n' | |
) cleaned_content, | |
REGEXP_SUBSTR(f.path, r'\.\w+$') as language | |
FROM bigquery-public-data.github_repos.contents c | |
JOIN bigquery-public-data.github_repos.files f ON c.id = f.id | |
WHERE 0=0 | |
and not c.binary | |
-- non-text files (e.g. images) have no content but have sizes > 0 | |
and c.content is not null | |
-- exclude files that are really large, as they likely weren't hand written | |
and c.size < 15000 | |
-- the file needs to have a non-trivial amount of content | |
and c.size > 100 | |
and REGEXP_SUBSTR(f.path, r'\.\w+$') in ( | |
-- file extensions for the most popular languages | |
'.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx', | |
'.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html', | |
'.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc', | |
'.scala', '.ps1' | |
) | |
-- Comment out the query above and replace with this query if you want to | |
-- test against a *much* smaller data set | |
-- | |
-- SELECT | |
-- id, | |
-- REGEXP_REPLACE( | |
-- REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation | |
-- r'(\r|\v)', -- standardize all line breaks | |
-- '\n' | |
-- ) cleaned_content, | |
-- REGEXP_SUBSTR(sample_path, r'\.\w+$') as language | |
-- FROM bigquery-public-data.github_repos.sample_contents | |
-- WHERE 0=0 | |
-- and not binary | |
-- -- non-text files (e.g. images) have no content but have sizes > 0 | |
-- and content is not null | |
-- -- exclude files that are really large, as they likely weren't hand written | |
-- and size < 15000 | |
-- -- the file needs to have a non-trivial amount of content | |
-- and size > 100 | |
-- and REGEXP_SUBSTR(sample_path, r'\.\w+$') in ( | |
-- -- file extensions for the most popular languages | |
-- '.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx', | |
-- '.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html', | |
-- '.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc', | |
-- '.scala', '.ps1' | |
-- ) | |
), cleaned_lines as ( | |
SELECT | |
id, | |
(CASE language | |
WHEN '.py' then 'python' | |
WHEN '.ts' then 'javascript' | |
WHEN '.js' then 'javascript' | |
WHEN '.rb' then 'ruby' | |
WHEN '.sh' then 'shell' | |
WHEN '.r' then 'r' | |
WHEN '.sql' then 'sql' | |
WHEN '.swift' then 'swift' | |
WHEN '.php' then 'php' | |
WHEN '.java' then 'java' | |
WHEN '.cs' then 'c#' | |
WHEN '.cpp' then 'c++' | |
WHEN '.cc' then 'c++' | |
WHEN '.cxx' then 'c++' | |
WHEN '.c' then 'c' | |
WHEN '.h' then 'objective c' | |
WHEN '.m' then 'objective c' | |
WHEN '.html' then 'html' | |
WHEN '.css' then 'css' | |
WHEN '.scss' then 'css' | |
WHEN '.sass' then 'css' | |
WHEN '.less' then 'css' | |
WHEN '.rs' then 'rust' | |
WHEN '.rlib' then 'rust' | |
WHEN '.go' then 'go' | |
WHEN '.scala' then 'scala' | |
WHEN '.sc' then 'scala' | |
WHEN '.ps1' then 'powershell' | |
ELSE language | |
END) as language, | |
SPLIT(cleaned_content, '\n') as lines_of_code | |
FROM cleaned_contents | |
WHERE 0=0 | |
-- There needs to be at least one line with leading whitespace, otherwise | |
-- we assume it was auto-generated. | |
and REGEXP_CONTAINS(cleaned_content, r'\n\s+') | |
), line_data as ( | |
SELECT | |
id, | |
language, | |
line, | |
LENGTH(line) as line_length, | |
LENGTH(COALESCE(REGEXP_EXTRACT(line, r'\A\s+'), '')) as indentation, | |
FROM cleaned_lines | |
-- https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening_arrays | |
CROSS JOIN UNNEST(cleaned_lines.lines_of_code) AS line | |
WHERE 0=0 | |
AND LENGTH(line) > 0 -- we don't want blank lines | |
AND LENGTH(line) < 300 -- a line that is 300+ char is likely autogenerated | |
), ranked_line_lengths as ( | |
SELECT | |
language, | |
line_length, | |
ROW_NUMBER() OVER ( | |
PARTITION BY language ORDER BY COUNT(line_length) DESC | |
) as line_length_ranking, | |
FROM line_data | |
GROUP BY language, line_length | |
ORDER BY COUNT(line_length) DESC | |
), ranked_indentations as ( | |
SELECT | |
language, | |
indentation, | |
ROW_NUMBER() OVER ( | |
PARTITION BY language ORDER BY COUNT(indentation) DESC | |
) as indentation_ranking, | |
FROM line_data | |
GROUP BY language, indentation | |
ORDER BY COUNT(indentation) DESC | |
), averaged_data as ( | |
SELECT | |
language, | |
ROUND(AVG(line_length), 1) as avg_line_length, | |
ROUND(AVG(indentation), 1) as avg_indentation, | |
APPROX_QUANTILES(line_length, 100)[OFFSET(50)] as median_line_length, | |
APPROX_QUANTILES(indentation, 100)[OFFSET(50)] as median_indentation, | |
APPROX_QUANTILES(line_length, 100)[OFFSET(99)] as line_length_99th_pcnt, | |
APPROX_QUANTILES(indentation, 100)[OFFSET(99)] as indentation_99th_pcnt, | |
COUNT(DISTINCT(line)) as lines_analyzed, | |
COUNT(DISTINCT(id)) as files_analyzed | |
FROM line_data | |
GROUP BY language | |
ORDER BY AVG(line_length) DESC | |
) | |
SELECT | |
data.*, | |
rl.line_length as mode_line_length, | |
ri.indentation as mode_indentation | |
FROM averaged_data data | |
JOIN ranked_indentations ri ON data.language = ri.language | |
JOIN ranked_line_lengths rl ON data.language = rl.language | |
WHERE 0=0 | |
AND ri.indentation_ranking=1 | |
AND rl.line_length_ranking=1 | |
ORDER BY avg_line_length DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment