Skip to content

Instantly share code, notes, and snippets.

@davidlaprade
Last active July 23, 2022 16:31
Show Gist options
  • Save davidlaprade/cda5775d39c4f6b42b149d1d01111c74 to your computer and use it in GitHub Desktop.
Save davidlaprade/cda5775d39c4f6b42b149d1d01111c74 to your computer and use it in GitHub Desktop.
Google BigQuery of Open Source Github Repos to Determine Average Indentation Per line
-- Original query can be found here:
-- https://console.cloud.google.com/bigquery?sq=226172199733:c6cfab4e7c6b49e791513d3229f9ddf4
-- CAVEATS:
-- * This query is quite expensive to run on the full public dataset, ~$13. If
-- you are just interested in trying it out, I recommend commenting out the
-- first subquery and replacing with the version below, which uses a much
-- smaller sample table.
-- * This includes indentation on code comments, which should be fine as comments
-- usually inherit the indentation of the code context
WITH cleaned_contents as (
SELECT
c.id as id,
-- cleaned_content == content with standardized indentation and line break chars
REGEXP_REPLACE(
REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation
r'(\r|\v)', -- standardize all line breaks
'\n'
) cleaned_content,
REGEXP_SUBSTR(f.path, r'\.\w+$') as language
FROM bigquery-public-data.github_repos.contents c
JOIN bigquery-public-data.github_repos.files f ON c.id = f.id
WHERE 0=0
and not c.binary
-- non-text files (e.g. images) have no content but have sizes > 0
and c.content is not null
-- exclude files that are really large, as they likely weren't hand written
and c.size < 15000
-- the file needs to have a non-trivial amount of content
and c.size > 100
and REGEXP_SUBSTR(f.path, r'\.\w+$') in (
-- file extensions for the most popular languages
'.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx',
'.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html',
'.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc',
'.scala', '.ps1'
)
-- Comment out the query above and replace with this query if you want to
-- test against a *much* smaller data set
--
-- SELECT
-- id,
-- REGEXP_REPLACE(
-- REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation
-- r'(\r|\v)', -- standardize all line breaks
-- '\n'
-- ) cleaned_content,
-- REGEXP_SUBSTR(sample_path, r'\.\w+$') as language
-- FROM bigquery-public-data.github_repos.sample_contents
-- WHERE 0=0
-- and not binary
-- -- non-text files (e.g. images) have no content but have sizes > 0
-- and content is not null
-- -- exclude files that are really large, as they likely weren't hand written
-- and size < 15000
-- -- the file needs to have a non-trivial amount of content
-- and size > 100
-- and REGEXP_SUBSTR(sample_path, r'\.\w+$') in (
-- -- file extensions for the most popular languages
-- '.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx',
-- '.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html',
-- '.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc',
-- '.scala', '.ps1'
-- )
), cleaned_lines as (
SELECT
id,
(CASE language
WHEN '.py' then 'python'
WHEN '.ts' then 'javascript'
WHEN '.js' then 'javascript'
WHEN '.rb' then 'ruby'
WHEN '.sh' then 'shell'
WHEN '.r' then 'r'
WHEN '.sql' then 'sql'
WHEN '.swift' then 'swift'
WHEN '.php' then 'php'
WHEN '.java' then 'java'
WHEN '.cs' then 'c#'
WHEN '.cpp' then 'c++'
WHEN '.cc' then 'c++'
WHEN '.cxx' then 'c++'
WHEN '.c' then 'c'
WHEN '.h' then 'objective c'
WHEN '.m' then 'objective c'
WHEN '.html' then 'html'
WHEN '.css' then 'css'
WHEN '.scss' then 'css'
WHEN '.sass' then 'css'
WHEN '.less' then 'css'
WHEN '.rs' then 'rust'
WHEN '.rlib' then 'rust'
WHEN '.go' then 'go'
WHEN '.scala' then 'scala'
WHEN '.sc' then 'scala'
WHEN '.ps1' then 'powershell'
ELSE language
END) as language,
SPLIT(cleaned_content, '\n') as lines_of_code
FROM cleaned_contents
WHERE 0=0
-- There needs to be at least one line with leading whitespace, otherwise
-- we assume it was auto-generated.
and REGEXP_CONTAINS(cleaned_content, r'\n\s+')
), line_data as (
SELECT
id,
language,
line,
LENGTH(line) as line_length,
LENGTH(COALESCE(REGEXP_EXTRACT(line, r'\A\s+'), '')) as indentation,
FROM cleaned_lines
-- https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening_arrays
CROSS JOIN UNNEST(cleaned_lines.lines_of_code) AS line
WHERE 0=0
AND LENGTH(line) > 0 -- we don't want blank lines
AND LENGTH(line) < 300 -- a line that is 300+ char is likely autogenerated
), ranked_line_lengths as (
SELECT
language,
line_length,
ROW_NUMBER() OVER (
PARTITION BY language ORDER BY COUNT(line_length) DESC
) as line_length_ranking,
FROM line_data
GROUP BY language, line_length
ORDER BY COUNT(line_length) DESC
), ranked_indentations as (
SELECT
language,
indentation,
ROW_NUMBER() OVER (
PARTITION BY language ORDER BY COUNT(indentation) DESC
) as indentation_ranking,
FROM line_data
GROUP BY language, indentation
ORDER BY COUNT(indentation) DESC
), averaged_data as (
SELECT
language,
ROUND(AVG(line_length), 1) as avg_line_length,
ROUND(AVG(indentation), 1) as avg_indentation,
APPROX_QUANTILES(line_length, 100)[OFFSET(50)] as median_line_length,
APPROX_QUANTILES(indentation, 100)[OFFSET(50)] as median_indentation,
APPROX_QUANTILES(line_length, 100)[OFFSET(99)] as line_length_99th_pcnt,
APPROX_QUANTILES(indentation, 100)[OFFSET(99)] as indentation_99th_pcnt,
COUNT(DISTINCT(line)) as lines_analyzed,
COUNT(DISTINCT(id)) as files_analyzed
FROM line_data
GROUP BY language
ORDER BY AVG(line_length) DESC
)
SELECT
data.*,
rl.line_length as mode_line_length,
ri.indentation as mode_indentation
FROM averaged_data data
JOIN ranked_indentations ri ON data.language = ri.language
JOIN ranked_line_lengths rl ON data.language = rl.language
WHERE 0=0
AND ri.indentation_ranking=1
AND rl.line_length_ranking=1
ORDER BY avg_line_length DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment