davidlaprade/indentation-per-line.sql

## indentation-per-line.sql
-- Original query can be found here:
-- https://console.cloud.google.com/bigquery?sq=226172199733:c6cfab4e7c6b49e791513d3229f9ddf4

-- CAVEATS:
-- * This query is quite expensive to run on the full public dataset, ~$13. If
--   you are just interested in trying it out, I recommend commenting out the
--   first subquery and replacing with the version below, which uses a much
--   smaller sample table.
-- * This includes indentation on code comments, which should be fine as comments
--   usually inherit the indentation of the code context

WITH cleaned_contents as (
  SELECT
    c.id as id,
    -- cleaned_content == content with standardized indentation and line break chars
    REGEXP_REPLACE(
      REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation
      r'(\r|\v)', -- standardize all line breaks
      '\n'
    ) cleaned_content,
    REGEXP_SUBSTR(f.path, r'\.\w+$') as language
  FROM bigquery-public-data.github_repos.contents c
  JOIN bigquery-public-data.github_repos.files f ON c.id = f.id
  WHERE 0=0
    and not c.binary
    -- non-text files (e.g. images) have no content but have sizes > 0
    and c.content is not null
    -- exclude files that are really large, as they likely weren't hand written
    and c.size < 15000
    -- the file needs to have a non-trivial amount of content
    and c.size > 100
    and REGEXP_SUBSTR(f.path, r'\.\w+$') in (
        -- file extensions for the most popular languages
          '.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx',
          '.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html',
          '.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc',
          '.scala', '.ps1'
    )
  -- Comment out the query above and replace with this query if you want to
  -- test against a *much* smaller data set
  --
  -- SELECT
  --   id,
  --   REGEXP_REPLACE(
  --     REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation
  --     r'(\r|\v)', -- standardize all line breaks
  --     '\n'
  --   ) cleaned_content,
  --   REGEXP_SUBSTR(sample_path, r'\.\w+$') as language
  -- FROM bigquery-public-data.github_repos.sample_contents
  -- WHERE 0=0
  --   and not binary
  --   -- non-text files (e.g. images) have no content but have sizes > 0
  --   and content is not null
  --   -- exclude files that are really large, as they likely weren't hand written
  --   and size < 15000
  --   -- the file needs to have a non-trivial amount of content
  --   and size > 100
  --   and REGEXP_SUBSTR(sample_path, r'\.\w+$') in (
  --     -- file extensions for the most popular languages
  --     '.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx',
  --     '.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html',
  --     '.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc',
  --     '.scala', '.ps1'
  --   )
), cleaned_lines as (
  SELECT
    id,
    (CASE language
      WHEN '.py' then 'python'
      WHEN '.ts' then 'javascript'
      WHEN '.js' then 'javascript'
      WHEN '.rb' then 'ruby'
      WHEN '.sh' then 'shell'
      WHEN '.r' then 'r'
      WHEN '.sql' then 'sql'
      WHEN '.swift' then 'swift'
      WHEN '.php' then 'php'
      WHEN '.java' then 'java'
      WHEN '.cs' then 'c#'
      WHEN '.cpp' then 'c++'
      WHEN '.cc' then 'c++'
      WHEN '.cxx' then 'c++'
      WHEN '.c' then 'c'
      WHEN '.h' then 'objective c'
      WHEN '.m' then 'objective c'
      WHEN '.html' then 'html'
      WHEN '.css' then 'css'
      WHEN '.scss' then 'css'
      WHEN '.sass' then 'css'
      WHEN '.less' then 'css'
      WHEN '.rs' then 'rust'
      WHEN '.rlib' then 'rust'
      WHEN '.go' then 'go'
      WHEN '.scala' then 'scala'
      WHEN '.sc' then 'scala'
      WHEN '.ps1' then 'powershell'
      ELSE language
      END) as language,
    SPLIT(cleaned_content, '\n') as lines_of_code
  FROM cleaned_contents
  WHERE 0=0
    -- There needs to be at least one line with leading whitespace, otherwise
    -- we assume it was auto-generated.
    and REGEXP_CONTAINS(cleaned_content, r'\n\s+')
), line_data as (
  SELECT
    id,
    language,
    line,
    LENGTH(line) as line_length,
    LENGTH(COALESCE(REGEXP_EXTRACT(line, r'\A\s+'), '')) as indentation,
  FROM cleaned_lines
  -- https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening_arrays
  CROSS JOIN UNNEST(cleaned_lines.lines_of_code) AS line
  WHERE 0=0
    AND LENGTH(line) > 0 -- we don't want blank lines
    AND LENGTH(line) < 300 -- a line that is 300+ char is likely autogenerated
), ranked_line_lengths as (
  SELECT
    language,
    line_length,
    ROW_NUMBER() OVER (
      PARTITION BY language ORDER BY COUNT(line_length) DESC
    ) as line_length_ranking,
  FROM line_data
  GROUP BY language, line_length
  ORDER BY COUNT(line_length) DESC
), ranked_indentations as (
    SELECT
    language,
    indentation,
    ROW_NUMBER() OVER (
      PARTITION BY language ORDER BY COUNT(indentation) DESC
    ) as indentation_ranking,
  FROM line_data
  GROUP BY language, indentation
  ORDER BY COUNT(indentation) DESC
), averaged_data as (
  SELECT
    language,
    ROUND(AVG(line_length), 1) as avg_line_length,
    ROUND(AVG(indentation), 1) as avg_indentation,
    APPROX_QUANTILES(line_length, 100)[OFFSET(50)] as median_line_length,
    APPROX_QUANTILES(indentation, 100)[OFFSET(50)] as median_indentation,
    APPROX_QUANTILES(line_length, 100)[OFFSET(99)] as line_length_99th_pcnt,
    APPROX_QUANTILES(indentation, 100)[OFFSET(99)] as indentation_99th_pcnt,
    COUNT(DISTINCT(line)) as lines_analyzed,
    COUNT(DISTINCT(id)) as files_analyzed
  FROM line_data
  GROUP BY language
  ORDER BY AVG(line_length) DESC
)

SELECT
  data.*,
  rl.line_length as mode_line_length,
  ri.indentation as mode_indentation
FROM averaged_data data
JOIN ranked_indentations ri ON data.language = ri.language
JOIN ranked_line_lengths rl ON data.language = rl.language
WHERE 0=0
 AND ri.indentation_ranking=1
 AND rl.line_length_ranking=1
ORDER BY avg_line_length DESC
	-- Original query can be found here:
	-- https://console.cloud.google.com/bigquery?sq=226172199733:c6cfab4e7c6b49e791513d3229f9ddf4

	-- CAVEATS:
	-- * This query is quite expensive to run on the full public dataset, ~$13. If
	-- you are just interested in trying it out, I recommend commenting out the
	-- first subquery and replacing with the version below, which uses a much
	-- smaller sample table.
	-- * This includes indentation on code comments, which should be fine as comments
	-- usually inherit the indentation of the code context

	WITH cleaned_contents as (
	SELECT
	c.id as id,
	-- cleaned_content == content with standardized indentation and line break chars
	REGEXP_REPLACE(
	REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation
	r'(\r\|\v)', -- standardize all line breaks
	'\n'
	) cleaned_content,
	REGEXP_SUBSTR(f.path, r'\.\w+$') as language
	FROM bigquery-public-data.github_repos.contents c
	JOIN bigquery-public-data.github_repos.files f ON c.id = f.id
	WHERE 0=0
	and not c.binary
	-- non-text files (e.g. images) have no content but have sizes > 0
	and c.content is not null
	-- exclude files that are really large, as they likely weren't hand written
	and c.size < 15000
	-- the file needs to have a non-trivial amount of content
	and c.size > 100
	and REGEXP_SUBSTR(f.path, r'\.\w+$') in (
	-- file extensions for the most popular languages
	'.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx',
	'.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html',
	'.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc',
	'.scala', '.ps1'
	)
	-- Comment out the query above and replace with this query if you want to
	-- test against a much smaller data set
	--
	-- SELECT
	-- id,
	-- REGEXP_REPLACE(
	-- REGEXP_REPLACE(content, r'\t', ' '), -- standardize all indentation
	-- r'(\r\|\v)', -- standardize all line breaks
	-- '\n'
	-- ) cleaned_content,
	-- REGEXP_SUBSTR(sample_path, r'\.\w+$') as language
	-- FROM bigquery-public-data.github_repos.sample_contents
	-- WHERE 0=0
	-- and not binary
	-- -- non-text files (e.g. images) have no content but have sizes > 0
	-- and content is not null
	-- -- exclude files that are really large, as they likely weren't hand written
	-- and size < 15000
	-- -- the file needs to have a non-trivial amount of content
	-- and size > 100
	-- and REGEXP_SUBSTR(sample_path, r'\.\w+$') in (
	-- -- file extensions for the most popular languages
	-- '.js', '.ts', '.rb', '.py', '.cs', '.cc', '.php', '.cpp', '.cxx',
	-- '.java', '.c', '.sh', '.r', '.sql', '.swift', '.h', '.m', '.html',
	-- '.css', '.scss', '.less', '.sass', '.rs', '.rlib', '.go', '.sc',
	-- '.scala', '.ps1'
	-- )
	), cleaned_lines as (
	SELECT
	id,
	(CASE language
	WHEN '.py' then 'python'
	WHEN '.ts' then 'javascript'
	WHEN '.js' then 'javascript'
	WHEN '.rb' then 'ruby'
	WHEN '.sh' then 'shell'
	WHEN '.r' then 'r'
	WHEN '.sql' then 'sql'
	WHEN '.swift' then 'swift'
	WHEN '.php' then 'php'
	WHEN '.java' then 'java'
	WHEN '.cs' then 'c#'
	WHEN '.cpp' then 'c++'
	WHEN '.cc' then 'c++'
	WHEN '.cxx' then 'c++'
	WHEN '.c' then 'c'
	WHEN '.h' then 'objective c'
	WHEN '.m' then 'objective c'
	WHEN '.html' then 'html'
	WHEN '.css' then 'css'
	WHEN '.scss' then 'css'
	WHEN '.sass' then 'css'
	WHEN '.less' then 'css'
	WHEN '.rs' then 'rust'
	WHEN '.rlib' then 'rust'
	WHEN '.go' then 'go'
	WHEN '.scala' then 'scala'
	WHEN '.sc' then 'scala'
	WHEN '.ps1' then 'powershell'
	ELSE language
	END) as language,
	SPLIT(cleaned_content, '\n') as lines_of_code
	FROM cleaned_contents
	WHERE 0=0
	-- There needs to be at least one line with leading whitespace, otherwise
	-- we assume it was auto-generated.
	and REGEXP_CONTAINS(cleaned_content, r'\n\s+')
	), line_data as (
	SELECT
	id,
	language,
	line,
	LENGTH(line) as line_length,
	LENGTH(COALESCE(REGEXP_EXTRACT(line, r'\A\s+'), '')) as indentation,
	FROM cleaned_lines
	-- https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening_arrays
	CROSS JOIN UNNEST(cleaned_lines.lines_of_code) AS line
	WHERE 0=0
	AND LENGTH(line) > 0 -- we don't want blank lines
	AND LENGTH(line) < 300 -- a line that is 300+ char is likely autogenerated
	), ranked_line_lengths as (
	SELECT
	language,
	line_length,
	ROW_NUMBER() OVER (
	PARTITION BY language ORDER BY COUNT(line_length) DESC
	) as line_length_ranking,
	FROM line_data
	GROUP BY language, line_length
	ORDER BY COUNT(line_length) DESC
	), ranked_indentations as (
	SELECT
	language,
	indentation,
	ROW_NUMBER() OVER (
	PARTITION BY language ORDER BY COUNT(indentation) DESC
	) as indentation_ranking,
	FROM line_data
	GROUP BY language, indentation
	ORDER BY COUNT(indentation) DESC
	), averaged_data as (
	SELECT
	language,
	ROUND(AVG(line_length), 1) as avg_line_length,
	ROUND(AVG(indentation), 1) as avg_indentation,
	APPROX_QUANTILES(line_length, 100)[OFFSET(50)] as median_line_length,
	APPROX_QUANTILES(indentation, 100)[OFFSET(50)] as median_indentation,
	APPROX_QUANTILES(line_length, 100)[OFFSET(99)] as line_length_99th_pcnt,
	APPROX_QUANTILES(indentation, 100)[OFFSET(99)] as indentation_99th_pcnt,
	COUNT(DISTINCT(line)) as lines_analyzed,
	COUNT(DISTINCT(id)) as files_analyzed
	FROM line_data
	GROUP BY language
	ORDER BY AVG(line_length) DESC
	)

	SELECT
	data.*,
	rl.line_length as mode_line_length,
	ri.indentation as mode_indentation
	FROM averaged_data data
	JOIN ranked_indentations ri ON data.language = ri.language
	JOIN ranked_line_lengths rl ON data.language = rl.language
	WHERE 0=0
	AND ri.indentation_ranking=1
	AND rl.line_length_ranking=1
	ORDER BY avg_line_length DESC