mattwarren/Analysis_of_C#_on_GitHub_using_Google_BigQuery.md Secret

## Analysis_of_C#_on_GitHub_using_Google_BigQuery.md

      
    Raw
  

              Analysis_of_C#_on_GitHub_using_Google_BigQuery.md
            
          
Async-Await
Brace_Placement
Class_Names
CSharp-Most-Popular-Filenames
CSharp-Using-Statements
FileLength-Quantiles
FileLength-Longest
Files_Per_Repo_Most
Files_Per_Repo_Quantiles
Lambdas
Most-Popular-Execeptions
Most_Popular_C#_Repos
Regions
Tabs_v_Spaces
var-Keyword-Usage


Async-Await

SELECT Count(*) count
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  REGEXP_MATCH(content, r'\sasync\s|\sawait\s')
Brace_Placement

SELECT  
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^\s*{\s*$')) THEN 1 ELSE 0 END) seperate_line,  
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^\s*{\s*$')) THEN 0 ELSE 1 END) same_line,
  ## Catch object initializers, i.e. 'new MyClass {  }', new { ... }, or 'new [] { 1, 3 }'
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^.*\b(new\s?{|new\s+\S*\s*{).*$')) THEN 1 ELSE 0 END) same_line_initializer,
  COUNT(*) total
FROM (
  SELECT *
  FROM (
    SELECT
      SPLIT(content, '\n') line,      
    FROM
      [fh-bigquery:github_extracts.contents_net_cs] )
  WHERE
    REGEXP_MATCH(line, r'.*{.*') ) ## only lines containing '{'
Class_Names

SELECT
  class,
  COUNT(class) count,
FROM (
  SELECT
    REGEXP_EXTRACT(line, r'(\bclass\s+[@_a-zA-Z][^ \t]*)') class,
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      size,
      binary,
      sample_path
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE
      AND size > 100
      AND REGEXP_MATCH(sample_path, r"^wp-content.*") = FALSE
      AND REGEXP_MATCH(sample_path, r".*AssemblyInfo.cs$") == FALSE )
  WHERE
    REGEXP_MATCH(line, r'(\bclass\s+[@_a-zA-Z][^ \t]*)')
    AND REGEXP_MATCH(line, r'^\s*\/\/') == FALSE ## comment line
    AND SUBSTRING(line, 0, 2) != '@"' ) ## Roslyn tests
GROUP BY
  class
ORDER BY
  count DESC
CSharp-Most-Popular-Filenames

SELECT filename, COUNT(*) as count
FROM (
    SELECT LAST(SPLIT(sample_path, '/')) as filename
    FROM [fh-bigquery:github_extracts.contents_net_cs]
)
GROUP BY filename
HAVING count > 1
ORDER BY count DESC
LIMIT 1000
CSharp-Using-Statements

SELECT
  line,
  COUNT(*) AS count,
FROM (
  SELECT
    id,
    SPLIT(content, "\n") AS line
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  REGEXP_MATCH(line, r"^\w*using.*;\w*$")
GROUP BY
  line
ORDER BY
  count DESC
LIMIT
  10000
FileLength-Quantiles

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(lines, 101) AS Quantiles 
FROM (
  SELECT
    COUNT(SPLIT(content, '\n')) AS lines,    
    size,
    binary
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  ## try and filter our any 'binary' files that sneak in
  binary = false AND
  (lines > 1 OR (lines = 1 && size < 250))
FileLength-Longest

SELECT  *
FROM (
  SELECT
    COUNT(SPLIT(content, '\n')) lines,
    size,
    binary,
    CONCAT('https://github.com/', sample_repo_name, '/blob/master/', sample_path) url
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  ## try and filter our any 'binary' files that sneak in
  FileLength-Longest
  (lines > 1 OR (lines = 1 && size < 250))
ORDER BY
  lines DESC, size DESC
LIMIT
  1000
Files_Per_Repo_Most

SELECT
  *
FROM (
  SELECT
    CONCAT("https://github.com/", sample_repo_name) repo,
    COUNT(*) num_cs_files
  FROM (
    SELECT
      binary,
      sample_repo_name,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    repo )
ORDER BY
  num_cs_files DESC
LIMIT
  1000
Files_Per_Repo_Quantiles

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(num_cs_files, 101) AS Quantiles
FROM (
  SELECT
    sample_repo_name,
    COUNT(*) num_cs_files
  FROM (
    SELECT
      binary,
      sample_repo_name,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    sample_repo_name )
Lambdas

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(lambdas / num_lines * 100.0, 101) AS Quantiles
FROM (
  SELECT
    SUM(REGEXP_MATCH(line, r'^.*=>.*\b')) WITHIN RECORD lambdas,
    COUNT(line) WITHIN RECORD num_lines,    
    *
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      binary,
      sample_repo_name,
      sample_path
    FROM
      [fh-bigquery:github_extracts.contents_net_cs] )    
  WHERE
    binary = FALSE)
WHERE
  lambdas > 0
Most-Popular-Execeptions

SELECT
  exception,
  COUNT(exception) count
FROM (
  SELECT
    sample_path,
    sample_repo_name,
    REGEXP_EXTRACT(line, r'(throw\snew\s\w+Exception)(?:\()') exception,
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      sample_path,
      sample_repo_name
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    HAVING
      REGEXP_MATCH(line, r'throw\snew\s\w+Exception(?:\()') ) )
GROUP BY
  exception
ORDER BY
  count DESC
Most_Popular_C#_Repos

SELECT
  *
FROM (
  SELECT
    CONCAT("https://github.com/", sample_repo_name) repo,
    sample_stars stars,
    COUNT(*) files ## as "C# files"
  FROM (
    SELECT
      binary,
      sample_repo_name,
      sample_stars,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    repo,
    stars
  HAVING
    files > 50 )
WHERE
  stars > 1000
ORDER BY
  stars DESC
LIMIT
  1000
Regions

SELECT
  COUNT(*) files_with_a_region
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  ## new-line, zero or more spaces, '#region', word-boundary
  REGEXP_MATCH(content, r'(?:\n|\r\n)\s*(#region)\b') 
Tabs_v_Spaces

SELECT
  SUM(best='tab') tabs,
  SUM(best='space') spaces,  
  COUNT(*) total
FROM (
  SELECT
    sample_path,
    sample_repo_name,
    COUNT(lineStart) WITHIN RECORD lineCount,
    IF(SUM(lineStart=' ') > SUM(lineStart='\t'), 'space', 'tab') WITHIN RECORD best,
  FROM (
    SELECT
      LEFT(SPLIT(content, '\n'), 1) lineStart,
      sample_path,
      sample_repo_name
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    HAVING
      REGEXP_MATCH(lineStart, r'[ \t]') ) # only lines with a space or tab in them
  HAVING
    lineCount > 10 ) # at least 10 lines that start with space or tab sample_repo_name, sample_repo_name
var-Keyword-Usage

SELECT Count(*) count
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  REGEXP_MATCH(content, r'[\t ]*var\s\w*\s*=\s*(\d|\d.\d)+.*;')