Skip to content

Instantly share code, notes, and snippets.

@mattwarren
Last active May 4, 2023 14:38
Show Gist options
  • Star 9 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mattwarren/42100ffe488bce5d48be22b59124b752 to your computer and use it in GitHub Desktop.
Save mattwarren/42100ffe488bce5d48be22b59124b752 to your computer and use it in GitHub Desktop.
Analysis of C# on GitHub using Google BigQuery
  1. Async-Await
  2. Brace_Placement
  3. Class_Names
  4. CSharp-Most-Popular-Filenames
  5. CSharp-Using-Statements
  6. FileLength-Quantiles
  7. FileLength-Longest
  8. Files_Per_Repo_Most
  9. Files_Per_Repo_Quantiles
  10. Lambdas
  11. Most-Popular-Execeptions
  12. Most_Popular_C#_Repos
  13. Regions
  14. Tabs_v_Spaces
  15. var-Keyword-Usage

Async-Await

SELECT Count(*) count
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  REGEXP_MATCH(content, r'\sasync\s|\sawait\s')

Brace_Placement

SELECT  
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^\s*{\s*$')) THEN 1 ELSE 0 END) seperate_line,  
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^\s*{\s*$')) THEN 0 ELSE 1 END) same_line,
  ## Catch object initializers, i.e. 'new MyClass {  }', new { ... }, or 'new [] { 1, 3 }'
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^.*\b(new\s?{|new\s+\S*\s*{).*$')) THEN 1 ELSE 0 END) same_line_initializer,
  COUNT(*) total
FROM (
  SELECT *
  FROM (
    SELECT
      SPLIT(content, '\n') line,      
    FROM
      [fh-bigquery:github_extracts.contents_net_cs] )
  WHERE
    REGEXP_MATCH(line, r'.*{.*') ) ## only lines containing '{'

Class_Names

SELECT
  class,
  COUNT(class) count,
FROM (
  SELECT
    REGEXP_EXTRACT(line, r'(\bclass\s+[@_a-zA-Z][^ \t]*)') class,
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      size,
      binary,
      sample_path
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE
      AND size > 100
      AND REGEXP_MATCH(sample_path, r"^wp-content.*") = FALSE
      AND REGEXP_MATCH(sample_path, r".*AssemblyInfo.cs$") == FALSE )
  WHERE
    REGEXP_MATCH(line, r'(\bclass\s+[@_a-zA-Z][^ \t]*)')
    AND REGEXP_MATCH(line, r'^\s*\/\/') == FALSE ## comment line
    AND SUBSTRING(line, 0, 2) != '@"' ) ## Roslyn tests
GROUP BY
  class
ORDER BY
  count DESC

CSharp-Most-Popular-Filenames

SELECT filename, COUNT(*) as count
FROM (
    SELECT LAST(SPLIT(sample_path, '/')) as filename
    FROM [fh-bigquery:github_extracts.contents_net_cs]
)
GROUP BY filename
HAVING count > 1
ORDER BY count DESC
LIMIT 1000

CSharp-Using-Statements

SELECT
  line,
  COUNT(*) AS count,
FROM (
  SELECT
    id,
    SPLIT(content, "\n") AS line
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  REGEXP_MATCH(line, r"^\w*using.*;\w*$")
GROUP BY
  line
ORDER BY
  count DESC
LIMIT
  10000

FileLength-Quantiles

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(lines, 101) AS Quantiles 
FROM (
  SELECT
    COUNT(SPLIT(content, '\n')) AS lines,    
    size,
    binary
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  ## try and filter our any 'binary' files that sneak in
  binary = false AND
  (lines > 1 OR (lines = 1 && size < 250))

FileLength-Longest

SELECT  *
FROM (
  SELECT
    COUNT(SPLIT(content, '\n')) lines,
    size,
    binary,
    CONCAT('https://github.com/', sample_repo_name, '/blob/master/', sample_path) url
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  ## try and filter our any 'binary' files that sneak in
  FileLength-Longest
  (lines > 1 OR (lines = 1 && size < 250))
ORDER BY
  lines DESC, size DESC
LIMIT
  1000

Files_Per_Repo_Most

SELECT
  *
FROM (
  SELECT
    CONCAT("https://github.com/", sample_repo_name) repo,
    COUNT(*) num_cs_files
  FROM (
    SELECT
      binary,
      sample_repo_name,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    repo )
ORDER BY
  num_cs_files DESC
LIMIT
  1000

Files_Per_Repo_Quantiles

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(num_cs_files, 101) AS Quantiles
FROM (
  SELECT
    sample_repo_name,
    COUNT(*) num_cs_files
  FROM (
    SELECT
      binary,
      sample_repo_name,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    sample_repo_name )

Lambdas

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(lambdas / num_lines * 100.0, 101) AS Quantiles
FROM (
  SELECT
    SUM(REGEXP_MATCH(line, r'^.*=>.*\b')) WITHIN RECORD lambdas,
    COUNT(line) WITHIN RECORD num_lines,    
    *
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      binary,
      sample_repo_name,
      sample_path
    FROM
      [fh-bigquery:github_extracts.contents_net_cs] )    
  WHERE
    binary = FALSE)
WHERE
  lambdas > 0

Most-Popular-Execeptions

SELECT
  exception,
  COUNT(exception) count
FROM (
  SELECT
    sample_path,
    sample_repo_name,
    REGEXP_EXTRACT(line, r'(throw\snew\s\w+Exception)(?:\()') exception,
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      sample_path,
      sample_repo_name
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    HAVING
      REGEXP_MATCH(line, r'throw\snew\s\w+Exception(?:\()') ) )
GROUP BY
  exception
ORDER BY
  count DESC

Most_Popular_C#_Repos

SELECT
  *
FROM (
  SELECT
    CONCAT("https://github.com/", sample_repo_name) repo,
    sample_stars stars,
    COUNT(*) files ## as "C# files"
  FROM (
    SELECT
      binary,
      sample_repo_name,
      sample_stars,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    repo,
    stars
  HAVING
    files > 50 )
WHERE
  stars > 1000
ORDER BY
  stars DESC
LIMIT
  1000

Regions

SELECT
  COUNT(*) files_with_a_region
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  ## new-line, zero or more spaces, '#region', word-boundary
  REGEXP_MATCH(content, r'(?:\n|\r\n)\s*(#region)\b') 

Tabs_v_Spaces

SELECT
  SUM(best='tab') tabs,
  SUM(best='space') spaces,  
  COUNT(*) total
FROM (
  SELECT
    sample_path,
    sample_repo_name,
    COUNT(lineStart) WITHIN RECORD lineCount,
    IF(SUM(lineStart=' ') > SUM(lineStart='\t'), 'space', 'tab') WITHIN RECORD best,
  FROM (
    SELECT
      LEFT(SPLIT(content, '\n'), 1) lineStart,
      sample_path,
      sample_repo_name
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    HAVING
      REGEXP_MATCH(lineStart, r'[ \t]') ) # only lines with a space or tab in them
  HAVING
    lineCount > 10 ) # at least 10 lines that start with space or tab sample_repo_name, sample_repo_name

var-Keyword-Usage

SELECT Count(*) count
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  REGEXP_MATCH(content, r'[\t ]*var\s\w*\s*=\s*(\d|\d.\d)+.*;')
@jairbubbles
Copy link

I'm not sure I undertand the right part of the var regex.
I would expect a line like that to match:

  var toto = new string();

But it doesn't (see https://regex101.com/r/EjbeMj/1)

Why not using something simpler at the right of the =? Something like:

[\t ]*var\s\w*\s*=\s*.*;

(I'm not sure we care about what's on the right of the =)

Which in that case works (see https://regex101.com/r/EjbeMj/2)

@mattwarren
Copy link
Author

@jairbubbles yes, you're absolutely right, I don't know what I was thinking when I wrote that regex!!

If I re-run the query using your regex I get 1,457,154 files that have 1 or more instances of var, which sounds more like it!!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment