Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Analysis of C# on GitHub using Google BigQuery
  1. Async-Await
  2. Brace_Placement
  3. Class_Names
  4. CSharp-Most-Popular-Filenames
  5. CSharp-Using-Statements
  6. FileLength-Quantiles
  7. FileLength-Longest
  8. Files_Per_Repo_Most
  9. Files_Per_Repo_Quantiles
  10. Lambdas
  11. Most-Popular-Execeptions
  12. Most_Popular_C#_Repos
  13. Regions
  14. Tabs_v_Spaces
  15. var-Keyword-Usage

Async-Await

SELECT Count(*) count
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  REGEXP_MATCH(content, r'\sasync\s|\sawait\s')

Brace_Placement

SELECT  
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^\s*{\s*$')) THEN 1 ELSE 0 END) seperate_line,  
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^\s*{\s*$')) THEN 0 ELSE 1 END) same_line,
  ## Catch object initializers, i.e. 'new MyClass {  }', new { ... }, or 'new [] { 1, 3 }'
  SUM(CASE WHEN (REGEXP_MATCH(line, r'^.*\b(new\s?{|new\s+\S*\s*{).*$')) THEN 1 ELSE 0 END) same_line_initializer,
  COUNT(*) total
FROM (
  SELECT *
  FROM (
    SELECT
      SPLIT(content, '\n') line,      
    FROM
      [fh-bigquery:github_extracts.contents_net_cs] )
  WHERE
    REGEXP_MATCH(line, r'.*{.*') ) ## only lines containing '{'

Class_Names

SELECT
  class,
  COUNT(class) count,
FROM (
  SELECT
    REGEXP_EXTRACT(line, r'(\bclass\s+[@_a-zA-Z][^ \t]*)') class,
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      size,
      binary,
      sample_path
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE
      AND size > 100
      AND REGEXP_MATCH(sample_path, r"^wp-content.*") = FALSE
      AND REGEXP_MATCH(sample_path, r".*AssemblyInfo.cs$") == FALSE )
  WHERE
    REGEXP_MATCH(line, r'(\bclass\s+[@_a-zA-Z][^ \t]*)')
    AND REGEXP_MATCH(line, r'^\s*\/\/') == FALSE ## comment line
    AND SUBSTRING(line, 0, 2) != '@"' ) ## Roslyn tests
GROUP BY
  class
ORDER BY
  count DESC

CSharp-Most-Popular-Filenames

SELECT filename, COUNT(*) as count
FROM (
    SELECT LAST(SPLIT(sample_path, '/')) as filename
    FROM [fh-bigquery:github_extracts.contents_net_cs]
)
GROUP BY filename
HAVING count > 1
ORDER BY count DESC
LIMIT 1000

CSharp-Using-Statements

SELECT
  line,
  COUNT(*) AS count,
FROM (
  SELECT
    id,
    SPLIT(content, "\n") AS line
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  REGEXP_MATCH(line, r"^\w*using.*;\w*$")
GROUP BY
  line
ORDER BY
  count DESC
LIMIT
  10000

FileLength-Quantiles

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(lines, 101) AS Quantiles 
FROM (
  SELECT
    COUNT(SPLIT(content, '\n')) AS lines,    
    size,
    binary
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  ## try and filter our any 'binary' files that sneak in
  binary = false AND
  (lines > 1 OR (lines = 1 && size < 250))

FileLength-Longest

SELECT  *
FROM (
  SELECT
    COUNT(SPLIT(content, '\n')) lines,
    size,
    binary,
    CONCAT('https://github.com/', sample_repo_name, '/blob/master/', sample_path) url
  FROM
    [fh-bigquery:github_extracts.contents_net_cs] )
WHERE
  ## try and filter our any 'binary' files that sneak in
  FileLength-Longest
  (lines > 1 OR (lines = 1 && size < 250))
ORDER BY
  lines DESC, size DESC
LIMIT
  1000

Files_Per_Repo_Most

SELECT
  *
FROM (
  SELECT
    CONCAT("https://github.com/", sample_repo_name) repo,
    COUNT(*) num_cs_files
  FROM (
    SELECT
      binary,
      sample_repo_name,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    repo )
ORDER BY
  num_cs_files DESC
LIMIT
  1000

Files_Per_Repo_Quantiles

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(num_cs_files, 101) AS Quantiles
FROM (
  SELECT
    sample_repo_name,
    COUNT(*) num_cs_files
  FROM (
    SELECT
      binary,
      sample_repo_name,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    sample_repo_name )

Lambdas

SELECT
  ## See https://cloud.google.com/bigquery/docs/reference/legacy-sql#quantiles
  ## 101, gives percentiles (including Min/Max)
  QUANTILES(lambdas / num_lines * 100.0, 101) AS Quantiles
FROM (
  SELECT
    SUM(REGEXP_MATCH(line, r'^.*=>.*\b')) WITHIN RECORD lambdas,
    COUNT(line) WITHIN RECORD num_lines,    
    *
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      binary,
      sample_repo_name,
      sample_path
    FROM
      [fh-bigquery:github_extracts.contents_net_cs] )    
  WHERE
    binary = FALSE)
WHERE
  lambdas > 0

Most-Popular-Execeptions

SELECT
  exception,
  COUNT(exception) count
FROM (
  SELECT
    sample_path,
    sample_repo_name,
    REGEXP_EXTRACT(line, r'(throw\snew\s\w+Exception)(?:\()') exception,
  FROM (
    SELECT
      SPLIT(content, '\n') line,
      sample_path,
      sample_repo_name
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    HAVING
      REGEXP_MATCH(line, r'throw\snew\s\w+Exception(?:\()') ) )
GROUP BY
  exception
ORDER BY
  count DESC

Most_Popular_C#_Repos

SELECT
  *
FROM (
  SELECT
    CONCAT("https://github.com/", sample_repo_name) repo,
    sample_stars stars,
    COUNT(*) files ## as "C# files"
  FROM (
    SELECT
      binary,
      sample_repo_name,
      sample_stars,
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    WHERE
      binary = FALSE )
  GROUP BY
    repo,
    stars
  HAVING
    files > 50 )
WHERE
  stars > 1000
ORDER BY
  stars DESC
LIMIT
  1000

Regions

SELECT
  COUNT(*) files_with_a_region
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  ## new-line, zero or more spaces, '#region', word-boundary
  REGEXP_MATCH(content, r'(?:\n|\r\n)\s*(#region)\b') 

Tabs_v_Spaces

SELECT
  SUM(best='tab') tabs,
  SUM(best='space') spaces,  
  COUNT(*) total
FROM (
  SELECT
    sample_path,
    sample_repo_name,
    COUNT(lineStart) WITHIN RECORD lineCount,
    IF(SUM(lineStart=' ') > SUM(lineStart='\t'), 'space', 'tab') WITHIN RECORD best,
  FROM (
    SELECT
      LEFT(SPLIT(content, '\n'), 1) lineStart,
      sample_path,
      sample_repo_name
    FROM
      [fh-bigquery:github_extracts.contents_net_cs]
    HAVING
      REGEXP_MATCH(lineStart, r'[ \t]') ) # only lines with a space or tab in them
  HAVING
    lineCount > 10 ) # at least 10 lines that start with space or tab sample_repo_name, sample_repo_name

var-Keyword-Usage

SELECT Count(*) count
FROM
  [fh-bigquery:github_extracts.contents_net_cs]
WHERE
  REGEXP_MATCH(content, r'[\t ]*var\s\w*\s*=\s*(\d|\d.\d)+.*;')
@jairbubbles

This comment has been minimized.

Copy link

jairbubbles commented Oct 13, 2017

I'm not sure I undertand the right part of the var regex.
I would expect a line like that to match:

  var toto = new string();

But it doesn't (see https://regex101.com/r/EjbeMj/1)

Why not using something simpler at the right of the =? Something like:

[\t ]*var\s\w*\s*=\s*.*;

(I'm not sure we care about what's on the right of the =)

Which in that case works (see https://regex101.com/r/EjbeMj/2)

@mattwarren

This comment has been minimized.

Copy link
Owner Author

mattwarren commented Oct 16, 2017

@jairbubbles yes, you're absolutely right, I don't know what I was thinking when I wrote that regex!!

If I re-run the query using your regex I get 1,457,154 files that have 1 or more instances of var, which sounds more like it!!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.