Skip to content

Instantly share code, notes, and snippets.

@pepetox
Created March 24, 2020 19:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pepetox/10e65713f4f6008e25f67a64b2622b92 to your computer and use it in GitHub Desktop.
Save pepetox/10e65713f4f6008e25f67a64b2622b92 to your computer and use it in GitHub Desktop.
Queries Webinar BigQuery
SELECT count(repo_name) as numberOfRepos, lang_data.name
FROM `bigquery-public-data.github_repos.languages`, UNNEST(language) AS lang_data
WHERE lang_data.name IN ("JavaScript","Java","Python","Ruby","PHP","C","Go") AND
repo_name IN (
SELECT repo_name
FROM `bigquery-public-data.github_repos.sample_repos`
WHERE watch_count>10
ORDER BY watch_count
)
GROUP BY lang_data.name
ORDER BY count(lang_data.bytes) DESC
SELECT avg(lang_data.bytes)/count(repo_name) as size, lang_data.name
FROM `bigquery-public-data.github_repos.languages`, UNNEST(language) AS lang_data
WHERE lang_data.name IN ("JavaScript","Java","Python","Ruby","PHP","C","Go")
GROUP BY lang_data.name
ORDER BY size ASC
SELECT COUNT(commit) as commits_count, Table_langs.lang_data.name, COUNT (DISTINCT Table_langs.repo_name) as repo_count, COUNT(commit) / COUNT (DISTINCT Table_langs.repo_name) as commits_pre_repo
FROM `bigquery-public-data.github_repos.commits` as Table_commits
INNER JOIN (SELECT repo_name, lang_data FROM `bigquery-public-data.github_repos.languages`, UNNEST( language ) AS lang_data) as Table_langs ON Table_commits.repo_name[ORDINAL(1)] = Table_langs.repo_name
WHERE lang_data.name IN ("JavaScript","Java","Python","Ruby","PHP","C","Go")
GROUP BY Table_langs.lang_data.name
ORDER BY commits_pre_repo ASC
SELECT REGEXP_EXTRACT(path, r'\.([^\.]*)$') as extension, REGEXP_EXTRACT(content, r"\w+") as word, COUNT(REGEXP_EXTRACT(content, r"\w+")) as word_count
FROM (
SELECT id, max(path) as path, max(repo_name) as repo_name
FROM `bigquery-public-data.github_repos.sample_files`
WHERE REGEXP_EXTRACT(path, r'\.([^\.]*)$') IN ('java','h','js','c','php','py','cpp','rb','cc','go')
GROUP BY id
) a
JOIN `bigquery-public-data.github_repos.sample_contents` b
ON a.id=b.id
WHERE REGEXP_EXTRACT(content, r"\w+") IN ('happy', 'perfect', 'easy')
GROUP BY extension, word
Order by word_count DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment