Last active
October 4, 2018 11:08
-
-
Save smola/7013c2ce00788a36d9b57c66521a340d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### source{d} reference SQL queries (DRAFT)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"149 repositories successfully loaded" | |
] | |
} | |
], | |
"source": [ | |
"import tech.sourced.engine._\n", | |
"\n", | |
"val engine = Engine(spark, \"hdfs://hdfs-namenode/100-siva\", \"siva\")\n", | |
"\n", | |
"spark.registerUDFs\n", | |
"\n", | |
"print(s\"${engine.getRepositories.count()} repositories successfully loaded\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------------+--------+\n", | |
"| key| value|\n", | |
"+--------------------+--------+\n", | |
"|spark.sql.files.m...|33554432|\n", | |
"+--------------------+--------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"SET spark.sql.files.maxPartitionBytes=33554432" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Show tables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------+------------+-----------+\n", | |
"|database| tableName|isTemporary|\n", | |
"+--------+------------+-----------+\n", | |
"| | blobs| true|\n", | |
"| | commits| true|\n", | |
"| | references| true|\n", | |
"| |repositories| true|\n", | |
"| |tree_entries| true|\n", | |
"+--------+------------+-----------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"SHOW TABLES" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Describe table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------+-------------+-------+\n", | |
"| col_name| data_type|comment|\n", | |
"+--------------+-------------+-------+\n", | |
"| repository_id| string| null|\n", | |
"|reference_name| string| null|\n", | |
"| index| int| null|\n", | |
"| hash| string| null|\n", | |
"| message| string| null|\n", | |
"| parents|array<string>| null|\n", | |
"| parents_count| int| null|\n", | |
"| author_email| string| null|\n", | |
"| author_name| string| null|\n", | |
"| author_date| timestamp| null|\n", | |
"+--------------+-------------+-------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"DESCRIBE commits" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create table of non-fork repository IDs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"++\n", | |
"||\n", | |
"++\n", | |
"++\n", | |
"\n" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"CREATE TABLE IF NOT EXISTS repository_ids_no_forks\n", | |
"USING parquet\n", | |
"OPTIONS ('compression'='snappy')\n", | |
"COMMENT 'repository_id of all repositories containing a HEAD reference and excluding forks'\n", | |
"SELECT\n", | |
" s.repository_id AS repository_id\n", | |
"FROM (\n", | |
" SELECT\n", | |
" hash,\n", | |
" MAX(STRUCT(index, repository_id)) AS s\n", | |
" FROM commits\n", | |
" WHERE\n", | |
" index != 1\n", | |
" AND reference_name = 'refs/heads/HEAD'\n", | |
" AND SIZE(parents) == 0\n", | |
" GROUP BY hash\n", | |
") AS q\n", | |
"ORDER BY repository_id ASC\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q1: Count repositories" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+----------------+\n", | |
"|repository_count|\n", | |
"+----------------+\n", | |
"| 149|\n", | |
"+----------------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Count of total repositories\n", | |
"--\n", | |
"-- categories: fast\n", | |
"SELECT\n", | |
" COUNT(DISTINCT id) AS repository_count\n", | |
"FROM repositories" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q2: Last commit messages in HEAD for every repository" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------------+\n", | |
"| message|\n", | |
"+--------------------+\n", | |
"| ???????? ?????\n", | |
"|\n", | |
"| Merging xml files|\n", | |
"| log\n", | |
"|\n", | |
"|Delete duplicate ...|\n", | |
"|v1.1.10 New para...|\n", | |
"|assignment 4 firs...|\n", | |
"|Update README.md ...|\n", | |
"|Add extra solder ...|\n", | |
"|Added all the sle...|\n", | |
"|trying to fix dra...|\n", | |
"+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Last commit messages in HEAD for every repository.\n", | |
"--\n", | |
"-- categories: fast\n", | |
"SELECT message\n", | |
"FROM commits\n", | |
"WHERE\n", | |
" reference_name = 'refs/heads/HEAD' AND\n", | |
" index = 0" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q3: All commit messages in HEAD history for every repository" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------------+\n", | |
"| message|\n", | |
"+--------------------+\n", | |
"| ???????? ?????\n", | |
"|\n", | |
"|?????? ???? befor...|\n", | |
"| fix\n", | |
"|\n", | |
"| fix\n", | |
"|\n", | |
"| fix\n", | |
"|\n", | |
"|?????????? ??????...|\n", | |
"| up\n", | |
"|\n", | |
"| UP\n", | |
"|\n", | |
"|Merged in xDESTRO...|\n", | |
"|???????? ???? ???...|\n", | |
"+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"--- All commit messages in HEAD history for every repository.\n", | |
"---\n", | |
"--- categories: fast\n", | |
"SELECT message\n", | |
"FROM commits\n", | |
"WHERE\n", | |
" reference_name = 'refs/heads/HEAD' AND\n", | |
" index >= 0" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q4: Top 10 repositories by commit count in HEAD" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------------+------------+\n", | |
"| repository_id|commit_count|\n", | |
"+--------------------+------------+\n", | |
"|github.com/bjoeri...| 601|\n", | |
"|github.com/akoo0o...| 592|\n", | |
"|github.com/Deathl...| 592|\n", | |
"|bitbucket.org/lew...| 370|\n", | |
"|github.com/bigben...| 162|\n", | |
"|github.com/Christ...| 162|\n", | |
"|github.com/nickma...| 162|\n", | |
"|github.com/jzawod...| 150|\n", | |
"|github.com/johand...| 147|\n", | |
"|github.com/johhal...| 135|\n", | |
"+--------------------+------------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Get top 10 repositoryes by commit count in HEAD\n", | |
"--\n", | |
"-- categories: ????\n", | |
"SELECT\n", | |
" repository_id,\n", | |
" count(*) AS commit_count\n", | |
"FROM commits\n", | |
" WHERE\n", | |
" reference_name = 'refs/heads/HEAD'\n", | |
" -- NOTE: This is the magic bit to get the history, which is used underneath by getAllReferenceCommits\n", | |
" AND index <> -1\n", | |
"GROUP BY repository_id, reference_name\n", | |
"ORDER BY commit_count DESC\n", | |
"LIMIT 10" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q5: Fork detection (approximate)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------------+-------+--------------------+--------------------+\n", | |
"| hash|n_repos| main_repository_id| repository_ids|\n", | |
"+--------------------+-------+--------------------+--------------------+\n", | |
"|74c998a1251da5cf9...| 9|github.com/nickma...|[github.com/bigbe...|\n", | |
"|ecd4b4d008bd4498e...| 4|github.com/johand...|[github.com/jiamz...|\n", | |
"|e51501b76328057c1...| 3|github.com/bjoeri...|[github.com/Death...|\n", | |
"|6b91410a1bf752662...| 2|github.com/welter...|[github.com/welte...|\n", | |
"|49b7828ca2edf159f...| 2|github.com/esjewe...|[github.com/dolss...|\n", | |
"|acb4c30aa3a5952a8...| 2|github.com/ckreut...|[github.com/ckreu...|\n", | |
"|18e7dcbfc42a83db9...| 2|github.com/Printr...|[github.com/Print...|\n", | |
"+--------------------+-------+--------------------+--------------------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Fork detection. Returns all repositories grouped if they form a fork network.\n", | |
"-- The main repository is approximated by picking the one with longest history in HEAD.\n", | |
"--\n", | |
"-- categories: ????\n", | |
"SELECT\n", | |
" hash,\n", | |
" n_repos,\n", | |
" s.repository_id AS main_repository_id,\n", | |
" repository_ids\n", | |
"FROM (\n", | |
" SELECT\n", | |
" hash,\n", | |
" COUNT(*) n_repos,\n", | |
" MAX(STRUCT(index, repository_id)) AS s,\n", | |
" COLLECT_SET(repository_id) AS repository_ids\n", | |
" FROM commits\n", | |
" WHERE\n", | |
" index != 1\n", | |
" AND reference_name = 'refs/heads/HEAD'\n", | |
" AND SIZE(parents) == 0\n", | |
" GROUP BY hash\n", | |
") AS q\n", | |
"WHERE n_repos > 1\n", | |
"ORDER BY n_repos DESC\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q6: Count repository HEAD" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+----------+\n", | |
"|head_count|\n", | |
"+----------+\n", | |
"| 106|\n", | |
"+----------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Count repository HEADs.\n", | |
"--\n", | |
"-- categories: fast\n", | |
"SELECT\n", | |
" COUNT(DISTINCT repository_ID) AS head_count\n", | |
"FROM\n", | |
" references\n", | |
"WHERE name = 'refs/heads/HEAD'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q7: Count repository HEAD excluding forks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------+\n", | |
"|count(1)|\n", | |
"+--------+\n", | |
"| 80|\n", | |
"+--------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Count repository HEADs excluding forks.\n", | |
"--\n", | |
"-- categories: ????\n", | |
"SELECT\n", | |
" COUNT(*)\n", | |
"FROM (\n", | |
"SELECT DISTINCT\n", | |
" s.repository_id AS repository_id\n", | |
"FROM (\n", | |
" SELECT\n", | |
" hash,\n", | |
" MAX(STRUCT(index, repository_id)) AS s\n", | |
" FROM commits\n", | |
" WHERE\n", | |
" index != 1\n", | |
" AND reference_name = 'refs/heads/HEAD'\n", | |
" AND SIZE(parents) == 0\n", | |
" GROUP BY hash\n", | |
") AS q\n", | |
") AS q2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q8: Repository count by language presence (HEAD, no forks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+----------+----------------+\n", | |
"| language|repository_count|\n", | |
"+----------+----------------+\n", | |
"| Unknown| 74|\n", | |
"| Text| 43|\n", | |
"| Markdown| 37|\n", | |
"|JavaScript| 35|\n", | |
"| HTML| 33|\n", | |
"| XML| 27|\n", | |
"| CSS| 27|\n", | |
"| Ruby| 20|\n", | |
"| Shell| 17|\n", | |
"| YAML| 17|\n", | |
"+----------+----------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Repository count per language precense.\n", | |
"-- If any file in language X is present in HEAD of a repository,\n", | |
"-- it contributes one for the language count.\n", | |
"-- Forks are excluded.\n", | |
"--\n", | |
"-- category: slow,enry\n", | |
"SELECT\n", | |
" language,\n", | |
" COUNT(repository_id) AS repository_count\n", | |
"FROM (\n", | |
" SELECT DISTINCT\n", | |
" t.repository_id AS repository_id,\n", | |
" COALESCE(\n", | |
" classifyLanguages(b.is_binary, t.path, b.content),\n", | |
" 'Unknown') AS language\n", | |
" FROM\n", | |
" tree_entries t\n", | |
" JOIN blobs b ON\n", | |
" t.repository_id = b.repository_id AND\n", | |
" t.reference_name = b.reference_name AND\n", | |
" t.blob = b.blob_id\n", | |
" JOIN repository_ids_no_forks r ON\n", | |
" b.repository_id = r.repository_id\n", | |
" WHERE\n", | |
" t.reference_name = 'refs/heads/HEAD'\n", | |
" ) AS q2\n", | |
"GROUP BY language\n", | |
"ORDER BY repository_count DESC" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q9: Top 50 tokens in files by repository count (HEAD, no forks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%sql\n", | |
"-- Top 100 tokens by repository count. Each token counted once per repository.\n", | |
"--\n", | |
"-- category: slow\n", | |
"SELECT\n", | |
" token,\n", | |
" COUNT(DISTINCT repository_id) AS repository_count\n", | |
"FROM (\n", | |
" SELECT\n", | |
" b.repository_id,\n", | |
" EXPLODE(SPLIT(b.content, '[^_A-Za-z0-9]+')) AS token\n", | |
" FROM\n", | |
" blobs b\n", | |
" JOIN repository_ids_no_forks r ON\n", | |
" b.repository_id = r.repository_id\n", | |
" WHERE\n", | |
" reference_name = 'refs/heads/HEAD' AND\n", | |
" is_binary = false AND\n", | |
" LENGTH(content) < 524288\n", | |
") AS q\n", | |
"GROUP BY token\n", | |
"ORDER BY repository_count DESC\n", | |
"LIMIT 50" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q10: Top 50 identifiers in files by repository count (HEAD, no forks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+----------+----------------+\n", | |
"|identifier|repository_count|\n", | |
"+----------+----------------+\n", | |
"| | 51|\n", | |
"| String| 48|\n", | |
"| java| 46|\n", | |
"| util| 45|\n", | |
"| i| 41|\n", | |
"| io| 39|\n", | |
"| Override| 38|\n", | |
"| get| 38|\n", | |
"| toString| 38|\n", | |
"| e| 38|\n", | |
"+----------+----------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Top 100 identifier by repository count. Each identifier counted once per repository.\n", | |
"--\n", | |
"-- category: slow,babelfish\n", | |
"SELECT\n", | |
" identifier,\n", | |
" COUNT(DISTINCT repository_id) AS repository_count\n", | |
"FROM (\n", | |
" SELECT\n", | |
" b.repository_id AS repository_id,\n", | |
" EXPLODE(extractTokens(queryXPath(\n", | |
" extractUASTs(t.path, b.content, classifyLanguages(b.is_binary, t.path, b.content)),\n", | |
" '//*[@roleIdentifier and not(@roleIncomplete)]'))) AS identifier\n", | |
" FROM\n", | |
" tree_entries t\n", | |
" JOIN blobs b ON\n", | |
" t.repository_id = b.repository_id AND\n", | |
" t.reference_name = b.reference_name AND\n", | |
" t.blob = b.blob_id\n", | |
" JOIN repository_ids_no_forks r ON\n", | |
" b.repository_id = r.repository_id\n", | |
" WHERE\n", | |
" b.reference_name = 'refs/heads/HEAD' AND\n", | |
" b.is_binary = false AND\n", | |
" LENGTH(b.content) < 524288 AND\n", | |
" REGEXP_EXTRACT(t.path, '\\.[^.]+$', 0) IN ('.java', '.py', '.go')\n", | |
") AS q\n", | |
"GROUP BY identifier\n", | |
"ORDER BY repository_count DESC\n", | |
"LIMIT 50" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q11: Top 10 repositories by contributor count (all branches)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+--------------------+-----------------+\n", | |
"| repository_id|contributor_count|\n", | |
"+--------------------+-----------------+\n", | |
"|github.com/nickma...| 34|\n", | |
"|github.com/Christ...| 34|\n", | |
"|github.com/bigben...| 34|\n", | |
"|github.com/ckreut...| 16|\n", | |
"|github.com/johand...| 13|\n", | |
"|github.com/bjoeri...| 11|\n", | |
"|github.com/pelarg...| 10|\n", | |
"|github.com/akoo0o...| 10|\n", | |
"|github.com/Deathl...| 10|\n", | |
"|github.com/alfons...| 9|\n", | |
"+--------------------+-----------------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Top 10 repositories by contributor count in all branches.\n", | |
"-- Each distinct email is counted as a distinct contributor.\n", | |
"--\n", | |
"-- categories: fast\n", | |
"SELECT\n", | |
" repository_id,\n", | |
" contributor_count\n", | |
"FROM (\n", | |
" SELECT\n", | |
" repository_id,\n", | |
" COUNT(DISTINCT author_email) AS contributor_count\n", | |
" FROM commits\n", | |
" WHERE index <> -1\n", | |
" GROUP BY repository_id\n", | |
") AS q\n", | |
"ORDER BY contributor_count DESC\n", | |
"LIMIT 10\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Q12: Created projects per year" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"+----+-------------+\n", | |
"|year|project_count|\n", | |
"+----+-------------+\n", | |
"|2017| 6|\n", | |
"|2016| 2|\n", | |
"|2015| 1|\n", | |
"|2014| 12|\n", | |
"|2013| 22|\n", | |
"|2012| 23|\n", | |
"|2011| 8|\n", | |
"|2010| 19|\n", | |
"|2009| 3|\n", | |
"|2008| 10|\n", | |
"+----+-------------+\n", | |
"\n" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%sql\n", | |
"-- Approximate number of projects created per year. Creation date is extracted from the\n", | |
"-- first commit in the history of HEAD reference. Forks are deduplicated by counting\n", | |
"-- the hashes of the initial commit instead of repository id.\n", | |
"--\n", | |
"-- categories: ????\n", | |
"SELECT\n", | |
" year,\n", | |
" COUNT(DISTINCT repository_id) AS project_count\n", | |
"FROM (\n", | |
" SELECT\n", | |
" repository_id,\n", | |
" MIN(YEAR(author_date)) AS year\n", | |
" FROM commits\n", | |
" WHERE\n", | |
" SIZE(parents) == 0 AND\n", | |
" reference_name = 'refs/heads/HEAD'\n", | |
" GROUP BY repository_id\n", | |
") AS q\n", | |
"GROUP BY year\n", | |
"ORDER BY year DESC" | |
] | |
} | |
], | |
"metadata": { | |
"celltoolbar": "Edit Metadata", | |
"kernelspec": { | |
"display_name": "Apache Toree - Scala", | |
"language": "scala", | |
"name": "apache_toree_scala" | |
}, | |
"language_info": { | |
"file_extension": ".scala", | |
"name": "scala", | |
"version": "2.11.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment