Skip to content

Instantly share code, notes, and snippets.

@smola
Last active October 4, 2018 11:08
Show Gist options
  • Save smola/7013c2ce00788a36d9b57c66521a340d to your computer and use it in GitHub Desktop.
Save smola/7013c2ce00788a36d9b57c66521a340d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### source{d} reference SQL queries (DRAFT)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"149 repositories successfully loaded"
]
}
],
"source": [
"import tech.sourced.engine._\n",
"\n",
"val engine = Engine(spark, \"hdfs://hdfs-namenode/100-siva\", \"siva\")\n",
"\n",
"spark.registerUDFs\n",
"\n",
"print(s\"${engine.getRepositories.count()} repositories successfully loaded\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------------+--------+\n",
"| key| value|\n",
"+--------------------+--------+\n",
"|spark.sql.files.m...|33554432|\n",
"+--------------------+--------+\n",
"\n"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SET spark.sql.files.maxPartitionBytes=33554432"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Show tables"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------+------------+-----------+\n",
"|database| tableName|isTemporary|\n",
"+--------+------------+-----------+\n",
"| | blobs| true|\n",
"| | commits| true|\n",
"| | references| true|\n",
"| |repositories| true|\n",
"| |tree_entries| true|\n",
"+--------+------------+-----------+\n",
"\n"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SHOW TABLES"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Describe table"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------+-------------+-------+\n",
"| col_name| data_type|comment|\n",
"+--------------+-------------+-------+\n",
"| repository_id| string| null|\n",
"|reference_name| string| null|\n",
"| index| int| null|\n",
"| hash| string| null|\n",
"| message| string| null|\n",
"| parents|array<string>| null|\n",
"| parents_count| int| null|\n",
"| author_email| string| null|\n",
"| author_name| string| null|\n",
"| author_date| timestamp| null|\n",
"+--------------+-------------+-------+\n",
"only showing top 10 rows\n",
"\n"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DESCRIBE commits"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create table of non-fork repository IDs"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"++\n",
"||\n",
"++\n",
"++\n",
"\n"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"CREATE TABLE IF NOT EXISTS repository_ids_no_forks\n",
"USING parquet\n",
"OPTIONS ('compression'='snappy')\n",
"COMMENT 'repository_id of all repositories containing a HEAD reference and excluding forks'\n",
"SELECT\n",
" s.repository_id AS repository_id\n",
"FROM (\n",
" SELECT\n",
" hash,\n",
" MAX(STRUCT(index, repository_id)) AS s\n",
" FROM commits\n",
" WHERE\n",
" index != 1\n",
" AND reference_name = 'refs/heads/HEAD'\n",
" AND SIZE(parents) == 0\n",
" GROUP BY hash\n",
") AS q\n",
"ORDER BY repository_id ASC\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q1: Count repositories"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------+\n",
"|repository_count|\n",
"+----------------+\n",
"| 149|\n",
"+----------------+\n",
"\n"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Count of total repositories\n",
"--\n",
"-- categories: fast\n",
"SELECT\n",
" COUNT(DISTINCT id) AS repository_count\n",
"FROM repositories"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q2: Last commit messages in HEAD for every repository"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------------+\n",
"| message|\n",
"+--------------------+\n",
"| ???????? ?????\n",
"|\n",
"| Merging xml files|\n",
"| log\n",
"|\n",
"|Delete duplicate ...|\n",
"|v1.1.10 New para...|\n",
"|assignment 4 firs...|\n",
"|Update README.md ...|\n",
"|Add extra solder ...|\n",
"|Added all the sle...|\n",
"|trying to fix dra...|\n",
"+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Last commit messages in HEAD for every repository.\n",
"--\n",
"-- categories: fast\n",
"SELECT message\n",
"FROM commits\n",
"WHERE\n",
" reference_name = 'refs/heads/HEAD' AND\n",
" index = 0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q3: All commit messages in HEAD history for every repository"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------------+\n",
"| message|\n",
"+--------------------+\n",
"| ???????? ?????\n",
"|\n",
"|?????? ???? befor...|\n",
"| fix\n",
"|\n",
"| fix\n",
"|\n",
"| fix\n",
"|\n",
"|?????????? ??????...|\n",
"| up\n",
"|\n",
"| UP\n",
"|\n",
"|Merged in xDESTRO...|\n",
"|???????? ???? ???...|\n",
"+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"--- All commit messages in HEAD history for every repository.\n",
"---\n",
"--- categories: fast\n",
"SELECT message\n",
"FROM commits\n",
"WHERE\n",
" reference_name = 'refs/heads/HEAD' AND\n",
" index >= 0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q4: Top 10 repositories by commit count in HEAD"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------------+------------+\n",
"| repository_id|commit_count|\n",
"+--------------------+------------+\n",
"|github.com/bjoeri...| 601|\n",
"|github.com/akoo0o...| 592|\n",
"|github.com/Deathl...| 592|\n",
"|bitbucket.org/lew...| 370|\n",
"|github.com/bigben...| 162|\n",
"|github.com/Christ...| 162|\n",
"|github.com/nickma...| 162|\n",
"|github.com/jzawod...| 150|\n",
"|github.com/johand...| 147|\n",
"|github.com/johhal...| 135|\n",
"+--------------------+------------+\n",
"\n"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Get top 10 repositoryes by commit count in HEAD\n",
"--\n",
"-- categories: ????\n",
"SELECT\n",
" repository_id,\n",
" count(*) AS commit_count\n",
"FROM commits\n",
" WHERE\n",
" reference_name = 'refs/heads/HEAD'\n",
" -- NOTE: This is the magic bit to get the history, which is used underneath by getAllReferenceCommits\n",
" AND index <> -1\n",
"GROUP BY repository_id, reference_name\n",
"ORDER BY commit_count DESC\n",
"LIMIT 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q5: Fork detection (approximate)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------------+-------+--------------------+--------------------+\n",
"| hash|n_repos| main_repository_id| repository_ids|\n",
"+--------------------+-------+--------------------+--------------------+\n",
"|74c998a1251da5cf9...| 9|github.com/nickma...|[github.com/bigbe...|\n",
"|ecd4b4d008bd4498e...| 4|github.com/johand...|[github.com/jiamz...|\n",
"|e51501b76328057c1...| 3|github.com/bjoeri...|[github.com/Death...|\n",
"|6b91410a1bf752662...| 2|github.com/welter...|[github.com/welte...|\n",
"|49b7828ca2edf159f...| 2|github.com/esjewe...|[github.com/dolss...|\n",
"|acb4c30aa3a5952a8...| 2|github.com/ckreut...|[github.com/ckreu...|\n",
"|18e7dcbfc42a83db9...| 2|github.com/Printr...|[github.com/Print...|\n",
"+--------------------+-------+--------------------+--------------------+\n",
"\n"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Fork detection. Returns all repositories grouped if they form a fork network.\n",
"-- The main repository is approximated by picking the one with longest history in HEAD.\n",
"--\n",
"-- categories: ????\n",
"SELECT\n",
" hash,\n",
" n_repos,\n",
" s.repository_id AS main_repository_id,\n",
" repository_ids\n",
"FROM (\n",
" SELECT\n",
" hash,\n",
" COUNT(*) n_repos,\n",
" MAX(STRUCT(index, repository_id)) AS s,\n",
" COLLECT_SET(repository_id) AS repository_ids\n",
" FROM commits\n",
" WHERE\n",
" index != 1\n",
" AND reference_name = 'refs/heads/HEAD'\n",
" AND SIZE(parents) == 0\n",
" GROUP BY hash\n",
") AS q\n",
"WHERE n_repos > 1\n",
"ORDER BY n_repos DESC\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q6: Count repository HEAD"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------+\n",
"|head_count|\n",
"+----------+\n",
"| 106|\n",
"+----------+\n",
"\n"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Count repository HEADs.\n",
"--\n",
"-- categories: fast\n",
"SELECT\n",
" COUNT(DISTINCT repository_ID) AS head_count\n",
"FROM\n",
" references\n",
"WHERE name = 'refs/heads/HEAD'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q7: Count repository HEAD excluding forks"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------+\n",
"|count(1)|\n",
"+--------+\n",
"| 80|\n",
"+--------+\n",
"\n"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Count repository HEADs excluding forks.\n",
"--\n",
"-- categories: ????\n",
"SELECT\n",
" COUNT(*)\n",
"FROM (\n",
"SELECT DISTINCT\n",
" s.repository_id AS repository_id\n",
"FROM (\n",
" SELECT\n",
" hash,\n",
" MAX(STRUCT(index, repository_id)) AS s\n",
" FROM commits\n",
" WHERE\n",
" index != 1\n",
" AND reference_name = 'refs/heads/HEAD'\n",
" AND SIZE(parents) == 0\n",
" GROUP BY hash\n",
") AS q\n",
") AS q2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q8: Repository count by language presence (HEAD, no forks)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------+----------------+\n",
"| language|repository_count|\n",
"+----------+----------------+\n",
"| Unknown| 74|\n",
"| Text| 43|\n",
"| Markdown| 37|\n",
"|JavaScript| 35|\n",
"| HTML| 33|\n",
"| XML| 27|\n",
"| CSS| 27|\n",
"| Ruby| 20|\n",
"| Shell| 17|\n",
"| YAML| 17|\n",
"+----------+----------------+\n",
"only showing top 10 rows\n",
"\n"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Repository count per language precense.\n",
"-- If any file in language X is present in HEAD of a repository,\n",
"-- it contributes one for the language count.\n",
"-- Forks are excluded.\n",
"--\n",
"-- category: slow,enry\n",
"SELECT\n",
" language,\n",
" COUNT(repository_id) AS repository_count\n",
"FROM (\n",
" SELECT DISTINCT\n",
" t.repository_id AS repository_id,\n",
" COALESCE(\n",
" classifyLanguages(b.is_binary, t.path, b.content),\n",
" 'Unknown') AS language\n",
" FROM\n",
" tree_entries t\n",
" JOIN blobs b ON\n",
" t.repository_id = b.repository_id AND\n",
" t.reference_name = b.reference_name AND\n",
" t.blob = b.blob_id\n",
" JOIN repository_ids_no_forks r ON\n",
" b.repository_id = r.repository_id\n",
" WHERE\n",
" t.reference_name = 'refs/heads/HEAD'\n",
" ) AS q2\n",
"GROUP BY language\n",
"ORDER BY repository_count DESC"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q9: Top 50 tokens in files by repository count (HEAD, no forks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%sql\n",
"-- Top 100 tokens by repository count. Each token counted once per repository.\n",
"--\n",
"-- category: slow\n",
"SELECT\n",
" token,\n",
" COUNT(DISTINCT repository_id) AS repository_count\n",
"FROM (\n",
" SELECT\n",
" b.repository_id,\n",
" EXPLODE(SPLIT(b.content, '[^_A-Za-z0-9]+')) AS token\n",
" FROM\n",
" blobs b\n",
" JOIN repository_ids_no_forks r ON\n",
" b.repository_id = r.repository_id\n",
" WHERE\n",
" reference_name = 'refs/heads/HEAD' AND\n",
" is_binary = false AND\n",
" LENGTH(content) < 524288\n",
") AS q\n",
"GROUP BY token\n",
"ORDER BY repository_count DESC\n",
"LIMIT 50"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q10: Top 50 identifiers in files by repository count (HEAD, no forks)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------+----------------+\n",
"|identifier|repository_count|\n",
"+----------+----------------+\n",
"| | 51|\n",
"| String| 48|\n",
"| java| 46|\n",
"| util| 45|\n",
"| i| 41|\n",
"| io| 39|\n",
"| Override| 38|\n",
"| get| 38|\n",
"| toString| 38|\n",
"| e| 38|\n",
"+----------+----------------+\n",
"only showing top 10 rows\n",
"\n"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Top 100 identifier by repository count. Each identifier counted once per repository.\n",
"--\n",
"-- category: slow,babelfish\n",
"SELECT\n",
" identifier,\n",
" COUNT(DISTINCT repository_id) AS repository_count\n",
"FROM (\n",
" SELECT\n",
" b.repository_id AS repository_id,\n",
" EXPLODE(extractTokens(queryXPath(\n",
" extractUASTs(t.path, b.content, classifyLanguages(b.is_binary, t.path, b.content)),\n",
" '//*[@roleIdentifier and not(@roleIncomplete)]'))) AS identifier\n",
" FROM\n",
" tree_entries t\n",
" JOIN blobs b ON\n",
" t.repository_id = b.repository_id AND\n",
" t.reference_name = b.reference_name AND\n",
" t.blob = b.blob_id\n",
" JOIN repository_ids_no_forks r ON\n",
" b.repository_id = r.repository_id\n",
" WHERE\n",
" b.reference_name = 'refs/heads/HEAD' AND\n",
" b.is_binary = false AND\n",
" LENGTH(b.content) < 524288 AND\n",
" REGEXP_EXTRACT(t.path, '\\.[^.]+$', 0) IN ('.java', '.py', '.go')\n",
") AS q\n",
"GROUP BY identifier\n",
"ORDER BY repository_count DESC\n",
"LIMIT 50"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q11: Top 10 repositories by contributor count (all branches)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+--------------------+-----------------+\n",
"| repository_id|contributor_count|\n",
"+--------------------+-----------------+\n",
"|github.com/nickma...| 34|\n",
"|github.com/Christ...| 34|\n",
"|github.com/bigben...| 34|\n",
"|github.com/ckreut...| 16|\n",
"|github.com/johand...| 13|\n",
"|github.com/bjoeri...| 11|\n",
"|github.com/pelarg...| 10|\n",
"|github.com/akoo0o...| 10|\n",
"|github.com/Deathl...| 10|\n",
"|github.com/alfons...| 9|\n",
"+--------------------+-----------------+\n",
"\n"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Top 10 repositories by contributor count in all branches.\n",
"-- Each distinct email is counted as a distinct contributor.\n",
"--\n",
"-- categories: fast\n",
"SELECT\n",
" repository_id,\n",
" contributor_count\n",
"FROM (\n",
" SELECT\n",
" repository_id,\n",
" COUNT(DISTINCT author_email) AS contributor_count\n",
" FROM commits\n",
" WHERE index <> -1\n",
" GROUP BY repository_id\n",
") AS q\n",
"ORDER BY contributor_count DESC\n",
"LIMIT 10\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q12: Created projects per year"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----+-------------+\n",
"|year|project_count|\n",
"+----+-------------+\n",
"|2017| 6|\n",
"|2016| 2|\n",
"|2015| 1|\n",
"|2014| 12|\n",
"|2013| 22|\n",
"|2012| 23|\n",
"|2011| 8|\n",
"|2010| 19|\n",
"|2009| 3|\n",
"|2008| 10|\n",
"+----+-------------+\n",
"\n"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"-- Approximate number of projects created per year. Creation date is extracted from the\n",
"-- first commit in the history of HEAD reference. Forks are deduplicated by counting\n",
"-- the hashes of the initial commit instead of repository id.\n",
"--\n",
"-- categories: ????\n",
"SELECT\n",
" year,\n",
" COUNT(DISTINCT repository_id) AS project_count\n",
"FROM (\n",
" SELECT\n",
" repository_id,\n",
" MIN(YEAR(author_date)) AS year\n",
" FROM commits\n",
" WHERE\n",
" SIZE(parents) == 0 AND\n",
" reference_name = 'refs/heads/HEAD'\n",
" GROUP BY repository_id\n",
") AS q\n",
"GROUP BY year\n",
"ORDER BY year DESC"
]
}
],
"metadata": {
"celltoolbar": "Edit Metadata",
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"file_extension": ".scala",
"name": "scala",
"version": "2.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment