smola/reference_queries.ipynb

## reference_queries.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### source{d} reference SQL queries (DRAFT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "149 repositories successfully loaded"
     ]
    }
   ],
   "source": [
    "import tech.sourced.engine._\n",
    "\n",
    "val engine = Engine(spark, \"hdfs://hdfs-namenode/100-siva\", \"siva\")\n",
    "\n",
    "spark.registerUDFs\n",
    "\n",
    "print(s\"${engine.getRepositories.count()} repositories successfully loaded\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------------+--------+\n",
       "|                 key|   value|\n",
       "+--------------------+--------+\n",
       "|spark.sql.files.m...|33554432|\n",
       "+--------------------+--------+\n",
       "\n"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "SET spark.sql.files.maxPartitionBytes=33554432"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Show tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------+------------+-----------+\n",
       "|database|   tableName|isTemporary|\n",
       "+--------+------------+-----------+\n",
       "|        |       blobs|       true|\n",
       "|        |     commits|       true|\n",
       "|        |  references|       true|\n",
       "|        |repositories|       true|\n",
       "|        |tree_entries|       true|\n",
       "+--------+------------+-----------+\n",
       "\n"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "SHOW TABLES"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Describe table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------+-------------+-------+\n",
       "|      col_name|    data_type|comment|\n",
       "+--------------+-------------+-------+\n",
       "| repository_id|       string|   null|\n",
       "|reference_name|       string|   null|\n",
       "|         index|          int|   null|\n",
       "|          hash|       string|   null|\n",
       "|       message|       string|   null|\n",
       "|       parents|array<string>|   null|\n",
       "| parents_count|          int|   null|\n",
       "|  author_email|       string|   null|\n",
       "|   author_name|       string|   null|\n",
       "|   author_date|    timestamp|   null|\n",
       "+--------------+-------------+-------+\n",
       "only showing top 10 rows\n",
       "\n"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "DESCRIBE commits"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create table of non-fork repository IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++\n",
       "\n"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "CREATE TABLE IF NOT EXISTS repository_ids_no_forks\n",
    "USING parquet\n",
    "OPTIONS ('compression'='snappy')\n",
    "COMMENT 'repository_id of all repositories containing a HEAD reference and excluding forks'\n",
    "SELECT\n",
    "    s.repository_id AS repository_id\n",
    "FROM (\n",
    "    SELECT\n",
    "        hash,\n",
    "        MAX(STRUCT(index, repository_id)) AS s\n",
    "    FROM commits\n",
    "    WHERE\n",
    "        index != 1\n",
    "        AND reference_name = 'refs/heads/HEAD'\n",
    "        AND SIZE(parents) == 0\n",
    "    GROUP BY hash\n",
    ") AS q\n",
    "ORDER BY repository_id ASC\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q1: Count repositories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----------------+\n",
       "|repository_count|\n",
       "+----------------+\n",
       "|             149|\n",
       "+----------------+\n",
       "\n"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Count of total repositories\n",
    "--\n",
    "-- categories: fast\n",
    "SELECT\n",
    "    COUNT(DISTINCT id) AS repository_count\n",
    "FROM repositories"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q2: Last commit messages in HEAD for every repository"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------------+\n",
       "|             message|\n",
       "+--------------------+\n",
       "|     ???????? ?????\n",
       "|\n",
       "|   Merging xml files|\n",
       "|                log\n",
       "|\n",
       "|Delete duplicate ...|\n",
       "|v1.1.10  New para...|\n",
       "|assignment 4 firs...|\n",
       "|Update README.md ...|\n",
       "|Add extra solder ...|\n",
       "|Added all the sle...|\n",
       "|trying to fix dra...|\n",
       "+--------------------+\n",
       "only showing top 10 rows\n",
       "\n"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Last commit messages in HEAD for every repository.\n",
    "--\n",
    "-- categories: fast\n",
    "SELECT message\n",
    "FROM commits\n",
    "WHERE\n",
    "    reference_name = 'refs/heads/HEAD' AND\n",
    "    index = 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q3: All commit messages in HEAD history for every repository"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------------+\n",
       "|             message|\n",
       "+--------------------+\n",
       "|     ???????? ?????\n",
       "|\n",
       "|?????? ???? befor...|\n",
       "|                fix\n",
       "|\n",
       "|                fix\n",
       "|\n",
       "|                fix\n",
       "|\n",
       "|?????????? ??????...|\n",
       "|                 up\n",
       "|\n",
       "|                 UP\n",
       "|\n",
       "|Merged in xDESTRO...|\n",
       "|???????? ???? ???...|\n",
       "+--------------------+\n",
       "only showing top 10 rows\n",
       "\n"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "--- All commit messages in HEAD history for every repository.\n",
    "---\n",
    "--- categories: fast\n",
    "SELECT message\n",
    "FROM commits\n",
    "WHERE\n",
    "    reference_name = 'refs/heads/HEAD' AND\n",
    "    index >= 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q4: Top 10 repositories by commit count in HEAD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------------+------------+\n",
       "|       repository_id|commit_count|\n",
       "+--------------------+------------+\n",
       "|github.com/bjoeri...|         601|\n",
       "|github.com/akoo0o...|         592|\n",
       "|github.com/Deathl...|         592|\n",
       "|bitbucket.org/lew...|         370|\n",
       "|github.com/bigben...|         162|\n",
       "|github.com/Christ...|         162|\n",
       "|github.com/nickma...|         162|\n",
       "|github.com/jzawod...|         150|\n",
       "|github.com/johand...|         147|\n",
       "|github.com/johhal...|         135|\n",
       "+--------------------+------------+\n",
       "\n"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Get top 10 repositoryes by commit count in HEAD\n",
    "--\n",
    "-- categories: ????\n",
    "SELECT\n",
    "    repository_id,\n",
    "    count(*) AS commit_count\n",
    "FROM commits\n",
    " WHERE\n",
    "    reference_name = 'refs/heads/HEAD'\n",
    "    -- NOTE: This is the magic bit to get the history, which is used underneath by getAllReferenceCommits\n",
    "    AND index <> -1\n",
    "GROUP BY repository_id, reference_name\n",
    "ORDER BY commit_count DESC\n",
    "LIMIT 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q5: Fork detection (approximate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------------+-------+--------------------+--------------------+\n",
       "|                hash|n_repos|  main_repository_id|      repository_ids|\n",
       "+--------------------+-------+--------------------+--------------------+\n",
       "|74c998a1251da5cf9...|      9|github.com/nickma...|[github.com/bigbe...|\n",
       "|ecd4b4d008bd4498e...|      4|github.com/johand...|[github.com/jiamz...|\n",
       "|e51501b76328057c1...|      3|github.com/bjoeri...|[github.com/Death...|\n",
       "|6b91410a1bf752662...|      2|github.com/welter...|[github.com/welte...|\n",
       "|49b7828ca2edf159f...|      2|github.com/esjewe...|[github.com/dolss...|\n",
       "|acb4c30aa3a5952a8...|      2|github.com/ckreut...|[github.com/ckreu...|\n",
       "|18e7dcbfc42a83db9...|      2|github.com/Printr...|[github.com/Print...|\n",
       "+--------------------+-------+--------------------+--------------------+\n",
       "\n"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Fork detection. Returns all repositories grouped if they form a fork network.\n",
    "-- The main repository is approximated by picking the one with longest history in HEAD.\n",
    "--\n",
    "-- categories: ????\n",
    "SELECT\n",
    "    hash,\n",
    "    n_repos,\n",
    "    s.repository_id AS main_repository_id,\n",
    "    repository_ids\n",
    "FROM (\n",
    "    SELECT\n",
    "        hash,\n",
    "        COUNT(*) n_repos,\n",
    "        MAX(STRUCT(index, repository_id)) AS s,\n",
    "        COLLECT_SET(repository_id) AS repository_ids\n",
    "    FROM commits\n",
    "    WHERE\n",
    "        index != 1\n",
    "        AND reference_name = 'refs/heads/HEAD'\n",
    "        AND SIZE(parents) == 0\n",
    "    GROUP BY hash\n",
    ") AS q\n",
    "WHERE n_repos > 1\n",
    "ORDER BY n_repos DESC\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q6: Count repository HEAD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----------+\n",
       "|head_count|\n",
       "+----------+\n",
       "|       106|\n",
       "+----------+\n",
       "\n"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Count repository HEADs.\n",
    "--\n",
    "-- categories: fast\n",
    "SELECT\n",
    "    COUNT(DISTINCT repository_ID) AS head_count\n",
    "FROM\n",
    "    references\n",
    "WHERE name = 'refs/heads/HEAD'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q7: Count repository HEAD excluding forks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------+\n",
       "|count(1)|\n",
       "+--------+\n",
       "|      80|\n",
       "+--------+\n",
       "\n"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Count repository HEADs excluding forks.\n",
    "--\n",
    "-- categories: ????\n",
    "SELECT\n",
    "    COUNT(*)\n",
    "FROM (\n",
    "SELECT DISTINCT\n",
    "     s.repository_id AS repository_id\n",
    "FROM (\n",
    "    SELECT\n",
    "        hash,\n",
    "        MAX(STRUCT(index, repository_id)) AS s\n",
    "    FROM commits\n",
    "    WHERE\n",
    "        index != 1\n",
    "        AND reference_name = 'refs/heads/HEAD'\n",
    "        AND SIZE(parents) == 0\n",
    "    GROUP BY hash\n",
    ") AS q\n",
    ") AS q2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q8: Repository count by language presence (HEAD, no forks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----------+----------------+\n",
       "|  language|repository_count|\n",
       "+----------+----------------+\n",
       "|   Unknown|              74|\n",
       "|      Text|              43|\n",
       "|  Markdown|              37|\n",
       "|JavaScript|              35|\n",
       "|      HTML|              33|\n",
       "|       XML|              27|\n",
       "|       CSS|              27|\n",
       "|      Ruby|              20|\n",
       "|     Shell|              17|\n",
       "|      YAML|              17|\n",
       "+----------+----------------+\n",
       "only showing top 10 rows\n",
       "\n"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Repository count per language precense.\n",
    "-- If any file in language X is present in HEAD of a repository,\n",
    "-- it contributes one for the language count.\n",
    "-- Forks are excluded.\n",
    "--\n",
    "-- category: slow,enry\n",
    "SELECT\n",
    "    language,\n",
    "    COUNT(repository_id) AS repository_count\n",
    "FROM (\n",
    "    SELECT DISTINCT\n",
    "        t.repository_id AS repository_id,\n",
    "        COALESCE(\n",
    "            classifyLanguages(b.is_binary, t.path, b.content),\n",
    "            'Unknown') AS language\n",
    "    FROM\n",
    "        tree_entries t\n",
    "        JOIN blobs b ON\n",
    "            t.repository_id = b.repository_id AND\n",
    "            t.reference_name = b.reference_name AND\n",
    "            t.blob = b.blob_id\n",
    "        JOIN repository_ids_no_forks r ON\n",
    "            b.repository_id = r.repository_id\n",
    "    WHERE\n",
    "        t.reference_name = 'refs/heads/HEAD'\n",
    "    ) AS q2\n",
    "GROUP BY language\n",
    "ORDER BY repository_count DESC"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q9: Top 50 tokens in files by repository count (HEAD, no forks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sql\n",
    "-- Top 100 tokens by repository count. Each token counted once per repository.\n",
    "--\n",
    "-- category: slow\n",
    "SELECT\n",
    "    token,\n",
    "    COUNT(DISTINCT repository_id) AS repository_count\n",
    "FROM (\n",
    "    SELECT\n",
    "        b.repository_id,\n",
    "        EXPLODE(SPLIT(b.content, '[^_A-Za-z0-9]+')) AS token\n",
    "    FROM\n",
    "        blobs b\n",
    "        JOIN repository_ids_no_forks r ON\n",
    "            b.repository_id = r.repository_id\n",
    "    WHERE\n",
    "        reference_name = 'refs/heads/HEAD' AND\n",
    "        is_binary = false AND\n",
    "        LENGTH(content) < 524288\n",
    ") AS q\n",
    "GROUP BY token\n",
    "ORDER BY repository_count DESC\n",
    "LIMIT 50"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q10: Top 50 identifiers in files by repository count (HEAD, no forks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----------+----------------+\n",
       "|identifier|repository_count|\n",
       "+----------+----------------+\n",
       "|          |              51|\n",
       "|    String|              48|\n",
       "|      java|              46|\n",
       "|      util|              45|\n",
       "|         i|              41|\n",
       "|        io|              39|\n",
       "|  Override|              38|\n",
       "|       get|              38|\n",
       "|  toString|              38|\n",
       "|         e|              38|\n",
       "+----------+----------------+\n",
       "only showing top 10 rows\n",
       "\n"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Top 100 identifier by repository count. Each identifier counted once per repository.\n",
    "--\n",
    "-- category: slow,babelfish\n",
    "SELECT\n",
    "    identifier,\n",
    "    COUNT(DISTINCT repository_id) AS repository_count\n",
    "FROM (\n",
    "    SELECT\n",
    "        b.repository_id AS repository_id,\n",
    "        EXPLODE(extractTokens(queryXPath(\n",
    "            extractUASTs(t.path, b.content, classifyLanguages(b.is_binary, t.path, b.content)),\n",
    "            '//*[@roleIdentifier and not(@roleIncomplete)]'))) AS identifier\n",
    "    FROM\n",
    "        tree_entries t\n",
    "        JOIN blobs b ON\n",
    "            t.repository_id = b.repository_id AND\n",
    "            t.reference_name = b.reference_name AND\n",
    "            t.blob = b.blob_id\n",
    "        JOIN repository_ids_no_forks r ON\n",
    "            b.repository_id = r.repository_id\n",
    "    WHERE\n",
    "        b.reference_name = 'refs/heads/HEAD' AND\n",
    "        b.is_binary = false AND\n",
    "        LENGTH(b.content) < 524288 AND\n",
    "        REGEXP_EXTRACT(t.path, '\\.[^.]+$', 0) IN ('.java', '.py', '.go')\n",
    ") AS q\n",
    "GROUP BY identifier\n",
    "ORDER BY repository_count DESC\n",
    "LIMIT 50"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q11: Top 10 repositories by contributor count (all branches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+--------------------+-----------------+\n",
       "|       repository_id|contributor_count|\n",
       "+--------------------+-----------------+\n",
       "|github.com/nickma...|               34|\n",
       "|github.com/Christ...|               34|\n",
       "|github.com/bigben...|               34|\n",
       "|github.com/ckreut...|               16|\n",
       "|github.com/johand...|               13|\n",
       "|github.com/bjoeri...|               11|\n",
       "|github.com/pelarg...|               10|\n",
       "|github.com/akoo0o...|               10|\n",
       "|github.com/Deathl...|               10|\n",
       "|github.com/alfons...|                9|\n",
       "+--------------------+-----------------+\n",
       "\n"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Top 10 repositories by contributor count in all branches.\n",
    "-- Each distinct email is counted as a distinct contributor.\n",
    "--\n",
    "-- categories: fast\n",
    "SELECT\n",
    "    repository_id,\n",
    "    contributor_count\n",
    "FROM (\n",
    "    SELECT\n",
    "        repository_id,\n",
    "        COUNT(DISTINCT author_email) AS contributor_count\n",
    "    FROM commits\n",
    "    WHERE index <> -1\n",
    "    GROUP BY repository_id\n",
    ") AS q\n",
    "ORDER BY contributor_count DESC\n",
    "LIMIT 10\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q12: Created projects per year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----+-------------+\n",
       "|year|project_count|\n",
       "+----+-------------+\n",
       "|2017|            6|\n",
       "|2016|            2|\n",
       "|2015|            1|\n",
       "|2014|           12|\n",
       "|2013|           22|\n",
       "|2012|           23|\n",
       "|2011|            8|\n",
       "|2010|           19|\n",
       "|2009|            3|\n",
       "|2008|           10|\n",
       "+----+-------------+\n",
       "\n"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "-- Approximate number of projects created per year. Creation date is extracted from the\n",
    "-- first commit in the history of HEAD reference. Forks are deduplicated by counting\n",
    "-- the hashes of the initial commit instead of repository id.\n",
    "--\n",
    "-- categories: ????\n",
    "SELECT\n",
    "    year,\n",
    "    COUNT(DISTINCT repository_id) AS project_count\n",
    "FROM (\n",
    "    SELECT\n",
    "        repository_id,\n",
    "        MIN(YEAR(author_date)) AS year\n",
    "    FROM commits\n",
    "    WHERE\n",
    "        SIZE(parents) == 0 AND\n",
    "        reference_name = 'refs/heads/HEAD'\n",
    "    GROUP BY repository_id\n",
    ") AS q\n",
    "GROUP BY year\n",
    "ORDER BY year DESC"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Edit Metadata",
  "kernelspec": {
   "display_name": "Apache Toree - Scala",
   "language": "scala",
   "name": "apache_toree_scala"
  },
  "language_info": {
   "file_extension": ".scala",
   "name": "scala",
   "version": "2.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### source{d} reference SQL queries (DRAFT)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"149 repositories successfully loaded"
	]
	}
	],
	"source": [
	"import tech.sourced.engine._\n",
	"\n",
	"val engine = Engine(spark, \"hdfs://hdfs-namenode/100-siva\", \"siva\")\n",
	"\n",
	"spark.registerUDFs\n",
	"\n",
	"print(s\"${engine.getRepositories.count()} repositories successfully loaded\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------------+--------+\n",
	"\| key\| value\|\n",
	"+--------------------+--------+\n",
	"\|spark.sql.files.m...\|33554432\|\n",
	"+--------------------+--------+\n",
	"\n"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"SET spark.sql.files.maxPartitionBytes=33554432"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Show tables"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------+------------+-----------+\n",
	"\|database\| tableName\|isTemporary\|\n",
	"+--------+------------+-----------+\n",
	"\| \| blobs\| true\|\n",
	"\| \| commits\| true\|\n",
	"\| \| references\| true\|\n",
	"\| \|repositories\| true\|\n",
	"\| \|tree_entries\| true\|\n",
	"+--------+------------+-----------+\n",
	"\n"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"SHOW TABLES"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Describe table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------+-------------+-------+\n",
	"\| col_name\| data_type\|comment\|\n",
	"+--------------+-------------+-------+\n",
	"\| repository_id\| string\| null\|\n",
	"\|reference_name\| string\| null\|\n",
	"\| index\| int\| null\|\n",
	"\| hash\| string\| null\|\n",
	"\| message\| string\| null\|\n",
	"\| parents\|array<string>\| null\|\n",
	"\| parents_count\| int\| null\|\n",
	"\| author_email\| string\| null\|\n",
	"\| author_name\| string\| null\|\n",
	"\| author_date\| timestamp\| null\|\n",
	"+--------------+-------------+-------+\n",
	"only showing top 10 rows\n",
	"\n"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"DESCRIBE commits"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Create table of non-fork repository IDs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"++\n",
	"\|\|\n",
	"++\n",
	"++\n",
	"\n"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"CREATE TABLE IF NOT EXISTS repository_ids_no_forks\n",
	"USING parquet\n",
	"OPTIONS ('compression'='snappy')\n",
	"COMMENT 'repository_id of all repositories containing a HEAD reference and excluding forks'\n",
	"SELECT\n",
	" s.repository_id AS repository_id\n",
	"FROM (\n",
	" SELECT\n",
	" hash,\n",
	" MAX(STRUCT(index, repository_id)) AS s\n",
	" FROM commits\n",
	" WHERE\n",
	" index != 1\n",
	" AND reference_name = 'refs/heads/HEAD'\n",
	" AND SIZE(parents) == 0\n",
	" GROUP BY hash\n",
	") AS q\n",
	"ORDER BY repository_id ASC\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q1: Count repositories"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+----------------+\n",
	"\|repository_count\|\n",
	"+----------------+\n",
	"\| 149\|\n",
	"+----------------+\n",
	"\n"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Count of total repositories\n",
	"--\n",
	"-- categories: fast\n",
	"SELECT\n",
	" COUNT(DISTINCT id) AS repository_count\n",
	"FROM repositories"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q2: Last commit messages in HEAD for every repository"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------------+\n",
	"\| message\|\n",
	"+--------------------+\n",
	"\| ???????? ?????\n",
	"\|\n",
	"\| Merging xml files\|\n",
	"\| log\n",
	"\|\n",
	"\|Delete duplicate ...\|\n",
	"\|v1.1.10 New para...\|\n",
	"\|assignment 4 firs...\|\n",
	"\|Update README.md ...\|\n",
	"\|Add extra solder ...\|\n",
	"\|Added all the sle...\|\n",
	"\|trying to fix dra...\|\n",
	"+--------------------+\n",
	"only showing top 10 rows\n",
	"\n"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Last commit messages in HEAD for every repository.\n",
	"--\n",
	"-- categories: fast\n",
	"SELECT message\n",
	"FROM commits\n",
	"WHERE\n",
	" reference_name = 'refs/heads/HEAD' AND\n",
	" index = 0"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q3: All commit messages in HEAD history for every repository"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------------+\n",
	"\| message\|\n",
	"+--------------------+\n",
	"\| ???????? ?????\n",
	"\|\n",
	"\|?????? ???? befor...\|\n",
	"\| fix\n",
	"\|\n",
	"\| fix\n",
	"\|\n",
	"\| fix\n",
	"\|\n",
	"\|?????????? ??????...\|\n",
	"\| up\n",
	"\|\n",
	"\| UP\n",
	"\|\n",
	"\|Merged in xDESTRO...\|\n",
	"\|???????? ???? ???...\|\n",
	"+--------------------+\n",
	"only showing top 10 rows\n",
	"\n"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"--- All commit messages in HEAD history for every repository.\n",
	"---\n",
	"--- categories: fast\n",
	"SELECT message\n",
	"FROM commits\n",
	"WHERE\n",
	" reference_name = 'refs/heads/HEAD' AND\n",
	" index >= 0"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q4: Top 10 repositories by commit count in HEAD"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------------+------------+\n",
	"\| repository_id\|commit_count\|\n",
	"+--------------------+------------+\n",
	"\|github.com/bjoeri...\| 601\|\n",
	"\|github.com/akoo0o...\| 592\|\n",
	"\|github.com/Deathl...\| 592\|\n",
	"\|bitbucket.org/lew...\| 370\|\n",
	"\|github.com/bigben...\| 162\|\n",
	"\|github.com/Christ...\| 162\|\n",
	"\|github.com/nickma...\| 162\|\n",
	"\|github.com/jzawod...\| 150\|\n",
	"\|github.com/johand...\| 147\|\n",
	"\|github.com/johhal...\| 135\|\n",
	"+--------------------+------------+\n",
	"\n"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Get top 10 repositoryes by commit count in HEAD\n",
	"--\n",
	"-- categories: ????\n",
	"SELECT\n",
	" repository_id,\n",
	" count(*) AS commit_count\n",
	"FROM commits\n",
	" WHERE\n",
	" reference_name = 'refs/heads/HEAD'\n",
	" -- NOTE: This is the magic bit to get the history, which is used underneath by getAllReferenceCommits\n",
	" AND index <> -1\n",
	"GROUP BY repository_id, reference_name\n",
	"ORDER BY commit_count DESC\n",
	"LIMIT 10"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q5: Fork detection (approximate)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------------+-------+--------------------+--------------------+\n",
	"\| hash\|n_repos\| main_repository_id\| repository_ids\|\n",
	"+--------------------+-------+--------------------+--------------------+\n",
	"\|74c998a1251da5cf9...\| 9\|github.com/nickma...\|[github.com/bigbe...\|\n",
	"\|ecd4b4d008bd4498e...\| 4\|github.com/johand...\|[github.com/jiamz...\|\n",
	"\|e51501b76328057c1...\| 3\|github.com/bjoeri...\|[github.com/Death...\|\n",
	"\|6b91410a1bf752662...\| 2\|github.com/welter...\|[github.com/welte...\|\n",
	"\|49b7828ca2edf159f...\| 2\|github.com/esjewe...\|[github.com/dolss...\|\n",
	"\|acb4c30aa3a5952a8...\| 2\|github.com/ckreut...\|[github.com/ckreu...\|\n",
	"\|18e7dcbfc42a83db9...\| 2\|github.com/Printr...\|[github.com/Print...\|\n",
	"+--------------------+-------+--------------------+--------------------+\n",
	"\n"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Fork detection. Returns all repositories grouped if they form a fork network.\n",
	"-- The main repository is approximated by picking the one with longest history in HEAD.\n",
	"--\n",
	"-- categories: ????\n",
	"SELECT\n",
	" hash,\n",
	" n_repos,\n",
	" s.repository_id AS main_repository_id,\n",
	" repository_ids\n",
	"FROM (\n",
	" SELECT\n",
	" hash,\n",
	" COUNT(*) n_repos,\n",
	" MAX(STRUCT(index, repository_id)) AS s,\n",
	" COLLECT_SET(repository_id) AS repository_ids\n",
	" FROM commits\n",
	" WHERE\n",
	" index != 1\n",
	" AND reference_name = 'refs/heads/HEAD'\n",
	" AND SIZE(parents) == 0\n",
	" GROUP BY hash\n",
	") AS q\n",
	"WHERE n_repos > 1\n",
	"ORDER BY n_repos DESC\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q6: Count repository HEAD"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+----------+\n",
	"\|head_count\|\n",
	"+----------+\n",
	"\| 106\|\n",
	"+----------+\n",
	"\n"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Count repository HEADs.\n",
	"--\n",
	"-- categories: fast\n",
	"SELECT\n",
	" COUNT(DISTINCT repository_ID) AS head_count\n",
	"FROM\n",
	" references\n",
	"WHERE name = 'refs/heads/HEAD'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q7: Count repository HEAD excluding forks"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------+\n",
	"\|count(1)\|\n",
	"+--------+\n",
	"\| 80\|\n",
	"+--------+\n",
	"\n"
	]
	},
	"execution_count": 50,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Count repository HEADs excluding forks.\n",
	"--\n",
	"-- categories: ????\n",
	"SELECT\n",
	" COUNT(*)\n",
	"FROM (\n",
	"SELECT DISTINCT\n",
	" s.repository_id AS repository_id\n",
	"FROM (\n",
	" SELECT\n",
	" hash,\n",
	" MAX(STRUCT(index, repository_id)) AS s\n",
	" FROM commits\n",
	" WHERE\n",
	" index != 1\n",
	" AND reference_name = 'refs/heads/HEAD'\n",
	" AND SIZE(parents) == 0\n",
	" GROUP BY hash\n",
	") AS q\n",
	") AS q2"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q8: Repository count by language presence (HEAD, no forks)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+----------+----------------+\n",
	"\| language\|repository_count\|\n",
	"+----------+----------------+\n",
	"\| Unknown\| 74\|\n",
	"\| Text\| 43\|\n",
	"\| Markdown\| 37\|\n",
	"\|JavaScript\| 35\|\n",
	"\| HTML\| 33\|\n",
	"\| XML\| 27\|\n",
	"\| CSS\| 27\|\n",
	"\| Ruby\| 20\|\n",
	"\| Shell\| 17\|\n",
	"\| YAML\| 17\|\n",
	"+----------+----------------+\n",
	"only showing top 10 rows\n",
	"\n"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Repository count per language precense.\n",
	"-- If any file in language X is present in HEAD of a repository,\n",
	"-- it contributes one for the language count.\n",
	"-- Forks are excluded.\n",
	"--\n",
	"-- category: slow,enry\n",
	"SELECT\n",
	" language,\n",
	" COUNT(repository_id) AS repository_count\n",
	"FROM (\n",
	" SELECT DISTINCT\n",
	" t.repository_id AS repository_id,\n",
	" COALESCE(\n",
	" classifyLanguages(b.is_binary, t.path, b.content),\n",
	" 'Unknown') AS language\n",
	" FROM\n",
	" tree_entries t\n",
	" JOIN blobs b ON\n",
	" t.repository_id = b.repository_id AND\n",
	" t.reference_name = b.reference_name AND\n",
	" t.blob = b.blob_id\n",
	" JOIN repository_ids_no_forks r ON\n",
	" b.repository_id = r.repository_id\n",
	" WHERE\n",
	" t.reference_name = 'refs/heads/HEAD'\n",
	" ) AS q2\n",
	"GROUP BY language\n",
	"ORDER BY repository_count DESC"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q9: Top 50 tokens in files by repository count (HEAD, no forks)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%sql\n",
	"-- Top 100 tokens by repository count. Each token counted once per repository.\n",
	"--\n",
	"-- category: slow\n",
	"SELECT\n",
	" token,\n",
	" COUNT(DISTINCT repository_id) AS repository_count\n",
	"FROM (\n",
	" SELECT\n",
	" b.repository_id,\n",
	" EXPLODE(SPLIT(b.content, '[^_A-Za-z0-9]+')) AS token\n",
	" FROM\n",
	" blobs b\n",
	" JOIN repository_ids_no_forks r ON\n",
	" b.repository_id = r.repository_id\n",
	" WHERE\n",
	" reference_name = 'refs/heads/HEAD' AND\n",
	" is_binary = false AND\n",
	" LENGTH(content) < 524288\n",
	") AS q\n",
	"GROUP BY token\n",
	"ORDER BY repository_count DESC\n",
	"LIMIT 50"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q10: Top 50 identifiers in files by repository count (HEAD, no forks)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+----------+----------------+\n",
	"\|identifier\|repository_count\|\n",
	"+----------+----------------+\n",
	"\| \| 51\|\n",
	"\| String\| 48\|\n",
	"\| java\| 46\|\n",
	"\| util\| 45\|\n",
	"\| i\| 41\|\n",
	"\| io\| 39\|\n",
	"\| Override\| 38\|\n",
	"\| get\| 38\|\n",
	"\| toString\| 38\|\n",
	"\| e\| 38\|\n",
	"+----------+----------------+\n",
	"only showing top 10 rows\n",
	"\n"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Top 100 identifier by repository count. Each identifier counted once per repository.\n",
	"--\n",
	"-- category: slow,babelfish\n",
	"SELECT\n",
	" identifier,\n",
	" COUNT(DISTINCT repository_id) AS repository_count\n",
	"FROM (\n",
	" SELECT\n",
	" b.repository_id AS repository_id,\n",
	" EXPLODE(extractTokens(queryXPath(\n",
	" extractUASTs(t.path, b.content, classifyLanguages(b.is_binary, t.path, b.content)),\n",
	" '//*[@roleIdentifier and not(@roleIncomplete)]'))) AS identifier\n",
	" FROM\n",
	" tree_entries t\n",
	" JOIN blobs b ON\n",
	" t.repository_id = b.repository_id AND\n",
	" t.reference_name = b.reference_name AND\n",
	" t.blob = b.blob_id\n",
	" JOIN repository_ids_no_forks r ON\n",
	" b.repository_id = r.repository_id\n",
	" WHERE\n",
	" b.reference_name = 'refs/heads/HEAD' AND\n",
	" b.is_binary = false AND\n",
	" LENGTH(b.content) < 524288 AND\n",
	" REGEXP_EXTRACT(t.path, '\\.[^.]+$', 0) IN ('.java', '.py', '.go')\n",
	") AS q\n",
	"GROUP BY identifier\n",
	"ORDER BY repository_count DESC\n",
	"LIMIT 50"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q11: Top 10 repositories by contributor count (all branches)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+--------------------+-----------------+\n",
	"\| repository_id\|contributor_count\|\n",
	"+--------------------+-----------------+\n",
	"\|github.com/nickma...\| 34\|\n",
	"\|github.com/Christ...\| 34\|\n",
	"\|github.com/bigben...\| 34\|\n",
	"\|github.com/ckreut...\| 16\|\n",
	"\|github.com/johand...\| 13\|\n",
	"\|github.com/bjoeri...\| 11\|\n",
	"\|github.com/pelarg...\| 10\|\n",
	"\|github.com/akoo0o...\| 10\|\n",
	"\|github.com/Deathl...\| 10\|\n",
	"\|github.com/alfons...\| 9\|\n",
	"+--------------------+-----------------+\n",
	"\n"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Top 10 repositories by contributor count in all branches.\n",
	"-- Each distinct email is counted as a distinct contributor.\n",
	"--\n",
	"-- categories: fast\n",
	"SELECT\n",
	" repository_id,\n",
	" contributor_count\n",
	"FROM (\n",
	" SELECT\n",
	" repository_id,\n",
	" COUNT(DISTINCT author_email) AS contributor_count\n",
	" FROM commits\n",
	" WHERE index <> -1\n",
	" GROUP BY repository_id\n",
	") AS q\n",
	"ORDER BY contributor_count DESC\n",
	"LIMIT 10\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Q12: Created projects per year"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"+----+-------------+\n",
	"\|year\|project_count\|\n",
	"+----+-------------+\n",
	"\|2017\| 6\|\n",
	"\|2016\| 2\|\n",
	"\|2015\| 1\|\n",
	"\|2014\| 12\|\n",
	"\|2013\| 22\|\n",
	"\|2012\| 23\|\n",
	"\|2011\| 8\|\n",
	"\|2010\| 19\|\n",
	"\|2009\| 3\|\n",
	"\|2008\| 10\|\n",
	"+----+-------------+\n",
	"\n"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%sql\n",
	"-- Approximate number of projects created per year. Creation date is extracted from the\n",
	"-- first commit in the history of HEAD reference. Forks are deduplicated by counting\n",
	"-- the hashes of the initial commit instead of repository id.\n",
	"--\n",
	"-- categories: ????\n",
	"SELECT\n",
	" year,\n",
	" COUNT(DISTINCT repository_id) AS project_count\n",
	"FROM (\n",
	" SELECT\n",
	" repository_id,\n",
	" MIN(YEAR(author_date)) AS year\n",
	" FROM commits\n",
	" WHERE\n",
	" SIZE(parents) == 0 AND\n",
	" reference_name = 'refs/heads/HEAD'\n",
	" GROUP BY repository_id\n",
	") AS q\n",
	"GROUP BY year\n",
	"ORDER BY year DESC"
	]
	}
	],
	"metadata": {
	"celltoolbar": "Edit Metadata",
	"kernelspec": {
	"display_name": "Apache Toree - Scala",
	"language": "scala",
	"name": "apache_toree_scala"
	},
	"language_info": {
	"file_extension": ".scala",
	"name": "scala",
	"version": "2.11.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}