Skip to content

Instantly share code, notes, and snippets.

@warenlg
Last active August 14, 2019 08:30
Show Gist options
  • Save warenlg/ba22d3b2078a9d6382dc91285caeb84a to your computer and use it in GitHub Desktop.
Save warenlg/ba22d3b2078a9d6382dc91285caeb84a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"from collections import defaultdict, OrderedDict\n",
"import os\n",
"import pickle\n",
"import glob\n",
"import re\n",
"\n",
"from labours import ProtobufReader, DevDay\n",
"from matplotlib import pyplot as plt\n",
"from matplotlib.colors import ListedColormap, to_hex\n",
"from matplotlib.legend import Legend\n",
"from networkx import connected_components, Graph\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans, DBSCAN\n",
"from sklearn.neighbors import KDTree\n",
"from sklearn.neighbors.kde import KernelDensity\n",
"from run import no_logs, run, run_one, run_and_print\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from tqdm import tqdm_notebook as tqdm\n",
"from umap import UMAP\n",
"\n",
"from topic_modeling_utils import (aggregate_contribution_per_developer, collect_files_per_developer,\n",
" collect_unique_identities, get_identity_index,\n",
" OwnershipProtobufReader)\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Collection from `hercules`"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"PATH_TO_GITLAB_STATS = \"/home/waren/sourced/workspace/gitlab_paper/data/statistics_v9.2.0/\"\n",
"PATH_TO_SRCD_STATS = \"/home/waren/sourced/data/companies_hercules_stats/src-d/\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"forks = [\"k8s-nvidia-gpu-overcommit\", \"PyHive\", \"go-oniguruma\",\n",
" \"spark-ui-proxy\", \"terraform-render-bootkube\", \"gcfg\",\n",
" \"gluster-kubernetes\", \"kube-cert-manager\", \"libcompose\",\n",
" \"deployment\", \"or-tools\", \"matchbox\", \"beanstalk\",\n",
" \"go-github\", \"envconfig\", \"go-bindata\"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"120"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"repos = []\n",
"for root, dirs, files in os.walk(PATH_TO_SRCD_STATS):\n",
" if \"statistics.pb\" in files:\n",
" reader = ProtobufReader()\n",
" reader.read(os.path.join(root, \"statistics.pb\"))\n",
" repo_name = reader.get_name().split(\"/\")[-1][:-4]\n",
" if repo_name not in forks:\n",
" repos.append((repo_name, ) + reader.get_devs() + reader.get_header())\n",
"len(repos)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Identity matching"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"graph = Graph()\n",
"for _, people, _, _, _ in repos:\n",
" for p in people:\n",
" keys = p.split(\"|\")\n",
" for x in keys:\n",
" for y in keys:\n",
" if x < y:\n",
" graph.add_edge(x, y)\n",
" elif x == y:\n",
" graph.add_node(x)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"345"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"identities = []\n",
"for cc in connected_components(graph):\n",
" #exclude_identity = re.compile(\"dependabot\")\n",
" #cc = {identity for identity in cc if not re.search(exclude_identity, identity)}\n",
" #if len(cc) > 1:\n",
" identities.append(\"|\".join(sorted(cc, key=lambda s: s if \"@\" not in s else \"|\" + s)))\n",
"len(identities)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"dev_map = {}\n",
"for i, sig in enumerate(identities):\n",
" for key in sig.split(\"|\"):\n",
" dev_map[key] = i\n",
"\n",
"id2dev = {}\n",
"for dev, index in dev_map.items():\n",
" if index not in id2dev and re.search(\"@\", dev):\n",
" id2dev[index] = dev\n",
" if len(dev.split()) > 1 and not re.search(\"@\", dev):\n",
" id2dev[index] = dev "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"project_map = {}\n",
"for i, (repo_name, _, _, _, _) in enumerate(repos):\n",
" project_map[repo_name] = i\n",
"id2project = {v: k for k, v in project_map.items()}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Commit time series"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Collect the commit time series per identity"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"from datetime import datetime, timedelta\n",
"\n",
"min_start_time = None\n",
"for _, _, _, start_time, _ in repos:\n",
" if min_start_time is None or min_start_time > start_time:\n",
" min_start_time = start_time\n",
"min_start_time = datetime.utcfromtimestamp(min_start_time)\n",
"\n",
"data = defaultdict((lambda: defaultdict(list)))\n",
"for repo, people, days, start_time, _ in repos:\n",
" people = {i: dev_map[p.split(\"|\", 1)[0]] for i, p in enumerate(people)}\n",
" base_date = datetime.utcfromtimestamp(start_time)\n",
" delta = (base_date - min_start_time).days\n",
" for day, devs in days.items():\n",
" date = (base_date + timedelta(days=day)).date()\n",
" for dev, devday in devs.items():\n",
" data[people[dev]][day + delta].append(devday)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"343"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1785"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"maxday = max(max(v) for v in data.values())\n",
"maxday"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Distance matrix computaion using DTW with `fastdtw` "
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
"from fastdtw import fastdtw\n",
"\n",
"def dtwdist(d1, d2):\n",
" s1 = sorted((k, sum(d.Commits for d in v)) for k, v in data[d1].items())\n",
" s2 = sorted((k, sum(d.Commits for d in v)) for k, v in data[d2].items())\n",
" offset = min(s1[0][0], s2[0][0])\n",
" maxday = max(s1[-1][0], s2[-1][0])\n",
" arr1 = numpy.zeros(maxday - offset + 1, dtype=numpy.float32)\n",
" arr2 = numpy.zeros_like(arr1)\n",
" for d, v in s1:\n",
" arr1[d - offset] = v\n",
" for d, v in s2:\n",
" arr2[d - offset] = v\n",
" return fastdtw(arr1 / arr1.mean(), arr2 / arr2.mean(), radius=30, dist=1)[0]"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/waren/.local/lib/python3.6/site-packages/ipykernel_launcher.py:4: DeprecationWarning:\n",
"\n",
"Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "606c5a299faa4de38328f186b2df9186",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=66), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/waren/.local/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning:\n",
"\n",
"Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.\n",
"\n",
"/home/waren/.local/lib/python3.6/site-packages/ipykernel_launcher.py:14: FutureWarning:\n",
"\n",
"Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"commit_threshold = 10 #commits\n",
"datalist = sorted((k, v) for k, v in data.items()\n",
" if sum(sum(d.Commits for d in days)\n",
" for days in v.values()) > commit_threshold)\n",
"dev_labels = [id2dev[k[0]] for k in datalist]\n",
"\n",
"dists = numpy.zeros((len(datalist),) * 2)\n",
"for x, (d1, _) in tqdm(list(enumerate(datalist))):\n",
" for y, (d2, _) in enumerate(datalist[x + 1:]):\n",
" y += x + 1\n",
" dists[x, y] = dists[y, x] = dtwdist(d1, d2)"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/waren/.local/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning:\n",
"\n",
"Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"2851"
]
},
"execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(sum(d.Commits for d in days) for days in data[0].values())"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(66, 66)"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dists.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dimensionality reduction with `UMAP`"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(1)\n",
"embeddings = UMAP(metric=\"precomputed\", n_neighbors=4, min_dist=0.1).fit_transform(dists)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering with `DBSCAN` "
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4\n"
]
},
{
"data": {
"text/plain": [
"(66, 2)"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters = DBSCAN(eps=0.9).fit_predict(embeddings)\n",
"n_clusters = clusters.max() + 1\n",
"print(n_clusters)\n",
"embeddings.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualization"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"marker": {
"color": "#8000ff",
"opacity": 0.8,
"size": 7,
"symbol": "circle"
},
"mode": "markers",
"name": "Core source{d}",
"text": [
"vadim markovtsev",
"rafael porres molina",
"máximo cuadros ortiz",
"konstantin slavnov",
"santiago m. mola",
"maxim sukharev",
"egor bulychev",
"waren long",
"manuel carmona",
"kuba@sourced.tech",
"david pordomingo",
"miguel molina",
"javi fontan",
"francesc campoy",
"eiso kant",
"antonio navarro perez",
"bzz@users.noreply.github.com",
"carlos martín",
"alexander bezzubov"
],
"type": "scatter",
"uid": "cf749e2d-2736-404a-a330-b8f03612aae5",
"x": [
1.11305832862854,
1.682020664215088,
1.296476125717163,
0.9207094311714172,
1.739689588546753,
1.290390133857727,
1.6636384725570679,
1.6353179216384888,
1.1794235706329346,
1.1400810480117798,
1.4303711652755737,
1.4148986339569092,
1.4780642986297607,
1.2566068172454834,
1.503415822982788,
1.2715226411819458,
2.05823016166687,
1.015299677848816,
2.1729071140289307
],
"y": [
-5.144843101501465,
-3.3574132919311523,
-5.285112380981445,
-4.204207897186279,
-4.358362197875977,
-4.820174217224121,
-2.9993033409118652,
-3.42612361907959,
-3.9462811946868896,
-3.550429344177246,
-4.144782543182373,
-5.387336254119873,
-4.551661968231201,
-3.0117740631103516,
-2.556983709335327,
-5.1205878257751465,
-3.3334505558013916,
-4.618728160858154,
-3.5077779293060303
]
},
{
"marker": {
"color": "#2adddd",
"opacity": 0.8,
"size": 7,
"symbol": "circle"
},
"mode": "markers",
"name": "Recent hires/contributors",
"text": [
"maartje eyskens",
"hugo mougard",
"irina khismatullina",
"guillem duran ballester",
"fernanda gomes",
"kuba podgórski",
"tristan kalos",
"lou marvin caraig",
"m. j. fromberger",
"dependabot[bot]@users.noreply.github.com",
"robert lin",
"juanjo alvarez martinez",
"anna tsolakou",
"ricardo baeta",
"david riosalido",
"romain keramitas",
"esther garcía"
],
"type": "scatter",
"uid": "ae450a8f-6784-4923-86b7-5ee8976068e4",
"x": [
1.3543689250946045,
1.5874836444854736,
1.5189763307571411,
1.5551575422286987,
0.755084753036499,
0.5140291452407837,
1.508233666419983,
1.321685791015625,
0.8390812873840332,
1.1978576183319092,
1.269224762916565,
0.5987231135368347,
1.260170578956604,
0.5853797197341919,
0.9789519309997559,
1.1117695569992065,
0.6380635499954224
],
"y": [
-0.3943886160850525,
-0.7375251054763794,
-0.144102081656456,
1.003696084022522,
2.39174747467041,
1.453855037689209,
1.295262098312378,
0.23659320175647736,
1.8189053535461426,
0.16993653774261475,
1.525714635848999,
1.6045781373977661,
-0.5878535509109497,
0.38755834102630615,
0.9693411588668823,
-0.09487339109182358,
0.34487804770469666
]
},
{
"marker": {
"color": "#d4dd80",
"opacity": 0.8,
"size": 7,
"symbol": "circle"
},
"mode": "markers",
"name": "Tyba/former employees",
"text": [
"sonia meruelo",
"sergio arbeo",
"ferhat elmas",
"denys smirnov",
"iván sánchez",
"carlos cobo",
"alberto cortés",
"me@darkowlzz.space",
"timofei semenov",
"alfredo beaumont",
"said tahsin dane",
"oleh zasadnyy",
"vitaliy zasadnyy",
"margarida garcia",
"jorge schnura becerro",
"philip thomas casado",
"ori rawlings",
"joshua sjoding",
"david paz",
"roberto santalla"
],
"type": "scatter",
"uid": "ec54004c-3bc4-4ebd-80ee-2c4bbfc457cb",
"x": [
-2.7406165599823,
-2.5616726875305176,
-3.504784345626831,
-2.996340274810791,
-5.18230676651001,
-5.484813690185547,
-5.501468181610107,
-3.2163307666778564,
-3.462519645690918,
-2.7279961109161377,
-4.691711902618408,
-5.7580084800720215,
-5.726004123687744,
-4.103541851043701,
-3.0228071212768555,
-5.366780757904053,
-3.165879249572754,
-5.141332149505615,
-3.8846211433410645,
-3.121025800704956
],
"y": [
1.876463532447815,
1.593294382095337,
1.7349369525909424,
2.383864402770996,
2.4687163829803467,
2.598036289215088,
2.896465301513672,
2.2222061157226562,
1.8241033554077148,
1.6740546226501465,
2.0973777770996094,
2.316901445388794,
2.2940659523010254,
2.030949592590332,
1.5836026668548584,
2.831446647644043,
2.099158763885498,
2.291452407836914,
2.005383014678955,
2.63633394241333
]
},
{
"marker": {
"color": "#ff0000",
"opacity": 0.8,
"size": 7,
"symbol": "circle"
},
"mode": "markers",
"name": "External contributors",
"text": [
"andrew kutta",
"bake@192k.pw",
"filip navara",
"jeremy stribling",
"taru karttunen",
"chris marchesi",
"theodoros zarkopafilis ntakouris",
"theo despoudis",
"hugovk@users.noreply.github.com"
],
"type": "scatter",
"uid": "bb74e889-0377-489f-9fa2-ff190630c240",
"x": [
0.6868482232093811,
0.34223905205726624,
0.4892539381980896,
-0.14282183349132538,
-0.005935766734182835,
0.686651349067688,
0.7589490413665771,
0.6047843098640442,
0.9016683101654053
],
"y": [
3.786644220352173,
4.267488479614258,
3.918689012527466,
4.404962062835693,
4.3351287841796875,
4.319298267364502,
4.154350280761719,
3.4951086044311523,
4.0686845779418945
]
}
],
"layout": {
"autosize": false,
"height": 1000,
"title": {
"text": "Developer similarity based on commit time series<br> on source{d} codebase"
},
"width": 1000,
"xaxis": {
"showgrid": false,
"showticklabels": false,
"zeroline": false
},
"yaxis": {
"showgrid": false,
"showticklabels": false,
"zeroline": false
}
}
},
"text/html": [
"<div>\n",
" \n",
" \n",
" <div id=\"44ad73d4-4994-4d92-b193-69b77f50aed0\" class=\"plotly-graph-div\" style=\"height:1000px; width:1000px;\"></div>\n",
" <script type=\"text/javascript\">\n",
" require([\"plotly\"], function(Plotly) {\n",
" window.PLOTLYENV=window.PLOTLYENV || {};\n",
" window.PLOTLYENV.BASE_URL='https://plot.ly';\n",
" \n",
" if (document.getElementById(\"44ad73d4-4994-4d92-b193-69b77f50aed0\")) {\n",
" Plotly.newPlot(\n",
" '44ad73d4-4994-4d92-b193-69b77f50aed0',\n",
" [{\"marker\": {\"color\": \"#8000ff\", \"opacity\": 0.8, \"size\": 7, \"symbol\": \"circle\"}, \"mode\": \"markers\", \"name\": \"Core source{d}\", \"text\": [\"vadim markovtsev\", \"rafael porres molina\", \"m\\u00e1ximo cuadros ortiz\", \"konstantin slavnov\", \"santiago m. mola\", \"maxim sukharev\", \"egor bulychev\", \"waren long\", \"manuel carmona\", \"kuba@sourced.tech\", \"david pordomingo\", \"miguel molina\", \"javi fontan\", \"francesc campoy\", \"eiso kant\", \"antonio navarro perez\", \"bzz@users.noreply.github.com\", \"carlos mart\\u00edn\", \"alexander bezzubov\"], \"type\": \"scatter\", \"uid\": \"cf749e2d-2736-404a-a330-b8f03612aae5\", \"x\": [1.11305832862854, 1.682020664215088, 1.296476125717163, 0.9207094311714172, 1.739689588546753, 1.290390133857727, 1.6636384725570679, 1.6353179216384888, 1.1794235706329346, 1.1400810480117798, 1.4303711652755737, 1.4148986339569092, 1.4780642986297607, 1.2566068172454834, 1.503415822982788, 1.2715226411819458, 2.05823016166687, 1.015299677848816, 2.1729071140289307], \"y\": [-5.144843101501465, -3.3574132919311523, -5.285112380981445, -4.204207897186279, -4.358362197875977, -4.820174217224121, -2.9993033409118652, -3.42612361907959, -3.9462811946868896, -3.550429344177246, -4.144782543182373, -5.387336254119873, -4.551661968231201, -3.0117740631103516, -2.556983709335327, -5.1205878257751465, -3.3334505558013916, -4.618728160858154, -3.5077779293060303]}, {\"marker\": {\"color\": \"#2adddd\", \"opacity\": 0.8, \"size\": 7, \"symbol\": \"circle\"}, \"mode\": \"markers\", \"name\": \"Recent hires/contributors\", \"text\": [\"maartje eyskens\", \"hugo mougard\", \"irina khismatullina\", \"guillem duran ballester\", \"fernanda gomes\", \"kuba podg\\u00f3rski\", \"tristan kalos\", \"lou marvin caraig\", \"m. j. 
fromberger\", \"dependabot[bot]@users.noreply.github.com\", \"robert lin\", \"juanjo alvarez martinez\", \"anna tsolakou\", \"ricardo baeta\", \"david riosalido\", \"romain keramitas\", \"esther garc\\u00eda\"], \"type\": \"scatter\", \"uid\": \"ae450a8f-6784-4923-86b7-5ee8976068e4\", \"x\": [1.3543689250946045, 1.5874836444854736, 1.5189763307571411, 1.5551575422286987, 0.755084753036499, 0.5140291452407837, 1.508233666419983, 1.321685791015625, 0.8390812873840332, 1.1978576183319092, 1.269224762916565, 0.5987231135368347, 1.260170578956604, 0.5853797197341919, 0.9789519309997559, 1.1117695569992065, 0.6380635499954224], \"y\": [-0.3943886160850525, -0.7375251054763794, -0.144102081656456, 1.003696084022522, 2.39174747467041, 1.453855037689209, 1.295262098312378, 0.23659320175647736, 1.8189053535461426, 0.16993653774261475, 1.525714635848999, 1.6045781373977661, -0.5878535509109497, 0.38755834102630615, 0.9693411588668823, -0.09487339109182358, 0.34487804770469666]}, {\"marker\": {\"color\": \"#d4dd80\", \"opacity\": 0.8, \"size\": 7, \"symbol\": \"circle\"}, \"mode\": \"markers\", \"name\": \"Tyba/former employees\", \"text\": [\"sonia meruelo\", \"sergio arbeo\", \"ferhat elmas\", \"denys smirnov\", \"iv\\u00e1n s\\u00e1nchez\", \"carlos cobo\", \"alberto cort\\u00e9s\", \"me@darkowlzz.space\", \"timofei semenov\", \"alfredo beaumont\", \"said tahsin dane\", \"oleh zasadnyy\", \"vitaliy zasadnyy\", \"margarida garcia\", \"jorge schnura becerro\", \"philip thomas casado\", \"ori rawlings\", \"joshua sjoding\", \"david paz\", \"roberto santalla\"], \"type\": \"scatter\", \"uid\": \"ec54004c-3bc4-4ebd-80ee-2c4bbfc457cb\", \"x\": [-2.7406165599823, -2.5616726875305176, -3.504784345626831, -2.996340274810791, -5.18230676651001, -5.484813690185547, -5.501468181610107, -3.2163307666778564, -3.462519645690918, -2.7279961109161377, -4.691711902618408, -5.7580084800720215, -5.726004123687744, -4.103541851043701, -3.0228071212768555, -5.366780757904053, -3.165879249572754, 
-5.141332149505615, -3.8846211433410645, -3.121025800704956], \"y\": [1.876463532447815, 1.593294382095337, 1.7349369525909424, 2.383864402770996, 2.4687163829803467, 2.598036289215088, 2.896465301513672, 2.2222061157226562, 1.8241033554077148, 1.6740546226501465, 2.0973777770996094, 2.316901445388794, 2.2940659523010254, 2.030949592590332, 1.5836026668548584, 2.831446647644043, 2.099158763885498, 2.291452407836914, 2.005383014678955, 2.63633394241333]}, {\"marker\": {\"color\": \"#ff0000\", \"opacity\": 0.8, \"size\": 7, \"symbol\": \"circle\"}, \"mode\": \"markers\", \"name\": \"External contributors\", \"text\": [\"andrew kutta\", \"bake@192k.pw\", \"filip navara\", \"jeremy stribling\", \"taru karttunen\", \"chris marchesi\", \"theodoros zarkopafilis ntakouris\", \"theo despoudis\", \"hugovk@users.noreply.github.com\"], \"type\": \"scatter\", \"uid\": \"bb74e889-0377-489f-9fa2-ff190630c240\", \"x\": [0.6868482232093811, 0.34223905205726624, 0.4892539381980896, -0.14282183349132538, -0.005935766734182835, 0.686651349067688, 0.7589490413665771, 0.6047843098640442, 0.9016683101654053], \"y\": [3.786644220352173, 4.267488479614258, 3.918689012527466, 4.404962062835693, 4.3351287841796875, 4.319298267364502, 4.154350280761719, 3.4951086044311523, 4.0686845779418945]}],\n",
" {\"autosize\": false, \"height\": 1000, \"title\": {\"text\": \"Developer similarity based on commit time series<br> on source{d} codebase\"}, \"width\": 1000, \"xaxis\": {\"showgrid\": false, \"showticklabels\": false, \"zeroline\": false}, \"yaxis\": {\"showgrid\": false, \"showticklabels\": false, \"zeroline\": false}},\n",
" {\"showLink\": false, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\", \"responsive\": true}\n",
" ).then(function(){\n",
" \n",
"var gd = document.getElementById('44ad73d4-4994-4d92-b193-69b77f50aed0');\n",
"var x = new MutationObserver(function (mutations, observer) {{\n",
" var display = window.getComputedStyle(gd).display;\n",
" if (!display || display === 'none') {{\n",
" console.log([gd, 'removed!']);\n",
" Plotly.purge(gd);\n",
" observer.disconnect();\n",
" }}\n",
"}});\n",
"\n",
"// Listen for the removal of the full notebook cells\n",
"var notebookContainer = gd.closest('#notebook-container');\n",
"if (notebookContainer) {{\n",
" x.observe(notebookContainer, {childList: true});\n",
"}}\n",
"\n",
"// Listen for the clearing of the current output cell\n",
"var outputEl = gd.closest('.output');\n",
"if (outputEl) {{\n",
" x.observe(outputEl, {childList: true});\n",
"}}\n",
"\n",
" })\n",
" };\n",
" });\n",
" </script>\n",
" </div>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#import plotly.plotly as py\n",
"\n",
"cluster_colors = cm.rainbow(numpy.linspace(0, 1, n_clusters))\n",
"label_descriptions = [\"Core source{d}\", \"Recent hires/contributors\", \"Tyba/former employees\", \"External contributors\"]\n",
"data_traces = []\n",
"for cluster_id, color in enumerate(cluster_colors):\n",
" trace = go.Scatter(\n",
" x = embeddings[numpy.where(clusters == cluster_id)[0], 0],\n",
" y = embeddings[numpy.where(clusters == cluster_id)[0], 1],\n",
" name = label_descriptions[cluster_id],\n",
" mode = \"markers\",\n",
" marker = dict(color = to_hex(color), size = 7, opacity = 0.8, symbol = \"circle\"),\n",
" text = [dev_labels[i] for i in numpy.where(clusters == cluster_id)[0]]\n",
" )\n",
" data_traces.append(trace)\n",
"\n",
"layout = go.Layout(\n",
" title = go.layout.Title(\n",
" text = \"Developer similarity based on commit time series\"\n",
" \"<br> on source{d} codebase\"),\n",
" autosize=False,\n",
" width=1000,\n",
" height=1000,\n",
" xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
" yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)\n",
")\n",
"\n",
"fig = go.Figure(data=data_traces, layout=layout)\n",
"py.iplot(fig, filename='commit-time-series-source{d}.html')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Language experience"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Collect the aggregated language distributions per identity"
]
},
{
"cell_type": "code",
"execution_count": 262,
"metadata": {},
"outputs": [],
"source": [
"loc_values = defaultdict(list)\n",
"for repo_name, people, days, start_time, _ in repos:\n",
" for day, devs in sorted(days.items()):\n",
" for dev, devday in devs.items():\n",
" for k, v in devday.Languages.items():\n",
" loc_values[k].extend(v)\n",
" \n",
"upper_bounds = {}\n",
"for lang, values in loc_values.items():\n",
" values = array(values)\n",
" upper_bounds[lang] = np.percentile(values[values != 0], 99)"
]
},
{
"cell_type": "code",
"execution_count": 263,
"metadata": {},
"outputs": [],
"source": [
"def limit(contributions, threshold):\n",
" return [v if v <= threshold else threshold for v in contributions]\n",
"\n",
"def apply_UB(devday, upper_bounds):\n",
" return DevDay(devday.Commits, devday.Added, devday.Removed, devday.Changed,\n",
" {lang: limit(contributions, upper_bounds[lang]) for lang, contributions in devday.Languages.items()})"
]
},
{
"cell_type": "code",
"execution_count": 264,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"343\n",
"120\n"
]
}
],
"source": [
"people_contributions = defaultdict(lambda: defaultdict(int))\n",
"#upper_bound = np.percentile(loc_values[loc_values != 0], 99)\n",
"aggregated_dev, aggregated_project = OrderedDict(), OrderedDict()\n",
"for repo_name, people, days, start_time, _ in repos:\n",
" people_map = {}\n",
" for i, p in enumerate(people):\n",
" try:\n",
" people_map[i] = dev_map[p.split(\"|\", 1)[0]]\n",
" except KeyError:\n",
" continue\n",
" for day, devs in sorted(days.items()):\n",
" for dev, devday in devs.items():\n",
" devday_bounded = apply_UB(devday, upper_bounds)\n",
" #devday_bounded = devday\n",
" if dev in people_map:\n",
" people_contributions[id2dev[people_map[dev]]][repo_name] += devday.Commits\n",
" try:\n",
" aggregated_project[project_map[repo_name]] = aggregated_project[project_map[repo_name]].add(devday_bounded)\n",
" except KeyError:\n",
" aggregated_project[project_map[repo_name]] = devday_bounded\n",
" try:\n",
" aggregated_dev[people_map[dev]] = aggregated_dev[people_map[dev]].add(devday_bounded)\n",
" except KeyError:\n",
" aggregated_dev[people_map[dev]] = devday_bounded\n",
"print(len(aggregated_dev))\n",
"print(len(aggregated_project))"
]
},
{
"cell_type": "code",
"execution_count": 265,
"metadata": {},
"outputs": [],
"source": [
"contribution_threshold = 20 #commits\n",
"contribution_graph = Graph()\n",
"for dev, contributions in people_contributions.items():\n",
" for repo, n_commits in contributions.items():\n",
" if n_commits > contribution_threshold:\n",
" contribution_graph.add_edge(dev, repo)"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[((0, 'vadim markovtsev'), 2851),\n",
" ((16, 'miguel molina'), 1846),\n",
" ((8, 'maxim sukharev'), 1375),\n",
" ((4, 'máximo cuadros ortiz'), 1195),\n",
" ((31, 'carlos martín'), 911),\n",
" ((26, 'antonio navarro perez'), 835),\n",
" ((7, 'santiago m. mola'), 715),\n",
" ((21, 'javi fontan'), 658),\n",
" ((15, 'david pordomingo'), 590),\n",
" ((38, 'alexander bezzubov'), 579)]"
]
},
"execution_count": 266,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dev_commits = {(dev, id2dev[dev]): stats.Commits for dev, stats in aggregated_dev.items()}\n",
"sorted_dev_commits = sorted(dev_commits.items(), key=lambda x: x[1], reverse=True)\n",
"sorted_dev_commits[:10]"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"67\n",
"37\n"
]
}
],
"source": [
"def filter_stats(aggregated_stats, commit_threshold):\n",
" discarded = []\n",
" for index, stats in aggregated_stats.items():\n",
" if stats.Commits < commit_threshold:\n",
" discarded.append(index)\n",
" for index in discarded:\n",
" del aggregated_stats[index]\n",
"\n",
"filter_stats(aggregated_dev, commit_threshold=10)\n",
"filter_stats(aggregated_project, commit_threshold=100)\n",
"\n",
"dev_labels = [id2dev[i] for i in aggregated_dev]\n",
"project_labels = [id2project[i] for i in aggregated_project]\n",
"all_labels = dev_labels + project_labels\n",
" \n",
"print(len(aggregated_dev))\n",
"print(len(aggregated_project))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Build the language matrix"
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of languages: 48\n"
]
}
],
"source": [
"# collect unique languages\n",
"unique_lang = set()\n",
"for dev in aggregated_dev:\n",
" unique_lang.update(aggregated_dev[dev].Languages.keys())\n",
"lang2remove = [\"\", \"SVG\", \"JSON\", \"CSV\"]\n",
"for l in lang2remove:\n",
" unique_lang.remove(l) # exclude case when `enry` can't identify language\n",
"# this mapping will help us to prepare features\n",
"lang2id = OrderedDict()\n",
"actions = [\"added\", \"removed\", \"changed\"]\n",
"for i, lang in enumerate(sorted(unique_lang)):\n",
" for j, act in enumerate(actions):\n",
" lang2id[lang + \"_\" + act] = i * 3 + j\n",
"#lang2id[\"sum\"] = len(lang2id)\n",
"print(\"Number of languages:\", len(unique_lang))"
]
},
{
"cell_type": "code",
"execution_count": 406,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(67, 144)\n",
"(37, 144)\n"
]
}
],
"source": [
"def build_language_matrix(aggregated_stats):\n",
" languages = numpy.zeros((len(aggregated_stats), len(lang2id)))\n",
" for i, (index, stats) in enumerate(aggregated_stats.items()):\n",
" #languages[i, -1] = aggregated_stats[index].Commits\n",
" for lang in stats.Languages:\n",
" for val, act in zip(stats.Languages[lang], actions):\n",
" try:\n",
" languages[i, lang2id[lang + \"_\" + act]] = val\n",
" except KeyError:\n",
" pass\n",
" return languages\n",
"\n",
"languages_dev = build_language_matrix(aggregated_dev)\n",
"languages_project = build_language_matrix(aggregated_project)\n",
"print(languages_dev.shape)\n",
"print(languages_project.shape)"
]
},
{
"cell_type": "code",
"execution_count": 592,
"metadata": {},
"outputs": [],
"source": [
"def add_experience_column(df, aggregated_dev, aggregated_project):\n",
" experience = [stats.Commits for stats in aggregated_dev.values()]\n",
" experience.extend([stats.Commits for stats in aggregated_project.values()])\n",
" experience = [s / sum(experience) for s in experience]\n",
" df[\"Experience\"] = experience\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 593,
"metadata": {},
"outputs": [],
"source": [
"df_devs = pd.DataFrame(data=languages_dev, columns=lang2id.keys())\n",
"df_devs = df_devs.div(df_devs.sum(axis=1), axis=0)\n",
"df_projects = pd.DataFrame(data=languages_project, columns=lang2id.keys())\n",
"df_projects = df_projects.div(df_projects.sum(axis=1), axis=0)\n",
"df = pd.concat([df_devs, df_projects], ignore_index=True)\n",
"df = add_experience_column(df, aggregated_dev, aggregated_project)\n",
"df = df.div(df.sum(axis=1), axis=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dimensionality reduction"
]
},
{
"cell_type": "code",
"execution_count": 618,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/umap/spectral.py:229: UserWarning:\n",
"\n",
"Embedding a total of 2 separate connected components using meta-embedding (experimental)\n",
"\n"
]
},
{
"data": {
"text/plain": [
"(104, 2)"
]
},
"execution_count": 618,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SEED = 2\n",
"n_neighbors = 3\n",
"min_dist = 0.2\n",
"n_components = 2\n",
"metric = \"euclidean\"\n",
"dim_red = UMAP(n_neighbors=n_neighbors, min_dist=min_dist,\n",
" n_components=n_components,\n",
" metric=metric, random_state=SEED)\n",
"embeddings = dim_red.fit_transform(df)\n",
"embeddings.shape"
]
},
{
"cell_type": "code",
"execution_count": 619,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.collections.PathCollection at 0x7f759a8a5588>"
]
},
"execution_count": 619,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAFlCAYAAADoPlOZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3da3Bc533n+e//9O2gL7iDBEkRoigykGUrkm2MZIeurWTtSdkuVzSZcarsN2PXJKXJzrr25a63XJWtypvN7L7Yqhl7N6vNuuJMbSWZdZVHSiKvL7ETj5lINuWRTMkSRIgyLyBBNhpAo++38+wLdEMgCZCg0Beg+/epQqEvh33+QIG/fvq5HXPOISIi/c/rdQEiItIdCnwRkQGhwBcRGRAKfBGRAaHAFxEZEAp8EZEBEe51AXczOTnpTpw40esyREQOjJdffnnZOTe13XP7OvBPnDjBuXPnel2GiMiBYWaXdnpOXToiIgNCgS8iMiAU+CIiA0KBLyIyIBT4IiIDQoEvIjIgFPgiIgNCgS8iMiAU+CIiA0KBLyIyIPb11goigyadKzO/lGO1WGUsHmV2OsVUyu91WdIn1MIX2SfSuTJnFzKUawETiRjlWsDZhQzpXLnXpUmfaEvgm9nXzeymmb22w/O/bmZZM3ul+fUH7TivSL9I58p88+UrvHEty8V0nmypRiIWJhkLM7+U63V50ifa1cL/U+CT9zjmPzvnnmh+/WGbzity4LVa9muFKpPJGLVGwGuLWdaKVeLREKvFaq9LlD7RlsB3zv0IWGnHa4kMmvmlHMlYmImUT6UR4EdCxKMhrqwUKVYbjMWjvS5R+kQ3+/A/amavmtm3zez9Ox1kZs+Y2TkzO5dOp7tYnkj3pXNl/vHtZV69ska+Umc5X6FUaxANeWTyFfKVOrPTqV6XKX2iW4H/M+BB59zjwL8H/tNOBzrnnnXOzTnn5qamtr1oi0hfaHXlxCIhomEjFvIwM6r1Bsv5MqOJCGdOTWiWjrRNVwLfObfunMs3b78ARMxsshvnFtmvWl05v3I4Rbke4ICJeJSwF+J9R0f57IePK+ylrboS+GY2bWbWvP1k87yZbpxbZL9abQ7KjgxFeezYCNGwR6Vep1pvqGUvHdGWhVdm9ufArwOTZnYV+J+ACIBz7o+BzwL/jZnVgRLwOeeca8e5RQ6qsXiUYrVBIhZuhn6UQqWOH/EU9tIRbQl859zn7/H8V4GvtuNcIv1iMhnluVeuEQQB48kYU6kYnnk8fnyi16VJn9JKW5EeSOfKvLmU59RUkvFEjJV8lQs38jwynVTrXjpGe+mI9EBrwDYRCxOPhTEzMvkKP5y/yUQyptCXjlALX6QHWgO2a8Uqry1mqTUCJpMx1gpV7Z8jHaPAF+mB1oDtlZUi8WgIPxKi0giYSPnaP0c6RoEv0gOz0ynylTqZfIVoyKNUa1CqNZgZj2v/HOkYBb5Ij4Q9uJ4t8V+urFGtBzx2bISRoaj2z5GOUeCLdFlrSwU/EuaTHzjC0dEhitU6zjkKlbr2z5GOUeCLdNnWGTpjiRgffnCMYT/K64vrlGt1wp7x4sUMP76Q1uCttJUCX6TLWjN0WkbjUeZOjHFkNEY9AD8S0hWvpCMU+CJd1pqhs1Wx2mC9XN9s+ZuZrnglbafAF+my1gydQuXWfvuRocgtLX9AM3akrRT4Il02lfI5c2oCP+KRKVTwIx5nTk0wM57YtuWvGTvSLtpaQaQHplL+ttsnnF3Y2DU8Hg1RrDbIV+raTE3aRi18kX1ip5a/9tWRdlELX2Qf2anlL9IOauGLiAwIBb6IyIBQ4IuIDAgFvojIgFDgi4gMCAW+iMiAUOCLiAwIBb6IyIDQwiuRAyydKzO/lGO1WGUsHmV2OqWFW7IjBb7IAZXOlfn2+eusFevUGgGRkMfFdJ5PPXZEoS/bUpeOyAH10sUMl1dKhDwYGQoT8uDySo
mXLmZ6XZrsU2rhi+xzO3XbvHF9ndGhCH5kYw99PxJidAjeuL7OZx4/1uOqZT9SC19kH2td8LxcC+647KFhONwtxzschvWoWtnvFPgi+9jWC57fftnDR44Mky3VKNUaOAelWoNsqcYjR4Z7XbbsUwp8kX3s9guew8bFUS5l8oAjW6ryTjrP9WyRwDlmxuM8dXK8N8XKvqfAF9nHtrvg+fW1EldXy/iRML8xe5gHJ5LUAjg5mdQMHbkrDdqK7GOz06k7Lnu4kM5z+nCSRLOr5588FKNQqeNHPIW93JVa+CL72HaXPTw+PsT08NAtx8WjIVaL1R5VKQeFWvgi+9ztlz388QUoVhskYu/+9y1WG4zFo70oTw6QtrTwzezrZnbTzF7b4Xkzs39nZgtm9nMz+1A7zisyiGanU+QrdQqVOs45CpU6+Uqd2elUr0uTfa5dLfw/Bb4K/NkOz38KON38egr4P5rfReQ+TaV8HplO8sP5m9xcr3BoOMZvzB56T/333d6LZ7fn0x5BndGWwHfO/cjMTtzlkKeBP3POOeBFMxs1syPOuevtOL/IIEnnyry5lOf0oWEef2BjIPfNpTwTydh9hWJrUVcyFmYiEaNYbXB2IcOZUxP3/TrbhfPtj08mo7y5lL/n+dpVl9ypW334x4ArW+5fbT52R+Cb2TPAMwAzMzNdKU7kINm6GAvY/D6/lNs2aHdqHd/+OrVGwOVMkYUbOT7y8MSuWtU7hfMj08k7wv25V65xampjdlG2VOXySpFMrsz1bJHPfvj45rnml3I0GgEX03lylTqpWJiJRHTz55P3bt8N2jrnngWeBZibm3P3OFxk4KwWq0wkYrc8Fo+GyBQqu2odt94Q/vaNG0yP+Dw4kcA5eG0xy1AkhJmjXAv49vnrjCdiONyObxw7vfn8cP4mpw8N3/J4EAQs5ysk/DDnm+eaTPos5yu31Hgpk2dxrUIiGmLED1OpB7y9XKBcr/Ox01Ob51a3z/3rVuAvAse33H+g+ZiI3KfWYqztZum0ArgeBLx2LU++XCMc8ghfND7z+NFb3hCmh31y5TrnF7OEPCMeDeGA1FB0o7W/UmJxtcR4MsprV9c4u7DM008c5ZEjI5vn3enN55fLBTwzCpU6ST/CzHic8WSMlXwV84yhSIihSIhyrcFEMra5XcRUyme9XMezjc3g8pU6N9fLZPIVrq7kGRmKNfcLgpVClaOjcXX73IduzcN/HviXzdk6HwGy6r8XeW/uNktntVil1mhwfjFLtR4w7Ecx4O/fuslfv3qNPz37DpdXCtSDgJlmy96A+etZLqbz/PzqGvlKnXOXMlxfK/EPFzMs3CgwFA0T8oznXrlGOlferGWnlcCFSoNcuc6wH6VaDzi/mCUW9vA8I5MrEwt5lGsNitUGx8fjt6wjGBmKEDjHcq7CxXSBQrVB4ByZYo1XrqwS9oyL6SKXV4rUg+COPYZkZ+2alvnnwD8Cs2Z21cx+18x+38x+v3nIC8BFYAH4v4B/047zigyi7RZjtVq2Y/EoF27mN1vQZhut/42W/CpXV8u8tZTjhfPXWS9X+cCxEcw5Flcr1APHo0eGqdUDXrq4yo31ImNDETwPfrlcJOwZQeBuCdXt3nwW0nnmTozhHFTqDfywhwE31is8/cRRRhNRlvMVIiGPDxwbYbT5ptFaRzAznuDUoSRrpSpB0CARCxEAHsZStsx3Xl/iF9fWWFor8ffzN8mWNt4otPjs3to1S+fz93jeAf9tO84lIncuxmqZnU7x3deXOJTycc5RqQcsrpY4nIzyy+XCRos/HKLacJxdyPCpDxwhEg5z5tQ44XCIcMjjZq5CPOqRyVd4cCJJNLzRLlxcLXHqUPKWUG29+cwv5cgUKozFoxwfH+LERJLDwzWurBTJljcGXpN+iEeOjDCRjG12K8Wjoc1PKI8fn9j8GZbzVYb9MCcnk6wUK7x6pcJDE3EiIeOdTIFIKMThZJR8ucb5xSyPHRsh7HlafH
YP+27QVkTeu6mUz5MPjXMx/W7QTqaiZIs1hoeiPDAW5510gWjYo1CpceFGnuVChY+eHMfMuLxS3OgbH/Yp1xo0HDjncM6RLdeYTMbuCNWdVgKPxqOMNo9t7fXTOv72N4nHj7/b9956/nq2yHK+QrZc58R4nHgszPVsmeGhKCN+mOvZCicm4/hhj7du5JgZT2y+acj2FPgifeapkxPUAzZb0OVawDvLRR5/YIRkLMxDUwmurpQIex7VeoMnT4wTCYVIxMI8dmwjoNdLNSaGY6RzVdaKNfxwiIcmEoRC3j1X9G634dvWFjzs/All6/Of/fBxzi5keONalsOpKBduFlgv13h4KoFnG4PMR0aHqNYdgQs0YLsLCnyRPnN7C/rkVJxMoUw9cDgHIc84NBLj4akEh1L+HQE9mYxyOVPkg8dHiT8Y4sLNPMu5Co89MMJTJ+8dqvdqwd/vz3E9W2StUOXUoQTRkEe11sCPhnl8ZpSPnpzUTqH3wTa61/enubk5d+7cuV6XIXLgvXk9y3OvXCMIAsaTMaZSMTx7d7B3u1Wxy/nqvpjjvnUqaa3R4OVLq4DxoZlRouGNqZtq3b/LzF52zs1t95xa+CIDoDVYutNCpXt1sfTS1k8MpVqdJ46PAY6Gc/gR7z19ehhUCnyRAbGfQ/1eDnLt+4kugCIiMiAU+CIiA0KBLyIyIBT4IiIDQoEvIjIgFPgiIgNCgS8iMiAU+CIiA0KBLyIyIBT4IiIDQoEvIjIgFPgiIgNCgS8iMiAU+CIiA0KBLyIyIBT4IiIDQoEvIjIgFPgiIgNCgS8iMiAU+CIiA0KBLyIyIBT4IiIDQoEvIjIgFPgiIgNCgS8iMiAU+CIiA0KBLyIyIBT4IiIDoi2Bb2afNLN5M1swsy9v8/wXzSxtZq80v36vHecVEZHdC+/1BcwsBHwN+KfAVeCnZva8c+4Xtx36l865L+31fPtROldmfinHarHKWDzK7HSKqZTf67JERG6x58AHngQWnHMXAczsL4CngdsDvy+9eT3Lc69cIwgc44ko1VrAcr7KmVMTCn0R2Vfa0aVzDLiy5f7V5mO3+xdm9nMz+6aZHd/pxczsGTM7Z2bn0ul0G8rrnHSuzHOvXCPkGVOpGPXAcXG5QKMRML+U63V5IiK36Nag7V8BJ5xzvwp8D/jGTgc65551zs055+ampqa6VN57M7+UIwgCRoeimBl+JEQ8GmI5X2G1WO11eSIit2hH4C8CW1vsDzQf2+ScyzjnKs27fwJ8uA3n7bnVYpXxZIxyvbH5WCzssVLY6MsXEdlP2hH4PwVOm9lDZhYFPgc8v/UAMzuy5e5vAW+04bw9lc6Vubpa5MpKkbdu5MgUKjgHa6UanmfMTqd6XaKIyC32PGjrnKub2ZeA7wAh4OvOudfN7A+Bc86554H/zsx+C6gDK8AX93reXmoN1OZKdbLlOomox5WVIrlyjWQswtNPHNWArYjsO+ac63UNO5qbm3Pnzp3rdRm3ePN6lq/+YIFqI2AyESMSMlaLNUb8MEfHh/jsh48r7EWkZ8zsZefc3HbPtWNa5sBozcqpNxpMJX3qgaNYrnNszGd0KMrRsSGFvYjsW9pa4T60ZuWMJ31qgSMa9oiFQ2SLdQ3Uisi+p8C/D61ZOaPxCNV6QKUeEPaM5UJFA7Uisu+pS2cH222XMBaPUmsErJfrHB31WSvWyOTLREMhDdSKyL6nwN9GOlfm7EKGZCzMRCJGsdrg7EKGR6aTLOerPDyVIJ2r0GgEjAylePqJozxyZKTXZYuI3JUCfxvzSzmSsTCJ2Mavp/W9tUfO/FKOSMjj/UdHtFGaiBwYCvymrV0489dzvP/YMFt/PfFoiEyhwlTKV8CLyIE00IHfCvnLKwWurJQ4NZXkyOgQ0XCRly+tMndinJGhjZk3xWpDs3Ck72mr7/42kLN00rkyf/3qNf79Dxb42aU1rq+VCHnGxeUC2VKN04eTgPHWjRzOOQqVOvlKXbNwpK
+1xq7KtYCJRIxyLeDsQoZ0rtzr0qRNBi7wW3/U56+uUazUeOvGOv+wsEK1XiceDXFlpchoPMqHZkap1BpkChX8iKf97aXvbR27MjMSsTDJWFhbffeRvu7S2e7j6fxSjsAFvJMpMjoUZjQeZbVQ5b9cznLm1CTlegBANBziow9P8rHT+3uLZpF2WS1WmUjEbnmsNXYl/aFvA//dK1EFjCdj1BobV6IqVmvkynVG/AhmYAbTIz5vpwu8s1zg0SPDm104jx+f6PWPIdJ2O/XTj8WjFKuNzVlpoLGrftOXXTq3XolqiFrD8Xa6QOACsqUaK/kKx8Z8ssUql1cKLK6ViIWNlXyFpB9SF470rbv1089Op8hX6hQqdY1d9am+bOFv7HnjmEhEMYOhSAiAdK7CsB9mvVQnW6rhgHrdUWsETCVjHB72+c33H1HQS9/aaY3J/FKOj52e4sypCV66uMIrl1dxON53ZLiX5Uqb9WXgrxarjCeiVOoB9UbAzVyFYjWgWq/xqceO8uEHx/nqDxYw4MGJBCNDEbyQ8fBUgvmlnAJf+tZu+unrgeOJmTHi0dDmKnN94u0PfRn4Y/Eo1VrA+cUsN3MVkrEQgQsIMFYKVSaSMX7t1AS58sbH16QfYWY8zrAf0QCV9LWd+ukN48cX0rz4doZoOMTpw0nMwrd8AlDgH3x9Gfiz0ymW81ViYY94JESp2sA849dnp5hI+Mwv5ZgZT1CuBbf84RcqdQ1QSV+bnU5xdiEDsNmCv7ZWBMCPhDBzeAavLWb5wLERRuNRzdTpI30Z+FMpnzOnJli4kePIaIzUUJSZ8TgjQ1Gcc2QKFT5ycuKOP3zNzJF+s92MnNZ+UJlChbF4lPFEFD+y0ZpPDUWp1oNb1qRopk7/6MvAh43Q/8jDE3e04pfWS9xYL/PiRQh7UK41KNU2WvaPH1c/pfSP7XZ9/fb564wnojjYfAN48WKGeHRjYsPMeJzzi1n8sLfZ5amGUP/o28CHOz++Lq2X+NmlNT54fHTzP0C+UteAlPSl22fk1IOAyytF1op15k6MbQ7Ihj3b7NcfGYry2LER3rqxsUDRj3ibDSHts3Pw9eU8/JZW144f8cgUKtxYL/PB46McHYtr6bj0vdVidbPlDnB5pcjIUIRaENzy9w/ulvn3Yc9jZjzBF888xMdOT22GvfbZOfj6uoUP3LKd8V+9uqil4zIwbp+Rky/XiIZCpGLvvgnEoyFKtfod/fq3d2/ebf6+WvkHR98H/lZaOi6D5PYuzXDIY61U4/ShBOevrpGr1Il4Hien4ve8zoP22ekPfd2lczstHZdBcnuX5snJJGPxMBduFqjWA6IhY71cZaVQvWfXTKuxBJAtVTm/uMbfz9/g6mpR3ToHyEC18Fv/Ae720VXkoLvb4Opfv3qN9dIqCzdzgDEzESfph+/ZNdP6tLBerrJwM49nRigU4lDS10rcA2SgAh/QJQqlr7UGVwMXkM5VeO3qGmcXlnn6iaM8cmSE9XKVUMjj1OEUfjhEud5g4Waecq0B7LwVeKux9M2Xr9BoOMJhw/OMX64UiXgeYQ8+8/ix7v2g8p4MVJeOSL9rXe/h7XSBWsMxlRoi5BnPvXKNdK5MtlTDM2MoEtrcWNAzI1uq3fO1p1I+D4zFefz4KA0HsbDHiB/GM/jJOyvq2jkAFPgifWS1WCWdqzAUCW2G+uhQhCBwzC/lGPbDBG5jwaFzjnKtQeBg2N/dh/2xeJSFmwXi0VBzKwYDg8lUTNObDwAFvkgfGYtHWclX8MPvTr2s1APGE1FWi1UenEjy8GSCSMgjW64TCXk8PJngwYnkrl5/djrFcqFC4BzOQanWoFRrcPpQktVitVM/lrSJAl+kj8xOp/A8j7VSdbMFX6w2mEzGNgdwQyGPk1NJPnpygpNTSUIhb9cz1aZSPk+eGMcB6+Uq0bDHY8dGiIRCmt58ACjwRfrIVMrn6SeO0ggc6VyFsGecnExshvrtUzXfy9Xdnjo5zs
x4gl99YIwPHB0h7Hma3nxADNwsHZF+98iRESaSsR2nZu51ppqmNx9cCnyRPtTp6cea3nwwqUtHRGRAtCXwzeyTZjZvZgtm9uVtno+Z2V82n3/JzE6047wiIrJ7ew58MwsBXwM+BTwKfN7MHr3tsN8FVp1zp4D/Dfi3ez2viIjcn3a08J8EFpxzF51zVeAvgKdvO+Zp4BvN298EPm5m1oZzi4jILrUj8I8BV7bcv9p8bNtjnHN1IAtse800M3vGzM6Z2bl0Ot2G8kREBPbhoK1z7lnn3Jxzbm5qaufNnERE5P60I/AXgeNb7j/QfGzbY8wsDIwAmTacW0REdqkdgf9T4LSZPWRmUeBzwPO3HfM88IXm7c8CP3DOuTacW0REdmnPC6+cc3Uz+xLwHSAEfN0597qZ/SFwzjn3PPB/A//BzBaAFTbeFEREpIvastLWOfcC8MJtj/3Blttl4HfacS4REXlv9t2grYiIdIYCX0RkQCjwRUQGhAJfRGRAKPBFRAbEQO2Hn86Vd7wohIhIvxuYFn46V+bsQoZyLWAiEaNcCzi7kCGdK/e6NBGRrhiYFv78Uo7ABVxczpMv10j6ESaTUeaXcmrli8hAGJjAv7xS4OpqiXg0zLAfpVxvsHAzT7nWALRJm4j0v4Hp0smWanhmDEVCmMFQJIRnRrZU63VpIiJdMTCBP+yHCRyUaw2cc5RrDQK38biIyCAYmLR7cCKJHw6TKVTJluukYmEenvQ5PKL+exEZDAMT+LPTKZbzVU5OJYlHQxSrDfKVOrPTqV6XJiLSFQPTpTOV8jlzagI/4pEpVPAjHmdOTWiGjogMjIFo4d++4OojJxX0IjJ4+j7w37ye5blXrhEEAePJGLVGwHK+qta9iAycvu7SSefKPPfKNUKeMZUaotZwvJ0uELiA+aVcr8sTEemqvm7hzy/lCALHRCK6OfceIJ2rEAn19XudiMgd+jr1VotVxhNRKvVg8zE/HGIlX2EsHu1hZSIi3dfXLXwDVgpV3skUGPEjHBsboh44PM/TdEwRGTh928JP58pcyhS4tFIgCBxL2TI/eWeFXLnG008c1YCtiAycvm3hv3Qxw2qxzkOTSbKlKuulOg3nmD2c4pEjI70uT0Sk6/o28N+4vs7oUIShaIjJZAyAUrXB4lqpx5WJiPRG33bpGIbD3fKYw2FYjyoSEemtvg38R44Mky3VKNUaOAelWoNsqcYjR4Z7XZqISE/0ZZfOxmULHdlSldVCldF4hJF4lJnxOE+dHO91eSIiPdF3LfzWtWv9SJjfmD3MgxNJagGcnEzyqceOaHaOiAysvmvhzy/lSMbCJJpf/+ShGIVKHT/iKexFZKD1XeCvFquEPeP84rsXKz8+NkSpVu91aSIiPdV3XTqG8fKlVar1gGE/SrUe8PKlVc3OEZGB13eBDw424901v1vzcRGRwdV3ge+AD82MEgl5ZMt1IiGPD82MKu5FZOD1XR/+WDxKuRbw2AOjm4+1Bm1FRAZZ36Xg7HSKfKVOoVLHOUehUtfFykVE2GPgm9m4mX3PzC40v4/tcFzDzF5pfj2/l3Peiy5WLiKyvb126XwZ+Fvn3B+Z2Zeb9/+HbY4rOeee2OO5dm0q5SvgRURus9cunaeBbzRvfwP4Z3t8PRER6ZC9Bv5h59z15u0l4PAOx/lmds7MXjSzu74pmNkzzWPPpdPpPZYnIiIt9+zSMbPvA9PbPPWVrXecc87Mdpr9+KBzbtHMTgI/MLPzzrm3tzvQOfcs8CzA3NycZlOKiLTJPQPfOfeJnZ4zsxtmdsQ5d93MjgA3d3iNxeb3i2b2d8AHgW0DX0REOmOvXTrPA19o3v4C8NztB5jZmJnFmrcngTPAL/Z4XhERuU97Dfw/Av6pmV0APtG8j5nNmdmfNI95H3DOzF4Ffgj8kXNOgS8i0mV7mpbpnMsAH9/m8XPA7zVv/wPw2F7OIyIie9d3K21FRGR7CnwRkQGhwBcRGRAKfBGRAaHAFx
EZEAp8EZEBocAXERkQCnwRkQGhwBcRGRAKfBGRAaHAFxEZEAp8EZEBocAXERkQe72IuYjcQzpXZn4px+WVAtlSjWE/zIMTSWanU0yl/F6XJwNEgS/SQelcmbMLGQIXcHW1hGfGjfUKl1eKfPf1JZ58aJynTk4o+KUr1KUj0kHzSzmSsTDL+SrxaJhIyOPmeoVMvsahlM/FdJGzCxnSuXKvS5UBoMAX6aDVYpV4NES+XMMPh7iZK5OIhQgC8CMetSAgGQszv5TrdakyABT4Ih00Fo9SrDZI+hHK9QalagPDGIp6VOoBqViYeDTEarHa61JlACjwRTpodjpFvlJnMhmlWK3TCBz5So3hoTDFaoPj43GK1QZj8WivS5UBoEFbkQ6aSvmcOTXB/FKOcm2jdX8jV2YoEuHUoQSRkEe+Uufx4xO9LlUGgAJfpMOmUn5zFs4U8O40zdViFT/i8fhxzdKR7lDgi7TZ1kAfi0fvmG/fegNoHffixcy2x4m0m/rwRdqoNe++XAuYSMQo14Jtp13u9jiRdlILX6SNWvPuE7GN/1qt7/NLuVta77cfV2sEXM4UWbiR4yMPT6i1Lx2hFr5IG7Xm3W+13bTLrcetFau8tpjFMzBzau1LxyjwRdqoNe9+q+2mXW497spKcSP8DVJDURKxsBZjSUeoS0ekjWanU5xdyAAbLftitbE57XLrYK4BK4UqR0fj5Mp1omGjXA84dSi5+W8zhUoPfxLpR2rhi7RRa969H/HIFCr4EY8zpzbm2G8dpPUjG22tcq1B4AIc8NixEUaGNj4JaDGWdIJa+CJt9u68+3f9+EL6jsHco6Nx/IjHF888xNmFDGHPwzl3y6cCkXZSC1+kg9K5Mj++kOZv37jB2+kc2dK7g7etwdydPhVolo60mwJfpEO2zrWfHvbJleucX8xuhn6r2+ZeC7VE2kWBL9IhW+faz0wkcA4MuJQpUKjUNzdV0wIs6RYFvkgbtbpw/urVRV58O0OtsTH1cjQe5QPHRkjFwixly5vdNsv56uabgplpSqZ0lAJfpE1u3y4hGg7x8qXVzS6c0XiUhw+l+Pj7DvOx01NMpfxdL9QSaYc9Bb6Z/Y6ZvW5mgZnN3eW4T5rZvJktmNmX93T19coAABAsSURBVHJOkf1qaxdOtlSjHgS8nS7wvdeXWC1UNrtxZqdTm/9mtwu1RNphry3814B/DvxopwPMLAR8DfgU8CjweTN7dI/nFdl3Wq311lYJsbDHrx4bplRr8OLFDOVa/Y7ZN60LpBQqdZxz274piLTLnubhO+feADCzux32JLDgnLvYPPYvgKeBX+zl3CL7Tau13toqwY+EcMATM2OcnEziR7w7Zt9svUBKplBhLB7V/vjSMd1YeHUMuLLl/lXgqZ0ONrNngGcAZmZmOluZSBu1tlXI5CtMJmOUag1KtQanDiXvulXCdgu1RDrhnoFvZt8Hprd56ivOuefaXZBz7lngWYC5uTnX7tcX6ZRWa/16tshyvsxEyufUoSQjQ1EKlfo9++U1H1867Z6B75z7xB7PsQgc33L/geZjIn1nKuXz2Q8f5+xChmQsTDwa2uyXv9tWCW9ez/LcK9cIgoDxZIxaI2A5X9WKW2mrbkzL/Clw2sweMrMo8Dng+S6cV6Qntm6V8M5yjgs31ylWa8wv5bZdUJXOlXnulWuEPGMqNUSt4Xg7XSBwgebjS1vtdVrmb5vZVeCjwN+Y2Xeajx81sxcAnHN14EvAd4A3gP/onHt9b2WL7G9TKZ/Z6RSJWJTTh4Y5MZHccRXt/FKOIHCMDkUwg6FIiKFIiHSuovn40lZ7naXzLeBb2zx+Dfj0lvsvAC/s5VwiB81uL3e4WqwynohSqQf4kY1FWH44RDpX4v1HR7pfuPQtbY8s0iGrxSoTidjm/WypyqVMgaXsRgu/NSg7Fo9SrQVcXC4AEAt7rJVqeJ6n+fjSVtpaQaRDtq6izZaqnF/MkivXmR72b+nemZ1OEQp5nJ
xMEPaMdK5CI3A8/cRRDdhKW6mFL9IhWy93eClTwADnYGYicUv3zux0irBnLNwo4HA89sAIT53U7BxpPwW+SIdsXUW7lC0zPewzM5FgtDkfPx4N8c5ybnPHzF87Nbl5tSuRTlDgi3TQ1lW05Vqw2bKHjU3S1st1Dg/H7zmwK9IOCnyRLtjavROPhjZb8iNDkVu2R95pYFekHTRoK9IFO123dmY8ccvA7ksXM/xiMcdKocbPLq3x7fPXdfUraRu18EW6ZKdN0lot/9cX17i6WiIS8viVQwlCHlxeKfHSxQyfefxYt8uVPqQWvkgPbW35z9/IMeJHmD2cJDUUxY+EGB2K8Mb19V6XKX1CLXyRHmu1/F98O0Ms4hGPvvvf0uEw7nq9CZFdUwtfZJ945Mgw2VKNUq2Bc1CqNciWajxyZLjXpUmfUOCL7BNPnRxnZjxO4BzZUoXAOWbG4zx1crzXpUmfUJeOyD4xlfL51GNHdBEU6RgFvsg+ossdSiepS0dEZEAo8EVEBoQCX0RkQCjwRUQGhAJfRGRAKPBFRAaEAl9EZEAo8EVEBoQCX0RkQCjwRUQGhAJfRGRAKPBFRAaEAl9EZEBot0wRkX0gnSt3fGtstfBFRHosnStzdiFDuRYwkYhRrgWcXciQzpXbeh618EVEeuyliytcXilQbwQk/Qgz43GSsTDzS7m2tvLVwhcR6aF0rsxPfrmCAcN+lGo94PxillqjwWqx2tZzqYUvItJD80s5hsIelzNFGgEMRT2Gh8JcuJnnQzNjbT2XWvgiIj10KZOnUg8oVBuEQ1BrBFzKlLicKTA7nWrruRT4IiI9tF6uk/TDzE6niIQ8aoEj4nkcHm7/9Y33FPhm9jtm9rqZBWY2d5fjfmlm583sFTM7t5dzioj0k5GhCIFzhDzjockkD08lOTbuc2xsqO3n2msf/mvAPwf+z10c+xvOueU9nk9EpK/MjCfwIyGW81XWy1WSfoSjo0kOtbl1D3sMfOfcGwBm1p5qREQGzOx0iuV8lZOTSeLREMVqg3yl3vb+e+heH74DvmtmL5vZM3c70MyeMbNzZnYunU53qTwRkd6YSvmcOTWBH/HIFCr4EY8zpyba3n8Pu2jhm9n3geltnvqKc+65XZ7nY865RTM7BHzPzN50zv1ouwOdc88CzwLMzc25Xb6+iMiBc/t2Ch852Zmgb7ln4DvnPrHXkzjnFpvfb5rZt4AngW0DX0RkELS2U0jGwkwkYhSrDc4uZDrWuocudOmYWcLMUq3bwG+yMdgrIjKw5pdyJGNhErEwZkYiFt7cTqFT9jot87fN7CrwUeBvzOw7zcePmtkLzcMOAz82s1eBnwB/45z7//ZyXhGRg261WKXWaHB+cY1/fDvN+cW1jmynsNVeZ+l8C/jWNo9fAz7dvH0ReHwv5xER6TeG8fKlVUbjUYb9KOV6g5cvrfLE8fZup7CVVtqKiPSEYyP2N25vfLfm452hwBcR6bJ0rswb19cJgoArqyWuZ8tEQh4fmhntYNxrt0wRka5qzc6JRUL4kRATZhSrDY6Px4mEPPxI59rhauGLiHRRa3bOrxxOUa4HOGAoEuLCjXzHVti2qIUvItJFl1cK5Mp1CpU6IYNqPcC5AOeso3PwQYEvItI16VyZKyslQp4xOrQxM6dUa/Dw1MZmaZ0Me1CXjohI18wv5Tg1lcQ5qNQb+GEPAy7cyHe0K6dFLXwRkS5ZLVY5MjpEPBbmykqRbLlOKhYm6Yc63roHBb6ISNeMxaMUqw1G41FG41EACpV6R2fmbKUuHRGRLpmdTpGvbAzYOucoVOodn5mzlVr4IiJd0NoKuVitcT1bZNgP8+BEksePd3ZmzlZq4YuIdFhrsVW5FjAWj1JrOOaX8qx1cKO07SjwRUQ6rLXYqh4EvHZtHc+MQymfi+kiZxcypHPlrtShwBcR6bDVYpV4NMTllSJDkRBDkRB+xKMWBB3fA38rBb6ISIe1ZufkyzX8cAiASj0gFQ
sTj4Y6ugf+Vgp8EZEOa83OCYc8SrU65VqDYrXBSDzMuUsrvHl9nR9fSHe8a0eBLyLSYVMpnzOnJjg5mSSdr9IIYGbc58KNPOulOu8/OkK5FnS8P1/TMkVEumAq5fOZx4/y1Mlx5pdyvPh2hpAZsbDHG0s5UrEwE4ko80u5g3sRcxEReddUyudjp6c4MhojFPKIRUKM+GFqjYC3lwtcyuQ7dm4FvohID6yX63gGfiSEmeFHQni28XinqEtHRKQLWittV4tVxuJRPIPAOUq1Bn44RLneIHCOkaFIx2pQ4IuIdFhrpW0yFmYiEaNYbbBWrHN4OEa14VgvV0n6EY6ObuyL3ykKfBGRDmuttE3ENiI3EQtz+nCSCzfyfPjBceLR0MY8fV3iUETkYFstVgl7xvnFPPlyjaQf4fjYEMfHh/AjHplChbF4tOMbqSnwRUQ6zDBevrTKaDzKsL9xacP//FaasUSMlB9hLB5ldjqlSxyKiBx8jo3Y37hdqNS4lq1QbzSYSMS6sugKFPgiIh3ngA/NjBIJeWTLda5nKySiHpdWSrx2LUu9S5uoKfBFRDpsLB4lGg7x2AOjPHpkmGJ1Y1+d8USUaj3g/GKWWqPR8U3UFPgiIh229dKGlzMF/EiIWsNxeNjf3C75ws08Y83r3HaKAl9EpMNam6f5EY+l9TIPjg9xeMQn7BnOOXCwnKt0/Nq2mqUjItIFUyl/cxZOuRZQawRcWSmSLdeJeB5PPjTe8Vk6CnwRkS6anU5trrr9wLGRzQVXT52c6Pi51aUjItJFW7t3MoUKfsTjzKnOLrhqUQtfRKTLtnbvdNOeWvhm9r+a2Ztm9nMz+5aZje5w3CfNbN7MFszsy3s5p4iIvDd77dL5HvAB59yvAm8B/+PtB5hZCPga8CngUeDzZvboHs8rIiL3aU+B75z7rnOutVv/i8AD2xz2JLDgnLvonKsCfwE8vZfziojI/WvnoO2/Ar69zePHgCtb7l9tPrYtM3vGzM6Z2bl0Ot3G8kREBts9B23N7PvA9DZPfcU591zzmK8AdeD/2WtBzrlngWcB5ubm3F5fT0RENtwz8J1zn7jb82b2ReAzwMedc9sF9CJwfMv9B5qPiYhIF+11ls4ngf8e+C3nXHGHw34KnDazh8wsCnwOeH4v5xURkfu31z78rwIp4Htm9oqZ/TGAmR01sxcAmoO6XwK+A7wB/Efn3Ot7PK+IiNynPS28cs6d2uHxa8Cnt9x/AXhhL+cSEZG90dYKIiIDQoEvIjIgbPuJNfuDmaWBS72u4z5MAsu9LuI9Uu29odp7o59rf9A5N7XdE/s68A8aMzvnnJvrdR3vhWrvDdXeG4Nau7p0REQGhAJfRGRAKPDb69leF7AHqr03VHtvDGTt6sMXERkQauGLiAwIBf4emNnvmNnrZhaY2Y6j5mb2SzM739x+4lw3a9zJfdS+765WZmbjZvY9M7vQ/D62w3GN5u/8FTPr6f5N9/o9mlnMzP6y+fxLZnai+1Vubxe1f9HM0lt+17/XizpvZ2ZfN7ObZvbaDs+bmf275s/1czP7ULdr3Mkuav91M8tu+Z3/wa5e2Dmnr/f4BbwPmAX+Dpi7y3G/BCZ7Xe/91g6EgLeBk0AUeBV4dB/U/r8AX27e/jLwb3c4Lt/rWnf7ewT+DfDHzdufA/6y13XfR+1fBL7a61q3qf2/Aj4EvLbD859m4xoeBnwEeKnXNd9H7b8O/PX9vq5a+HvgnHvDOTff6zrei13Wvl+vVvY08I3m7W8A/6yHtezGbn6PW3+mbwIfNzPrYo072a9/A/fknPsRsHKXQ54G/sxteBEYNbMj3anu7nZR+3uiwO8OB3zXzF42s2d6Xcx9uK+rlXXRYefc9ebtJeDwDsf5zaunvWhmvXxT2M3vcfMYt7HDbBaY6Ep1d7fbv4F/0ewW+aaZHd/m+f1ov/5979ZHzexVM/u2mb1/N/9gT7tlDoLdXPFrFz7mnFs0s0NsbCX9ZvMdvK
PaVHtP3K32rXecc87Mdppq9mDz934S+IGZnXfOvd3uWoW/Av7cOVcxs3/NxieV/7rHNfW7n7Hx9503s08D/wk4fa9/pMC/B3ePK37t8jUWm99vmtm32PiY3PHAb0PtPbta2d1qN7MbZnbEOXe9+RH85g6v0fq9XzSzvwM+yEZ/dLft5vfYOuaqmYWBESDTnfLu6p61O+e21vknbIyxHAQH9mp8zrn1LbdfMLP/3cwmnXN33R9IXTodZmYJM0u1bgO/CWw78r4P7derlT0PfKF5+wvAHZ9WzGzMzGLN25PAGeAXXavwVrv5PW79mT4L/MA1R+d67J6139bv/VtsXOjoIHge+JfN2TofAbJbugr3NTObbo3xmNmTbGT5vRsIvR6NPshfwG+z0e9XAW4A32k+fhR4oXn7JBszG14FXmejO+VA1N68/2ngLTZaxvul9gngb4ELwPeB8ebjc8CfNG//GnC++Xs/D/xuj2u+4/cI/CEblwcF8IH/F1gAfgKc7PXv+T5q/5+bf9uvAj8EHul1zc26/hy4DtSaf+u/C/w+8PvN5w34WvPnOs9dZtrtw9q/tOV3/iLwa7t5Xa20FREZEOrSEREZEAp8EZEBocAXERkQCnwRkQGhwBcRGRAKfBGRAaHAFxEZEAp8EZEB8f8DyQQUbzXji3YAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# normalize features\n",
"X_sc = StandardScaler().fit_transform(embeddings)\n",
"plt.figure(figsize=(6,6))\n",
"plt.scatter(X_sc[:, 0], X_sc[:, 1], alpha=0.25)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering "
]
},
{
"cell_type": "code",
"execution_count": 620,
"metadata": {},
"outputs": [],
"source": [
"clusters = DBSCAN(eps=1.1).fit_predict(embeddings)\n",
"n_clusters = clusters.max() + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualization"
]
},
{
"cell_type": "code",
"execution_count": 621,
"metadata": {},
"outputs": [],
"source": [
"langs_to_remove2 = r\"(Experience|Makefile|Markdown|Graphviz (DOT)|reStructuredText|YAML|Text)\"\n",
"def get_cluster_langs(dataframe, clusters, cluster, k=20):\n",
" mask = clusters == cluster\n",
" res = dataframe.iloc[mask] \n",
" res = res.where(res > 0).dropna(axis=\"columns\", how=\"all\").fillna(0)\n",
" n_devs, n_langs = res.shape\n",
" col_counts = res.astype(bool).sum(axis=0) # count nonzero values in each column\n",
" ind = numpy.argsort(col_counts)\n",
" k = min(k, n_langs) # show only k languages\n",
" ind = ind[-k:]\n",
" top_langs = res.columns[ind]\n",
" langs_list = []\n",
" for l in top_langs[::-1]:\n",
" lang = l.split(\"_\")[0]\n",
" if lang == \"JavaScript\":\n",
" lang = \"JS\"\n",
" if not re.search(langs_to_remove2, l) and lang not in langs_list:\n",
" langs_list.append(lang)\n",
" return \" - \".join(langs_list[:3])"
]
},
{
"cell_type": "code",
"execution_count": 622,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/waren/.local/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
"\n",
"Consider using IPython.display.IFrame instead\n",
"\n"
]
},
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~warensourced/39.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
],
"text/plain": [
"<chart_studio.tools.PlotlyDisplay object>"
]
},
"execution_count": 622,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import plotly.plotly as py\n",
"#import plotly.offline as py\n",
"\n",
"data_traces = []\n",
"cluster_colors = cm.rainbow(numpy.linspace(0, 1, n_clusters))\n",
"\n",
"for cluster_id, color in enumerate(cluster_colors):\n",
" trace_devs = go.Scatter(\n",
" x = embeddings[:len(dev_labels)][numpy.where(clusters[:len(dev_labels)] == cluster_id)[0], 0],\n",
" y = embeddings[:len(dev_labels)][numpy.where(clusters[:len(dev_labels)] == cluster_id)[0], 1],\n",
" name = get_cluster_langs(df, clusters=clusters, cluster=cluster_id),\n",
" mode = \"markers\",\n",
" showlegend=False,\n",
" marker = dict(color = to_hex(color), size = 7, opacity = 1, symbol = \"bowtie\"),\n",
" text = [dev_labels[i] for i in numpy.where(clusters[:len(dev_labels)] == cluster_id)[0]]\n",
" )\n",
" trace_projects = go.Scatter(\n",
" x = embeddings[len(dev_labels):][numpy.where(clusters[len(dev_labels):] == cluster_id)[0], 0],\n",
" y = embeddings[len(dev_labels):][numpy.where(clusters[len(dev_labels):] == cluster_id)[0], 1],\n",
" mode = \"markers\",\n",
" name = get_cluster_langs(df, clusters=clusters, cluster=cluster_id),\n",
" marker = dict(color = to_hex(color), size = 9, opacity = 0.6, symbol = \"circle\"),\n",
" text = [project_labels[i] for i in numpy.where(clusters[len(dev_labels):] == cluster_id)[0]]\n",
" )\n",
" data_traces.extend([trace_devs, trace_projects])\n",
" \n",
"\n",
"layout = go.Layout(\n",
" title = go.layout.Title(\n",
" text = \"Developer similarity based on language experience\"\n",
" \"<br> on source{d} codebase\"),\n",
" autosize=False,\n",
" width=1000,\n",
" height=1000,\n",
" xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
" yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)\n",
")\n",
"\n",
"fig = go.Figure(data=data_traces, layout=layout)\n",
"py.iplot(fig, filename='language-experience-source{d}.html')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Commit count by repo"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"datetime.datetime(2014, 7, 28, 3, 3, 54)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datetime import datetime, timedelta\n",
"\n",
"min_start_time = None\n",
"for _, _, _, start_time, _ in repos:\n",
" if min_start_time is None or min_start_time > start_time:\n",
" min_start_time = start_time\n",
"min_start_time = datetime.utcfromtimestamp(min_start_time)\n",
"min_start_time"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"343\n"
]
}
],
"source": [
"commit_count = defaultdict(lambda: defaultdict(int)) # commit_count[dev_id][repo_id]\n",
"last_commit = {}\n",
"for repo, people, days, start_time, _ in repos:\n",
" base_date = datetime.utcfromtimestamp(start_time)\n",
" people_map = {}\n",
" for i, p in enumerate(people):\n",
" try:\n",
" people_map[i] = dev_map[p.split(\"|\", 1)[0]]\n",
" except KeyError:\n",
" continue\n",
" for day, devs in sorted(days.items()):\n",
" date = (base_date + timedelta(days=day)).date()\n",
" for dev, devday in devs.items():\n",
" #devday_bounded = apply_UB(devday, upper_bounds)\n",
" #if dev in people_map:\n",
" commit_count[people_map[dev]][repo] += devday.Commits\n",
" try:\n",
" if date > last_commit[people_map[dev]]:\n",
" last_commit[people_map[dev]] = date\n",
" except KeyError:\n",
" last_commit[people_map[dev]] = date\n",
"print(len(commit_count))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"85\n"
]
}
],
"source": [
"for dev, last_date in last_commit.items():\n",
" if last_date.year < 2019:\n",
" del commit_count[dev]\n",
"print(len(commit_count))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36\n"
]
}
],
"source": [
"lower_global_commit_threshold = 30\n",
"datalist = sorted((k, v) for k, v in commit_count.items()\n",
" if sum([n_commits for n_commits in v.values()]) > lower_global_commit_threshold)\n",
"dev_labels = [id2dev[k[0]] for k in datalist]\n",
"print(len(datalist))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"311"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"commit_values = []\n",
"for dev, commit_count_by_repo in datalist:\n",
" commit_values.extend(commit_count_by_repo.values())\n",
"commit_values = array(commit_values)\n",
"upper_bound = int(np.percentile(commit_values[commit_values != 0], 99))\n",
"upper_bound"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"repo2id = OrderedDict()\n",
"repos_names = set()\n",
"for dev, commit_count_by_repo in datalist:\n",
" repos_names.update(commit_count_by_repo.keys())\n",
"for i, repo in enumerate(sorted(repos_names)):\n",
" repo2id[repo] = i"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"vadim markovtsev : style-analyzer\n",
"vadim markovtsev : hercules\n",
"vadim markovtsev : ml\n",
"máximo cuadros ortiz : go-git\n",
"maxim sukharev : lookout\n",
"miguel molina : go-mysql-server\n",
"carlos martín : gitbase-web\n",
"(36, 120)\n"
]
}
],
"source": [
"def build_commit_matrix(datalist):\n",
" commit_matrix = np.zeros((len(datalist), len(repo2id)))\n",
" for i, (dev, commit_count_by_repo) in enumerate(datalist):\n",
" for repo, commit_count in commit_count_by_repo.items():\n",
" if commit_count < 2:\n",
" continue\n",
" elif commit_count > upper_bound:\n",
" commit_count = upper_bound\n",
" print(\"%s : %s\"% (id2dev[dev], repo))\n",
" commit_matrix[i, repo2id[repo]] = commit_count\n",
" return commit_matrix\n",
"\n",
"commit_matrix = build_commit_matrix(datalist)\n",
"print(commit_matrix.shape)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"0 : vadim markovtsev\n",
"--- total commit count : 2851\n",
"dev-similarity : 7\n",
"style-analyzer : 434\n",
"tab-vs-spaces : 8\n",
"ml-mining : 1\n",
"coreos-nvidia : 5\n",
"datasets : 59\n",
"tmsc : 16\n",
"engine : 4\n",
"hercules : 607\n",
"gitbase : 1\n",
"code-annotation : 2\n",
"lookout-sdk : 4\n",
"gitbase-web : 1\n",
"go-license-detector : 85\n",
"ml-core : 6\n",
"role2vec : 1\n",
"sparkpickle : 6\n",
"snippet-ranger : 5\n",
"blog : 195\n",
"tensorflow-codelab : 7\n",
"design : 1\n",
"talks : 4\n",
"engine-analyses : 17\n",
"tensorflow-swivel : 17\n",
"conferences : 31\n",
"guide : 25\n",
"treediff : 14\n",
"modelforge : 116\n",
"lookout-sdk-ml : 139\n",
"diffcuda : 17\n",
"jgit-spark-connector : 2\n",
"jgscm : 25\n",
"wmd-relax : 79\n",
"go-git : 7\n",
"borges : 1\n",
"lapjv : 25\n",
"vecino : 38\n",
"dev-explorer : 9\n",
"ml-backlog : 5\n",
"apollo : 93\n",
"enry : 11\n",
"lookout : 3\n",
"landing : 2\n",
"gypogit : 4\n",
"minhashcuda : 44\n",
"models : 8\n",
"awesome-machine-learning-on-source-code : 42\n",
"kmcuda : 210\n",
"code-completion : 44\n",
"seriate : 6\n",
"shell-complete : 3\n",
"okrs : 4\n",
"ml : 351\n",
"\n",
"1 : marcelo novaes\n",
"--- total commit count : 62\n",
"dev-similarity : 2\n",
"datasets : 2\n",
"intro : 1\n",
"engine : 3\n",
"hercules : 1\n",
"design : 10\n",
"gemini : 3\n",
"guide : 18\n",
"homebrew : 9\n",
"labeled-generated-code : 2\n",
"apollo : 3\n",
"landing : 2\n",
"awesome-machine-learning-on-source-code : 4\n",
"okrs : 1\n",
"ml : 1\n",
"\n",
"2 : rafael porres molina\n",
"--- total commit count : 203\n",
"kubernetes-local-pv-provisioner : 1\n",
"style-analyzer : 1\n",
"datasets : 1\n",
"rovers : 1\n",
"code-annotation : 2\n",
"charts : 100\n",
"gitbase-web : 3\n",
"blog : 2\n",
"talks : 3\n",
"infrastructure-dockerfiles : 14\n",
"lookout-sonarcheck-analyzer : 3\n",
"conferences : 7\n",
"guide : 5\n",
"terraform-provisioner-online-rescue : 4\n",
"google-cloud-dns-healthcheck : 15\n",
"jgit-spark-connector : 3\n",
"borges : 1\n",
"lookout : 14\n",
"landing : 5\n",
"lookout-gometalint-analyzer : 2\n",
"ewbf-miner-docker : 6\n",
"sourced-ui : 1\n",
"docs : 2\n",
"terraform-provider-online : 7\n",
"\n",
"3 : maartje eyskens\n",
"--- total commit count : 185\n",
"kubernetes-local-pv-provisioner : 5\n",
"style-analyzer : 4\n",
"gitbase : 2\n",
"code-annotation : 2\n",
"charts : 82\n",
"gitbase-web : 2\n",
"caddy-build : 11\n",
"lookout-terraform-analyzer : 12\n",
"blog : 2\n",
"infrastructure-dockerfiles : 15\n",
"lookout-sonarcheck-analyzer : 1\n",
"guide : 3\n",
"terraform-provisioner-online-rescue : 4\n",
"lookout : 7\n",
"landing : 1\n",
"lookout-gometalint-analyzer : 1\n",
"sourced-ui : 4\n",
"terraform-provider-online : 27\n",
"\n",
"4 : máximo cuadros ortiz\n",
"--- total commit count : 1195\n",
"go-log : 20\n",
"coreos-nvidia : 22\n",
"go-borges : 18\n",
"proteus : 1\n",
"core-retrieval : 2\n",
"hercules : 2\n",
"rovers : 50\n",
"gitbase : 12\n",
"charts : 4\n",
"go-git-fixtures : 31\n",
"go-billy-gluster : 1\n",
"role2vec : 1\n",
"go-errors : 10\n",
"sparkpickle : 1\n",
"snippet-ranger : 1\n",
"blog : 47\n",
"go-queue : 12\n",
"talks : 14\n",
"fsbench : 14\n",
"core : 1\n",
"guide : 55\n",
"go-parse-utils : 2\n",
"homebrew : 3\n",
"modelforge : 2\n",
"go-billy-siva : 5\n",
"git-fixture : 5\n",
"go-compose-installer : 15\n",
"diffcuda : 2\n",
"jgit-spark-connector : 3\n",
"wmd-relax : 2\n",
"go-git : 584\n",
"borges : 13\n",
"vecino : 1\n",
"enry : 18\n",
"go-billy : 69\n",
"ci : 5\n",
"lookout : 14\n",
"landing : 5\n",
"gypogit : 2\n",
"minhashcuda : 1\n",
"beanstool : 23\n",
"awesome-machine-learning-on-source-code : 1\n",
"go-mysql-server : 17\n",
"kmcuda : 1\n",
"swivel-spark-prep : 1\n",
"combustion : 20\n",
"go-kallax : 4\n",
"shell-complete : 1\n",
"okrs : 5\n",
"framework : 4\n",
"terraform-provider-online : 7\n",
"ml : 3\n",
"go-siva : 38\n",
"\n",
"5 : konstantin slavnov\n",
"--- total commit count : 536\n",
"style-analyzer : 216\n",
"ml-mining : 1\n",
"datasets : 2\n",
"tmsc : 4\n",
"hercules : 2\n",
"lookout-sdk : 1\n",
"go-license-detector : 1\n",
"ml-core : 5\n",
"snippet-ranger : 24\n",
"blog : 1\n",
"jupyter-spark-docker : 1\n",
"function-name-analyzer : 3\n",
"guide : 2\n",
"modelforge : 18\n",
"lookout-sdk-ml : 36\n",
"jgit-spark-connector : 3\n",
"jgscm : 1\n",
"code2vec : 1\n",
"wmd-relax : 4\n",
"vecino : 6\n",
"apollo : 14\n",
"minhashcuda : 6\n",
"awesome-machine-learning-on-source-code : 2\n",
"kmcuda : 2\n",
"ml : 180\n",
"\n",
"6 : hugo mougard\n",
"--- total commit count : 289\n",
"style-analyzer : 155\n",
"reading-club : 34\n",
"blog : 3\n",
"function-name-analyzer : 25\n",
"conferences : 1\n",
"formatml : 17\n",
"lookout-sdk-ml : 14\n",
"ml-backlog : 1\n",
"awesome-machine-learning-on-source-code : 39\n",
"\n",
"7 : santiago m. mola\n",
"--- total commit count : 715\n",
"style-analyzer : 1\n",
"go-log : 2\n",
"datasets : 15\n",
"intro : 1\n",
"core-retrieval : 33\n",
"hercules : 1\n",
"rovers : 24\n",
"gitbase : 66\n",
"siva-java : 1\n",
"charts : 5\n",
"go-git-fixtures : 5\n",
"go-license-detector : 1\n",
"go-errors : 1\n",
"blog : 3\n",
"go-queue : 6\n",
"jupyter-spark-docker : 2\n",
"core : 16\n",
"guide : 79\n",
"best-practices-experiments : 1\n",
"go-billy-siva : 8\n",
"jgit-spark-connector : 4\n",
"regression-borges : 1\n",
"go-git : 121\n",
"go-cli : 18\n",
"borges : 76\n",
"replicatedcom-application : 1\n",
"enry : 14\n",
"go-billy : 21\n",
"ci : 10\n",
"lookout : 28\n",
"landing : 2\n",
"lookout-gometalint-analyzer : 1\n",
"go-mysql-server : 63\n",
"swivel-spark-prep : 1\n",
"go-kallax : 5\n",
"okrs : 38\n",
"framework : 19\n",
"go-siva : 21\n",
"\n",
"8 : maxim sukharev\n",
"--- total commit count : 1375\n",
"style-analyzer : 3\n",
"go-log : 1\n",
"engine : 165\n",
"proteus : 1\n",
"hercules : 12\n",
"code-annotation : 164\n",
"charts : 6\n",
"lookout-sdk : 61\n",
"gitbase-web : 129\n",
"lookout-terraform-analyzer : 3\n",
"go-queue : 1\n",
"jupyter-spark-docker : 1\n",
"lookout-sonarcheck-analyzer : 28\n",
"gemini : 253\n",
"guide : 1\n",
"lookout-sdk-ml : 1\n",
"sourced-ce : 73\n",
"jgit-spark-connector : 3\n",
"apollo : 4\n",
"ci : 6\n",
"lookout : 384\n",
"landing : 1\n",
"lookout-gometalint-analyzer : 11\n",
"go-mysql-server : 2\n",
"lookout-test-fixtures : 10\n",
"sourced-ui : 47\n",
"go-kallax : 3\n",
"framework : 1\n",
"\n",
"9 : egor bulychev\n",
"--- total commit count : 210\n",
"style-analyzer : 42\n",
"hercules : 3\n",
"engine-analyses : 4\n",
"best-practices-experiments : 5\n",
"lookout-sdk-ml : 5\n",
"jgit-spark-connector : 3\n",
"jgscm : 2\n",
"apollo : 20\n",
"awesome-machine-learning-on-source-code : 27\n",
"code-completion : 6\n",
"ml : 93\n",
"\n",
"10 : irina khismatullina\n",
"--- total commit count : 257\n",
"style-analyzer : 237\n",
"ml-core : 2\n",
"conferences : 3\n",
"modelforge : 7\n",
"lookout-sdk-ml : 1\n",
"models : 4\n",
"ml : 3\n",
"\n",
"12 : waren long\n",
"--- total commit count : 230\n",
"style-analyzer : 94\n",
"datasets : 8\n",
"blog : 13\n",
"conferences : 2\n",
"modelforge : 4\n",
"lookout-sdk-ml : 11\n",
"apollo : 1\n",
"awesome-machine-learning-on-source-code : 11\n",
"shell-complete : 22\n",
"ml : 64\n",
"\n",
"13 : manuel carmona\n",
"--- total commit count : 542\n",
"go-log : 1\n",
"go-borges : 32\n",
"datasets : 24\n",
"core-retrieval : 5\n",
"rovers : 21\n",
"gitbase : 63\n",
"siva-java : 19\n",
"gitbase-web : 2\n",
"blog : 11\n",
"go-queue : 12\n",
"berserker : 7\n",
"gitcollector : 34\n",
"jgit-spark-connector : 95\n",
"go-git : 5\n",
"borges : 37\n",
"enry : 80\n",
"go-mysql-server : 78\n",
"framework : 15\n",
"go-siva : 1\n",
"\n",
"14 : kuba@sourced.tech\n",
"--- total commit count : 238\n",
"go-log : 10\n",
"go-vitess : 2\n",
"rovers : 1\n",
"gitbase : 67\n",
"go-queue : 6\n",
"guide : 1\n",
"go-git : 8\n",
"borges : 5\n",
"enry : 2\n",
"go-billy : 1\n",
"go-mysql-server : 128\n",
"regression-gitbase : 6\n",
"framework : 1\n",
"\n",
"15 : david pordomingo\n",
"--- total commit count : 590\n",
"go-log : 1\n",
"flamingo : 3\n",
"engine : 14\n",
"gitbase : 2\n",
"code-annotation : 76\n",
"charts : 1\n",
"go-git-fixtures : 2\n",
"lookout-sdk : 8\n",
"gitbase-web : 31\n",
"lookout-terraform-analyzer : 3\n",
"blog : 44\n",
"design : 1\n",
"talks : 15\n",
"lookout-sonarcheck-analyzer : 2\n",
"gemini : 14\n",
"guide : 28\n",
"docsrv : 3\n",
"sourced-ce : 8\n",
"jgit-spark-connector : 2\n",
"go-git : 4\n",
"enry : 3\n",
"ci : 8\n",
"lookout : 47\n",
"landing : 201\n",
"go-mysql-server : 2\n",
"sourced.ghost.io : 2\n",
"lookout-test-fixtures : 4\n",
"go-kallax : 37\n",
"docs : 23\n",
"go-siva : 1\n",
"\n",
"16 : miguel molina\n",
"--- total commit count : 1846\n",
"tab-vs-spaces : 2\n",
"datasets : 10\n",
"platform-starter : 13\n",
"flamingo : 134\n",
"engine : 41\n",
"proteus : 58\n",
"core-retrieval : 14\n",
"rovers : 16\n",
"gitbase : 289\n",
"siva-java : 9\n",
"go-git-fixtures : 2\n",
"go-license-detector : 1\n",
"go-errors : 1\n",
"blog : 38\n",
"go-queue : 3\n",
"talks : 12\n",
"core : 3\n",
"guide : 3\n",
"docsrv : 36\n",
"go-parse-utils : 10\n",
"go-billy-siva : 1\n",
"jgit-spark-connector : 206\n",
"go-git : 34\n",
"borges : 47\n",
"enry : 12\n",
"ci : 21\n",
"landing : 187\n",
"go-mysql-server : 316\n",
"code-completion : 53\n",
"go-kallax : 259\n",
"regression-gitbase : 2\n",
"framework : 13\n",
"\n",
"17 : guillem duran ballester\n",
"--- total commit count : 58\n",
"ml-mining : 4\n",
"confident-metrics : 7\n",
"ml-core : 25\n",
"modelforge : 2\n",
"lookout-sdk-ml : 20\n",
"\n",
"21 : javi fontan\n",
"--- total commit count : 658\n",
"go-borges : 60\n",
"datasets : 2\n",
"regression-core : 26\n",
"engine : 2\n",
"core-retrieval : 20\n",
"rovers : 3\n",
"gitbase : 100\n",
"charts : 15\n",
"go-billy-gluster : 18\n",
"gitbase-web : 1\n",
"go-queue : 19\n",
"jupyter-spark-docker : 8\n",
"conferences : 4\n",
"guide : 2\n",
"go-billy-siva : 22\n",
"gitcollector : 1\n",
"jgit-spark-connector : 16\n",
"regression-borges : 22\n",
"go-git : 72\n",
"go-cli : 3\n",
"borges : 142\n",
"go-billy : 7\n",
"ci : 1\n",
"go-mysql-server : 63\n",
"regression-gitbase : 19\n",
"framework : 4\n",
"go-siva : 6\n",
"\n",
"24 : francesc campoy\n",
"--- total commit count : 233\n",
"datasets : 31\n",
"engine : 20\n",
"gitbase : 4\n",
"go-license-detector : 1\n",
"engine-tour : 4\n",
"blog : 59\n",
"github-reminder : 11\n",
"engine-analyses : 5\n",
"conferences : 65\n",
"guide : 24\n",
"enry : 1\n",
"lookout : 1\n",
"landing : 1\n",
"awesome-machine-learning-on-source-code : 1\n",
"okrs : 1\n",
"docs : 2\n",
"product-backlog : 1\n",
"go-siva : 1\n",
"\n",
"25 : eiso kant\n",
"--- total commit count : 320\n",
"datasets : 1\n",
"intro : 1\n",
"engine : 30\n",
"gitbase : 1\n",
"go-git-fixtures : 1\n",
"blog : 68\n",
"guide : 127\n",
"code2vec : 1\n",
"go-git : 1\n",
"enry : 2\n",
"ci : 2\n",
"landing : 61\n",
"models : 1\n",
"awesome-machine-learning-on-source-code : 4\n",
"code-completion : 1\n",
"swivel-spark-prep : 1\n",
"go-kallax : 1\n",
"okrs : 15\n",
"ml : 1\n",
"\n",
"26 : antonio navarro perez\n",
"--- total commit count : 835\n",
"datasets : 2\n",
"proteus : 1\n",
"core-retrieval : 38\n",
"rovers : 66\n",
"gitbase : 199\n",
"siva-java : 12\n",
"go-git-fixtures : 7\n",
"go-queue : 2\n",
"jupyter-spark-docker : 2\n",
"core : 12\n",
"berserker : 6\n",
"guide : 1\n",
"go-billy-siva : 16\n",
"jgit-spark-connector : 103\n",
"go-git : 66\n",
"borges : 84\n",
"enry : 1\n",
"go-billy : 8\n",
"ci : 2\n",
"beanstool : 1\n",
"go-mysql-server : 198\n",
"regression-gitbase : 1\n",
"framework : 6\n",
"go-siva : 1\n",
"\n",
"27 : bzz@users.noreply.github.com\n",
"--- total commit count : 158\n",
"datasets : 2\n",
"reading-club : 24\n",
"rovers : 1\n",
"siva-java : 1\n",
"code-annotation : 8\n",
"lookout-sdk : 5\n",
"gitbase-web : 9\n",
"blog : 6\n",
"berserker : 7\n",
"lookout-sonarcheck-analyzer : 1\n",
"gemini : 16\n",
"conferences : 1\n",
"guide : 11\n",
"jgit-spark-connector : 8\n",
"go-git : 1\n",
"borges : 4\n",
"enry : 11\n",
"lookout : 16\n",
"minhashcuda : 1\n",
"lookout-gometalint-analyzer : 7\n",
"awesome-machine-learning-on-source-code : 11\n",
"lookout-test-fixtures : 1\n",
"okrs : 6\n",
"\n",
"31 : carlos martín\n",
"--- total commit count : 911\n",
"intro : 1\n",
"engine : 172\n",
"gitbase : 2\n",
"code-annotation : 43\n",
"charts : 8\n",
"lookout-sdk : 29\n",
"gitbase-web : 312\n",
"go-queue : 5\n",
"lookout-sonarcheck-analyzer : 3\n",
"gemini : 29\n",
"guide : 3\n",
"sourced-ce : 39\n",
"jgit-spark-connector : 9\n",
"go-cli : 3\n",
"borges : 1\n",
"apollo : 1\n",
"lookout : 226\n",
"landing : 1\n",
"lookout-gometalint-analyzer : 5\n",
"lookout-test-fixtures : 4\n",
"sourced-ui : 11\n",
"okrs : 4\n",
"\n",
"32 : lou marvin caraig\n",
"--- total commit count : 229\n",
"regression-core : 2\n",
"engine : 95\n",
"lookout-sdk : 46\n",
"gitbase-web : 12\n",
"guide : 2\n",
"sourced-ce : 32\n",
"lookout : 30\n",
"lookout-gometalint-analyzer : 8\n",
"lookout-test-fixtures : 1\n",
"sourced-ui : 1\n",
"\n",
"38 : alexander bezzubov\n",
"--- total commit count : 579\n",
"reading-club : 3\n",
"engine : 2\n",
"hercules : 11\n",
"gitbase : 1\n",
"siva-java : 2\n",
"lookout-sdk : 51\n",
"gitbase-web : 2\n",
"blog : 24\n",
"berserker : 87\n",
"lookout-sonarcheck-analyzer : 15\n",
"gemini : 107\n",
"conferences : 1\n",
"guide : 7\n",
"jgit-spark-connector : 42\n",
"borges : 14\n",
"apollo : 5\n",
"enry : 74\n",
"lookout : 41\n",
"lookout-gometalint-analyzer : 41\n",
"awesome-machine-learning-on-source-code : 6\n",
"swivel-spark-prep : 42\n",
"ml : 1\n",
"\n",
"46 : dependabot[bot]@users.noreply.github.com\n",
"--- total commit count : 95\n",
"engine : 16\n",
"gitbase-web : 36\n",
"lookout : 18\n",
"go-mysql-server : 1\n",
"ml : 24\n",
"\n",
"57 : robert lin\n",
"--- total commit count : 53\n",
"hercules : 53\n",
"\n",
"65 : juanjo alvarez martinez\n",
"--- total commit count : 48\n",
"rovers : 3\n",
"gitbase : 1\n",
"go-queue : 1\n",
"guide : 5\n",
"lookout-flake8-analyzer : 1\n",
"wmd-relax : 4\n",
"borges : 2\n",
"enry : 3\n",
"lookout : 2\n",
"go-mysql-server : 25\n",
"framework : 1\n",
"\n",
"68 : anna tsolakou\n",
"--- total commit count : 146\n",
"gitbase : 3\n",
"guide : 139\n",
"borges : 1\n",
"okrs : 3\n",
"\n",
"78 : ricardo baeta\n",
"--- total commit count : 84\n",
"code-annotation : 5\n",
"blog : 2\n",
"design : 58\n",
"guide : 3\n",
"awesome-machine-learning-on-source-code : 1\n",
"sourced.ghost.io : 5\n",
"sourced-ui : 10\n",
"\n",
"79 : david riosalido\n",
"--- total commit count : 40\n",
"code-annotation : 1\n",
"charts : 28\n",
"gitbase-web : 3\n",
"blog : 2\n",
"infrastructure-dockerfiles : 5\n",
"landing : 1\n",
"\n",
"80 : romain keramitas\n",
"--- total commit count : 104\n",
"charts : 3\n",
"blog : 4\n",
"infrastructure-dockerfiles : 1\n",
"guide : 1\n",
"modelforge : 23\n",
"jgit-spark-connector : 1\n",
"code2vec : 3\n",
"apollo : 18\n",
"minhashcuda : 2\n",
"models : 12\n",
"ml : 36\n",
"\n",
"84 : filip navara\n",
"--- total commit count : 47\n",
"go-git-fixtures : 2\n",
"go-git : 45\n",
"\n",
"138 : esther garcía\n",
"--- total commit count : 42\n",
"guide : 42\n",
"\n",
"139 : jorge schnura becerro\n",
"--- total commit count : 143\n",
"guide : 143\n",
"\n",
"168 : jeremy stribling\n",
"--- total commit count : 38\n",
"go-git : 38\n"
]
}
],
"source": [
"# Per-developer report: identity line, total commit count, then one line per repository.\n",
"for dev_id, repo_commits in datalist:\n",
"    total_commits = sum(repo_commits.values())\n",
"    print()\n",
"    print(\"%d : %s\" % (dev_id, id2dev[dev_id]))\n",
"    print(\"--- total commit count :\", total_commits)\n",
"    for repo_name, n_commits in repo_commits.items():\n",
"        print(\"%s : %d\" % (repo_name, n_commits))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"def add_overall_repos_contributed_column(df, datalist):\n",
"    \"\"\"Return a copy of `df` with an extra 'Experience' column.\n",
"\n",
"    'Experience' holds, for each entry of `datalist`, the number of repositories\n",
"    present in that entry's commit-count mapping.\n",
"\n",
"    Args:\n",
"        df: DataFrame with as many rows as `datalist` has entries, in the same order.\n",
"        datalist: iterable of (developer id, {repository: commit count}) pairs.\n",
"\n",
"    Returns:\n",
"        A new DataFrame; `df` itself is left untouched, so re-running the calling\n",
"        cell gives the same result.\n",
"\n",
"    Raises:\n",
"        ValueError: if `datalist` and `df` have different lengths (raised by pandas).\n",
"    \"\"\"\n",
"    overall_repos_contributed = [len(commit_count_by_repo) for _, commit_count_by_repo in datalist]\n",
"    return df.assign(Experience=overall_repos_contributed)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(36, 120)"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Commit-count matrix as a DataFrame with one column per repository\n",
"# (presumably one row per developer - TODO confirm against how commit_matrix is built).\n",
"df = pd.DataFrame(data=commit_matrix, columns=list(repo2id))\n",
"#df = add_overall_repos_contributed_column(df, datalist)\n",
"# Divide every row by its own total so that each row sums to 1.\n",
"commits_per_row = df.sum(axis=1)\n",
"df_norm = df.div(commits_per_row, axis=\"index\")\n",
"df_norm.shape"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>apollo</th>\n",
" <th>awesome-machine-learning-on-source-code</th>\n",
" <th>beanstool</th>\n",
" <th>berserker</th>\n",
" <th>best-practices-experiments</th>\n",
" <th>blog</th>\n",
" <th>borges</th>\n",
" <th>caddy-build</th>\n",
" <th>charts</th>\n",
" <th>ci</th>\n",
" <th>...</th>\n",
" <th>tab-vs-spaces</th>\n",
" <th>talks</th>\n",
" <th>tensorflow-codelab</th>\n",
" <th>tensorflow-swivel</th>\n",
" <th>terraform-provider-online</th>\n",
" <th>terraform-provisioner-online-rescue</th>\n",
" <th>tmsc</th>\n",
" <th>treediff</th>\n",
" <th>vecino</th>\n",
" <th>wmd-relax</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.038977</td>\n",
" <td>0.017603</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.081727</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.003353</td>\n",
" <td>0.001676</td>\n",
" <td>0.002934</td>\n",
" <td>0.007125</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.006706</td>\n",
" <td>0.005868</td>\n",
" <td>0.015926</td>\n",
" <td>0.033110</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.051724</td>\n",
" <td>0.068966</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.010152</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.507614</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.015228</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.035533</td>\n",
" <td>0.020305</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.010989</td>\n",
" <td>0.000000</td>\n",
" <td>0.06044</td>\n",
" <td>0.450549</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.148352</td>\n",
" <td>0.021978</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.025275</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.051648</td>\n",
" <td>0.014286</td>\n",
" <td>0.00000</td>\n",
" <td>0.004396</td>\n",
" <td>0.005495</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.015385</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.007692</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.002198</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 120 columns</p>\n",
"</div>"
],
"text/plain": [
" apollo awesome-machine-learning-on-source-code beanstool berserker \\\n",
"0 0.038977 0.017603 0.000000 0.0 \n",
"1 0.051724 0.068966 0.000000 0.0 \n",
"2 0.000000 0.000000 0.000000 0.0 \n",
"3 0.000000 0.000000 0.000000 0.0 \n",
"4 0.000000 0.000000 0.025275 0.0 \n",
"\n",
" best-practices-experiments blog borges caddy-build charts \\\n",
"0 0.0 0.081727 0.000000 0.00000 0.000000 \n",
"1 0.0 0.000000 0.000000 0.00000 0.000000 \n",
"2 0.0 0.010152 0.000000 0.00000 0.507614 \n",
"3 0.0 0.010989 0.000000 0.06044 0.450549 \n",
"4 0.0 0.051648 0.014286 0.00000 0.004396 \n",
"\n",
" ci ... tab-vs-spaces talks tensorflow-codelab \\\n",
"0 0.000000 ... 0.003353 0.001676 0.002934 \n",
"1 0.000000 ... 0.000000 0.000000 0.000000 \n",
"2 0.000000 ... 0.000000 0.015228 0.000000 \n",
"3 0.000000 ... 0.000000 0.000000 0.000000 \n",
"4 0.005495 ... 0.000000 0.015385 0.000000 \n",
"\n",
" tensorflow-swivel terraform-provider-online \\\n",
"0 0.007125 0.000000 \n",
"1 0.000000 0.000000 \n",
"2 0.000000 0.035533 \n",
"3 0.000000 0.148352 \n",
"4 0.000000 0.007692 \n",
"\n",
" terraform-provisioner-online-rescue tmsc treediff vecino \\\n",
"0 0.000000 0.006706 0.005868 0.015926 \n",
"1 0.000000 0.000000 0.000000 0.000000 \n",
"2 0.020305 0.000000 0.000000 0.000000 \n",
"3 0.021978 0.000000 0.000000 0.000000 \n",
"4 0.000000 0.000000 0.000000 0.000000 \n",
"\n",
" wmd-relax \n",
"0 0.033110 \n",
"1 0.000000 \n",
"2 0.000000 \n",
"3 0.000000 \n",
"4 0.002198 \n",
"\n",
"[5 rows x 120 columns]"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_norm.head()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(36, 2)"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# UMAP settings: project the normalized rows of df_norm down to 2 dimensions.\n",
"SEED = 1\n",
"n_neighbors = 7\n",
"min_dist = 0.1\n",
"n_components = 2\n",
"metric = \"cosine\"\n",
"umap_kwargs = dict(\n",
"    n_neighbors=n_neighbors,\n",
"    min_dist=min_dist,\n",
"    n_components=n_components,\n",
"    metric=metric,\n",
"    random_state=SEED,  # fixed seed so the projection is repeatable\n",
")\n",
"dim_red = UMAP(**umap_kwargs)\n",
"embeddings = dim_red.fit_transform(df_norm)\n",
"embeddings.shape"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.collections.PathCollection at 0x7f685b8949e8>"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAFlCAYAAADoPlOZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAafklEQVR4nO3dX2yk133e8edZcmcY/rFNcylRltZeC7ugo7ZR4hKq3TUKt1YCWSikOLEB6SZSa2ObtkIvelGoEJAUvmnSXhQIZNRduEYcoPCfClC9gTeVJTuG2i3kiCok6y8rSnGgXYveEc1IQ044s+T+esHhilyRyxm+78w7nPP9AARn3jmcc16Q8/DMec8544gQAKD/HSq6AQCA7iDwASARBD4AJILAB4BEEPgAkAgCHwASMVh0A67lyJEjcezYsaKbAQAHxjPPPPNWREzu9FhPB/6xY8c0OztbdDMA4MCw/Ve7PcaQDgAkgsAHgEQQ+ACQCAIfABJB4ANAIgh8AEgEgQ8AiSDwASARBD4AJILAB4BE9PTWCgCQgkp1VXMLVS3VGhofLml6akyTY0O510MPHwAKVKmu6tz8olYvXdbESFmrly7r3PyiKtXV3Osi8AGgQHMLVY2WBzVSHpRtjZQHNVoe1NxCNfe6GNIBgDblOQSzVGtoYqS87dhwaUCLK/U8mroNPXwAaEPeQzDjwyXVGuvbjtUa6xofLuXR3G3o4QNAG7YOwUi68n1uodpSL//qdwdHRkt6ZWFZ0kbPvtZY13J9Tbcenci97fTwAaANS7WGhksD244Nlwa0VGvs+bM7vTt4ZWFZH5sa1dDhQ1pcqWvo8CGdPD7RkVk69PABoA2bQzCbPXup9SGY3d4dvLXc0KdO7PiphLnKpYdv++u2L9p+YZfHP237bdvPNr9+L496AaDbpqfGtFxf00p9TRGhlfqalutrmp4a2/Nns7w7yENeQzp/LOmOPcr8r4j41ebXl3OqFwC6anJsSCePT+xrCKabF2h3ksuQTkQ8aftYHs8FAL1ucmxoX2Ps01NjOje/KKnzF2h30s2Ltp+0/ZztP7P9t3YrZPuU7Vnbs5VKpYvNA4DOyvLuIA/dumj7fyV9JCKWbd8p6X9IOrFTwYg4Lem0JM3MzESX2gcAXbHfdwd56EoPPyLeiYjl5u2zkg7bPtKNugEAG7rSw7c9JennERG2b9PGP5rFbtQNAEXr1m6Ye8kl8G1/U9KnJR2xfV7S70s6LEkR8VVJn5f0z22vSfobSfdEBMM1APre5mKr0fKgJkbKqjXWdW5+satj95vymqVz7x6PPyzp4TzqAoCDJOtWDHliawUA6KCiF1ttReADQAcVvdhqKwIfADooy1YMeSPwAaCDil5stRW7ZQJAhxW52GorevgAkAgCHwASwZAOAOSsV1bWXo0ePgDkKO8POc8TPXwAyNF+V9Z2410BgQ8AbdgrmJdqDU2MlLf9zHBpQIsr9Ws+Zzf222FIBwBa1MpwzX5W1m59V2BbI+VBjZYHNbdQzbX9BD4AtKiVYN7Pytpu7bdD4ANAi1oJ5v2srO3WfjuM4QNAizaDefNCrLRzMLe7srZbH25ODx8AWtSpjdC6td8OPXwAaNFmMM8tVLW4Utf4cEm3Hs0nmLux3w6BDwBt6JWN0PaDIR0ASASBDwCJIPABIBEEPgAkgsAHgEQQ+ACQCAIfABJB4ANAIgh8AEgEgQ8AiSDwASARBD4AJILAB4BEEPgAkAgCHwASQeADQCJyCXzbX7d90fYLuzxu239ke972T2x/PI96AQCty6uH/8eS7rjG45+VdKL5dUrSf86pXgBAi3IJ/Ih4UtIvrlHkbkl/EhuekvQB2zfkUTcAoDXdGsO/UdIbW+6fbx4DAHRJz32Iue1T2hj20Yc//OGCWwOg0yrVVc0tVLVUa2h8uKTpqbED+yHhva5bPfwLko5uuX9T89h7RMTpiJ
iJiJnJycmuNA5AMSrVVZ2bX9TqpcuaGClr9dJlnZtfVKW6WnTT+lK3Av+MpN9pztb5hKS3I+LNLtUNoEfNLVQ1Wh7USHlQtjVSHtRoeVBzC9Wim9aXchnSsf1NSZ+WdMT2eUm/L+mwJEXEVyWdlXSnpHlJNUn/JI96ARxsS7WGJkbK244Nlwa0uFIvqEX9LZfAj4h793g8JP3LPOoC0D/Gh0uqNdY1Un43imqNdY0PlwpsVf9ipS2AwkxPjWm5vqaV+poiQiv1NS3X1zQ9NVZ00/oSgQ+gMJNjQzp5fEJDhw9pcaWuocOHdPL4BLN0OqTnpmUCSMvk2BAB3yX08AEgEQQ+ACSCwAeARBD4AJAIAh8AEkHgA0AiCHwASATz8AF0HFsg9wZ6+AA6ii2QeweBD6Cj2AK5dzCkA6Cjit4CmeGkd9HDB9BRm1sgb9WtLZAZTtqOHj6AjpqeGtO5+UVJGz37WmNdy/U13Xp0QlJne+Bbh5MkXfk+t1BNspdPDx9AR11rC+RO98CXag0Nlwa2HRsuDWip1sjl+Q8aevgAOm63LZA73QPnE7W2o4cPoDCd7oHziVrbEfgACtPpC7p8otZ2DOkAKMxeF3TzwCdqvYsePoDC0APvLnr4AApFD7x76OEDQCIIfABIBIEPAIkg8AEgEQQ+ACSCwAeARBD4AJAIAh8AEkHgA0AiCHwASASBDwCJyCXwbd9he872vO0Hd3j8ftsV2882v76UR70AgNZl3jzN9oCkr0j6dUnnJT1t+0xEvHRV0W9HxANZ6wMA7E8ePfzbJM1HxOsR0ZD0LUl35/C8AIAc5RH4N0p6Y8v9881jV/tt2z+x/YjtoznUCwBoQ7cu2v6ppGMR8SuSHpf0jd0K2j5le9b2bKVS6VLzAKD/5RH4FyRt7bHf1Dx2RUQsRkS9efdrkv7ubk8WEacjYiYiZiYnJ3NoHgBAyifwn5Z0wvZHbZck3SPpzNYCtm/YcvcuSS/nUC8AoA2ZZ+lExJrtByQ9JmlA0tcj4kXbX5Y0GxFnJP0r23dJWpP0C0n3Z60XANAeR0TRbdjVzMxMzM7OFt0MADgwbD8TETM7PcZKWwBIBIEPAIkg8AEgEQQ+ACSCwAeARBD4AJAIAh8AEkHgA0AiCHwASASBDwCJIPABIBEEPgAkgsAHgERk3h4ZAHpNpbqquYWqlmoNjQ+XND01psmxoaKbVTh6+AD6SqW6qnPzi1q9dFkTI2WtXrqsc/OLqlRXi25a4Qh8AH1lbqGq0fKgRsqDsq2R8qBGy4OaW6gW3bTCEfgA+spSraHh0sC2Y8OlAS3VGgW1qHcQ+AD6yvhwSbXG+rZjtca6xodLBbWodxD4APrK9NSYlutrWqmvKSK0Ul/Tcn1N01NjRTetcAQ+gL4yOTakk8cnNHT4kBZX6ho6fEgnj08wS0dMywTQhybHhgj4HdDDB4BE0MMHcKCxyKp1BD6AA2tzkdVoeVATI2XVGus6N7+4bcyefwjvYkgHwIG11yIrVt1uRw8fQE+7Vg99qdbQxEh5W/nh0oAWV+qStv9DkHTl+9xCNclePj18AD1rrx76XousWHW7HYEPoGftNWSz1yIrVt1uR+AD6Fl79dD3WmTFqtvtkhnDL/pKfdH1AwfRZg99c+xdem8P/VqLrDb/IcwtVLW4Utf4cEm3Hk131W0Sgd/K1K1+rh84qKanxnRuflHSRs++1ljXcn1Ntx6daPk5WHX7riSGdIreH7vo+oGDin1x8pVED3+vqVv9Xj9wkNFDz08SPfyir9QXXT8ASDkFvu07bM/Znrf94A6Pl21/u/n4j20fy6PeVhV9pb7o+gFAyiHwbQ9I+oqkz0q6RdK9tm+5qtgXJS1FxHFJ/0nSH2attx1FjwMWXT8ASPmM4d8maT4iXpck29+SdLekl7aUuVvSv2vefkTSw7YdEZFD/S0pehyw6PoBII/Av1HSG1vun5f093YrExFrtt+WNCHprRzq7xjmzgO9gd
diPnruoq3tU7Znbc9WKpXC2sEue0Bv4LWYnzx6+BckHd1y/6bmsZ3KnLc9KOn9khZ3erKIOC3ptCTNzMzsa8gnj94Au+wBvYHXYn7y6OE/LemE7Y/aLkm6R9KZq8qckXRf8/bnJf2wU+P3efUG2GUP6A28FvOTOfAjYk3SA5Iek/SypO9ExIu2v2z7rmax/yppwva8pH8t6T1TN/OS16pW5s4DvYHXYn5yWWkbEWclnb3q2O9tub0q6Qt51LWXvFa15rGHB4DseC3mp+cu2maVV2+AufNAb5gcG9LHpkb16sV39PhLC3r14jv62NQor8V96Lu9dPLsDbQzd55pY0BnVKqremVhWSeue59uvWnjNf3KwrImRsu8xtrUdz38rT3zv3yrqlcvvqNa45LmFqodm8bFtDGgc9htNj99F/jSRuhPT41ppFzSievep2MTox0NYf4ggc5hlk5++jLwpe6GMH+QQOcwSyc/fRv43Qxh/iCBzmG32fz0beB3M4T5gwQ6hxlz+em7WTqbujl3d+sHJf/lW1W9s7qm9//S4SvDR/xhAtmw22w++jbw2/20+qzTKjfLvrXc0PXvG77yT4YPKwfQK/o28KXWewWb0ypHy4OaGCnvO6jZ5AlAL+vbMfx25DWjh9k6AHoZga/8gprZOgB6GYGv/IKa2ToAehmBr/yCmuljAHpZX1+0bVU7M3r2ms3D9DEAvYrAb2olqPOazQMARWBIpw1skgbgICPw28C0SwAHGYHfBqZdAjjICPw2MO0SwEFG4LeBaZcADjJm6bSJaZcADip6+ACQCAIfABLBkE6HZd1nHwDyQg+/gzZX5q5euqyJkbJWL13WuflFVaqrRTcNQIII/A5iZS6AXkLgdxArcwH0EgK/g1iZC6CXEPgdxMpcAL2EwO8gVuYC6CVMy+wwVuYC6BX08AEgEQQ+ACSCwAeARGQKfNsftP247Veb38d3Kbdu+9nm15ksdQIA9idrD/9BST+IiBOSftC8v5O/iYhfbX7dlbFOAMA+ZA38uyV9o3n7G5J+M+PzAQA6JGvgXx8RbzZvL0i6fpdyQ7ZnbT9l+5r/FGyfapadrVQqGZsHANi05zx8209ImtrhoYe23omIsB27PM1HIuKC7Zsl/dD28xHx2k4FI+K0pNOSNDMzs9vzAQDatGfgR8Ttuz1m++e2b4iIN23fIOniLs9xofn9dds/kvRrknYMfABAZ2Qd0jkj6b7m7fskfffqArbHbZebt49IOinppYz1AgDalDXw/0DSr9t+VdLtzfuyPWP7a80yvyxp1vZzkv5c0h9EBIEPAF2WaS+diFiU9Jkdjs9K+lLz9v+R9Hey1AMAyI6VtgCQCAIfABJB4ANAIgh8AEgEgQ8AiSDwASARBD4AJILAB4BEEPgAkAgCHwASQeADQCIIfABIBIEPAIkg8AEgEQQ+ACSCwAeARBD4AJAIAh8AEkHgA0AiCHwASASBDwCJIPABIBEEPgAkgsAHgEQMFt0AAOiWSnVVcwtVLdUaGh8uaXpqTJNjQ0U3q2vo4QNIQqW6qnPzi1q9dFkTI2WtXrqsc/OLqlRXi25a1xD4AJIwt1DVaHlQI+VB2dZIeVCj5UHNLVSLblrXEPgAkrBUa2i4NLDt2HBpQEu1RkEt6j4CH0ASxodLqjXWtx2rNdY1PlwqqEXdR+ADSML01JiW62taqa8pIrRSX9NyfU3TU2NFN61rCHwASZgcG9LJ4xMaOnxIiyt1DR0+pJPHJ5KapcO0TADJmBwbSirgr0YPHwASQeADQCIyBb7tL9h+0fZl2zPXKHeH7Tnb87YfzFInAGB/svbwX5D0W5Ke3K2A7QFJX5H0WUm3SLrX9i0Z6wUAtCnTRduIeFmSbF+r2G2S5iPi9WbZb0m6W9JLWeoGALSnG2P4N0p6Y8v9881jO7J9yvas7dlKpdLxxgFAKvbs4dt+QtLUDg89FBHfzbtBEXFa0mlJmpmZibyfHwBStWfgR8TtGe
u4IOnolvs3NY8BALqoG0M6T0s6YfujtkuS7pF0pgv1AgC2yDot83O2z0v6pKTv2X6sefxDts9KUkSsSXpA0mOSXpb0nYh4MVuzAQDtyjpL51FJj+5w/GeS7txy/6yks1nqAgBkw0pbAEgEgQ8AiSDwASARBD4AJILAB4BEEPgAkAgCHwASQeADQCIIfABIBIEPAInItLUCAPSiSnVVcwtVLdUaGh8uaXpqTJNjQ0U3q3D08AH0lUp1VefmF7V66bImRspavXRZ5+YXVamuFt20wtHDB9BX5haqGi0PaqS8EW+b3+cWqtt6+Sm+C6CHD6CvLNUaGi4NbDs2XBrQUq1x5X6q7wLo4QPoK+PDJdUa61d69pJUa6xrfLh05f5O7wLerjX0yDNv6Kbx4b7t8dPDB9BXpqfGtFxf00p9TRGhlfqalutrmp4au1Lm6ncBf11r6LW3VvTXK42+7vET+AD6yuTYkE4en9DQ4UNaXKlr6PAhnTw+sa23vvkuYNMbv6jpkKWJsSHZ1kh5UKPlQc0tVIs4hY5hSAdA35kcG7rmcMz01JjOzS9K2hjfX1yua2DA+vAHh6+UGS4NaHGl3vG2dhM9fADJufpdwAdGDuv4daN6/y+9O85/9bh/P6CHDyBJW98FbM7aWamvabg0oFpjXcv1Nd16dKLgVuaLHj6A5LUy7t8P6OEDgPYe9+8HBD6AvpPiKtpWMKQDoK+kuoq2FQQ+gL6ydRVtP8+p3w8CH0BfaWUvnVQR+AD6ytWraKX+nFO/HwQ+gL7Syl46qSLwAfSVVObU7wfTMgH0nRTm1O8HPXwASASBDwCJIPABIBEEPgAkIlPg2/6C7RdtX7Y9c41yP7X9vO1nbc9mqRMAsD9ZZ+m8IOm3JP2XFsr+w4h4K2N9AIB9yhT4EfGyJNnOpzUAgI7p1hh+SPq+7Wdsn7pWQdunbM/anq1UKl1qHgD0vz17+LafkDS1w0MPRcR3W6znUxFxwfZ1kh63/UpEPLlTwYg4Lem0JM3MzESLzw8A+5bK/vl7Bn5E3J61koi40Px+0fajkm6TtGPgA0A3be6fP1oe1MRIWbXGus7NL/bldgwdH9KxPWJ7bPO2pN/QxsVeAChcSvvnZ52W+Tnb5yV9UtL3bD/WPP4h22ebxa6X9L9tPyfpLyR9LyL+Z5Z6ASAvKe2fn3WWzqOSHt3h+M8k3dm8/bqkW7PUAwDtaGdMfnP//JHyu3HYr/vns9IWQF9p9zNtU9o/n8AH0FfaHZNPaf989sMH0FeWag1NjJS3HRsuDWhxpb7rz6Syfz49fAB9hc+03R2BD6CvpDQm3y4CH0BfSWlMvl2M4QPoO6mMybeLHj4AJILAB4BEMKQDIBmp7Iq5G3r4AJLQ7grcfkTgA0hCSrti7obAB5CElHbF3A2BDyAJrMAl8AEkghW4BD6ARLACl2mZABKS+gpcevgAkAgCHwASwZAOgCSkvspWoocPIAGsst1A4APoe6yy3UDgA+h7rLLdQOAD6Husst1A4APoe6yy3UDgA+h7rLLdwLRMAElIfZWtRA8fAJJB4ANAIgh8AEgEgQ8AiSDwASARBD4AJILAB4BEZAp82//R9iu2f2L7Udsf2KXcHbbnbM/bfjBLnQCA/cnaw39c0t+OiF+R9P8k/durC9gekPQVSZ+VdIuke23fkrFeAECbMgV+RHw/Itaad5+SdNMOxW6TNB8Rr0dEQ9K3JN2dpV4AQPvyHMP/p5L+bIfjN0p6Y8v9881jO7J9yvas7dlKpZJj8wAgbXvupWP7CUlTOzz0UER8t1nmIUlrkv5b1gZFxGlJp5vPW7H9V1mfs8uOSHqr6EZ0EOd3cPXzuUn9fX7tnNtHdntgz8CPiNuv9bjt+yX9Y0mfiYjYocgFSUe33L+peWxPETHZSrleYns2ImaKbkencH4HVz+fm9Tf55fXuWWdpXOHpH8j6a6IqO1S7GlJJ2x/1HZJ0j2SzmSpFwDQvqxj+A9LGpP0uO
1nbX9Vkmx/yPZZSWpe1H1A0mOSXpb0nYh4MWO9AIA2ZdoPPyKO73L8Z5Lu3HL/rKSzWeo6QE4X3YAO4/wOrn4+N6m/zy+Xc/POw+4AgH7D1goAkAgCPyPbX7D9ou3Ltne9im77p7afb17rmO1mG7No4/wO5PYZtj9o+3Hbrza/j+9Sbr35u3vWdk9POtjrd2G7bPvbzcd/bPtY91u5fy2c3/3NKd2bv68vFdHO/bD9ddsXbb+wy+O2/UfNc/+J7Y+3VUFE8JXhS9IvS5qW9CNJM9co91NJR4pubyfOT9KApNck3SypJOk5SbcU3fYWz+8/SHqweftBSX+4S7nlotva4vns+buQ9C8kfbV5+x5J3y663Tmf3/2SHi66rfs8v38g6eOSXtjl8Tu1scDVkj4h6cftPD89/Iwi4uWImCu6HZ3S4vkd5O0z7pb0jebtb0j6zQLbkodWfhdbz/kRSZ+x7S62MYuD/Le2p4h4UtIvrlHkbkl/EhuekvQB2ze0+vwEfveEpO/bfsb2qaIbk7O2ts/oMddHxJvN2wuSrt+l3FBzy4+nbPfyP4VWfhdXysTGtOm3JU10pXXZtfq39tvNIY9HbB/d4fGDKtNrLdO0zFS0sr1ECz4VERdsX6eNdQuvNP+bFy6n8+tZ1zq/rXciImzvNm3tI83f382Sfmj7+Yh4Le+2Ihd/KumbEVG3/c+08W7mHxXcpp5A4Lcg9theosXnuND8ftH2o9p4a9oTgZ/D+e17+4xuuNb52f657Rsi4s3mW+OLuzzH5u/vdds/kvRr2hhL7jWt/C42y5y3PSjp/ZIWu9O8zPY8v4jYei5f08Z1mn6R6bXGkE4X2B6xPbZ5W9JvSNrxKvwBdZC3zzgj6b7m7fskvecdje1x2+Xm7SOSTkp6qWstbE8rv4ut5/x5ST+M5hXBA2DP87tqTPsubazw7xdnJP1Oc7bOJyS9vWVIcm9FX5U+6F+SPqeNcbS6pJ9Leqx5/EOSzjZv36yN2QTPSXpRG0Mlhbc9r/Nr3r9TGx+C89oBO78JST+Q9KqkJyR9sHl8RtLXmrf/vqTnm7+/5yV9seh273FO7/ldSPqyNva8kqQhSf9d0rykv5B0c9Ftzvn8/n3zdfacpD+X9LGi29zGuX1T0puSLjVfd1+U9LuSfrf5uLXxgVKvNf8Wd50ZuNMXK20BIBEM6QBAIgh8AEgEgQ8AiSDwASARBD4AJILAB4BEEPgAkAgCHwAS8f8BDpDZzm99j6oAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Standardize both embedding axes, then scatter-plot the result.\n",
"scaler = StandardScaler()\n",
"X_sc = scaler.fit_transform(embeddings)\n",
"x_coords, y_coords = X_sc[:, 0], X_sc[:, 1]\n",
"plt.figure(figsize=(6, 6))\n",
"plt.scatter(x_coords, y_coords, alpha=0.25)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5\n"
]
}
],
"source": [
"# Density-based clustering of the 2-D embeddings.\n",
"# NOTE(review): this runs on the raw `embeddings`; the standardized `X_sc` computed in the\n",
"# previous cell is not used here - confirm which one is intended.\n",
"dbscan = DBSCAN(eps=1, min_samples=3)\n",
"clusters = dbscan.fit_predict(embeddings)\n",
"# Largest label + 1; DBSCAN marks noise points with -1, so they are not counted.\n",
"n_clusters = clusters.max() + 1\n",
"print(n_clusters)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 2, 3, 0, 0, 3, 4, 0, 0, 0, 3, 3, 4, 3, 0, 3, 1, 1, 3, 4,\n",
" 4, 4, 4, 4, 0, 3, 1, 1, 2, 0, 3, 1, 1, 3])"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"import chart_studio\n",
"\n",
"# NOTE(review): `my_api_key` has to be defined by an earlier cell - confirm it is read from\n",
"# the environment (or a prompt) and never hard-coded in the notebook.\n",
"chart_studio_credentials = dict(username='warensourced', api_key=my_api_key)\n",
"chart_studio.tools.set_credentials_file(**chart_studio_credentials)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Helper used later on to describe the clusters.\n",
"# NOTE(review): a later cell defines `get_cluster_repos` again with the same contract;\n",
"# one of the two cells can be dropped.\n",
"def get_cluster_repos(df, clusters, cluster_id, k=3):\n",
"    \"\"\"Return the names of the top-k columns of `df` for one cluster, comma-separated.\n",
"\n",
"    Rows whose entry in `clusters` equals `cluster_id` are selected, every column is\n",
"    summed over those rows, and the columns are ranked by that total (largest first).\n",
"\n",
"    Args:\n",
"        df: DataFrame whose column labels are strings (repository names in this notebook).\n",
"        clusters: array of cluster labels, one per row of `df`.\n",
"        cluster_id: label of the cluster to describe.\n",
"        k: maximum number of column names to return.\n",
"\n",
"    Returns:\n",
"        A string such as 'repo_a,repo_b,repo_c'.\n",
"    \"\"\"\n",
"    mask = clusters == cluster_id\n",
"    res = df.iloc[mask].sum(axis=0).sort_values(ascending=False)\n",
"    return \",\".join(res.index[:k])"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"# Helper used by the plotting cell to name each cluster after its dominant repositories.\n",
"def get_cluster_repos(df, clusters, cluster_id, k=3):\n",
"    \"\"\"Comma-separated names of the k columns with the largest totals inside one cluster.\n",
"\n",
"    Only the rows of `df` whose label in `clusters` equals `cluster_id` are counted.\n",
"    \"\"\"\n",
"    members = df.iloc[clusters == cluster_id]\n",
"    column_totals = members.sum(axis=0)\n",
"    ranked = column_totals.sort_values(ascending=False)\n",
"    top_names = ranked.index[:k]\n",
"    return \",\".join(top_names)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1000px\"\n",
" height=\"1000px\"\n",
" src=\"https://plot.ly/~warensourced/43.embed\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f685b2b0cf8>"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import chart_studio.plotly as py\n",
"import plotly.graph_objects as go\n",
"#import plotly.offline as py\n",
"\n",
"data_traces = []\n",
"# One colour per cluster label 0 .. n_clusters-1.\n",
"cluster_colors = cm.rainbow(numpy.linspace(0, 1, n_clusters))\n",
"# NOTE(review): hand-written team names; they are not used by the traces below.\n",
"labels = [\"ML\", \"Management/Product\", \"Infra\", \"Data retrieval\", \"Applications\"]\n",
"\n",
"# Only labels 0 .. n_clusters-1 are drawn, so points labelled -1 get no trace.\n",
"for cluster_id, color in enumerate(cluster_colors):\n",
"    # Row indices of the points in this cluster, computed once and reused for x, y and hover text.\n",
"    member_idx = numpy.where(clusters == cluster_id)[0]\n",
"    trace_devs = go.Scatter(\n",
"        x = embeddings[member_idx, 0],\n",
"        y = embeddings[member_idx, 1],\n",
"        name = str(cluster_id) + \" - \" + get_cluster_repos(df, clusters, cluster_id),\n",
"        mode = \"markers\",\n",
"        hoverinfo=\"text\",\n",
"        marker = dict(color = to_hex(color), size = 7, opacity = 1, symbol = \"circle\"),\n",
"        text = [str(cluster_id) + \" - \" + dev_labels[i] for i in member_idx]\n",
"    )\n",
"    data_traces.append(trace_devs)\n",
"\n",
"layout = go.Layout(\n",
"    title = go.layout.Title(\n",
"        text = \"Team detection algorithm on source{d} codebase\"),\n",
"    autosize=False,\n",
"    width=1000,\n",
"    height=1000,\n",
"    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
"    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
"    legend=dict(x=0, y=0)\n",
")\n",
"\n",
"fig = go.Figure(data=data_traces, layout=layout)\n",
"py.iplot(fig, filename='team-detection-source{d}.html')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment