Skip to content

Instantly share code, notes, and snippets.

@warenlg
Last active November 14, 2017 16:03
Show Gist options
  • Save warenlg/93a42f56582ace1635c255b680dd5de3 to your computer and use it in GitHub Desktop.
Save warenlg/93a42f56582ace1635c255b680dd5de3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline\n",
"import importlib\n",
"import numpy as np\n",
"import matplotlib.pyplot as pl\n",
"import pyarrow.parquet as pq\n",
"import os\n",
"from collections import Counter\n",
"from operator import eq\n",
"from bblfsh.sdkversion import VERSION"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# UASTs extraction with the source{d} engine"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Uasts collected from the 100-siva-files dataset located in /data/siva/100-java\n",
"PATH_TO_UASTS = \"/home/waren/sourced/science3_local/code_duplication/100_uasts/\"\n",
"Node = importlib.import_module(\"bblfsh.gopkg.in.bblfsh.sdk.%s.uast.generated_pb2\" % VERSION).Node"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nb of uasts : 200\n"
]
}
],
"source": [
"# Decode one UAST (protobuf Node) per parquet file found under PATH_TO_UASTS.\n",
"uasts = []\n",
"for dirpath, _subdirs, filenames in os.walk(PATH_TO_UASTS):\n",
"    for filename in filenames:\n",
"        table = pq.read_table(os.path.join(dirpath, filename))\n",
"        # NOTE(review): column 6 presumably holds the serialized UAST -- confirm against the parquet schema.\n",
"        serialized_uast = table[6].data.to_pylist()[0][0]\n",
"        uasts.append(Node.FromString(serialized_uast))\n",
"\n",
"print(\"nb of uasts :\", len(uasts))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_features(uasts):\n",
"    \"\"\"Flatten UASTs into one (roles, number of children) feature per node.\n",
"\n",
"    Each tree is traversed breadth-first, so features come out tree by tree\n",
"    and level by level.\n",
"\n",
"    Args:\n",
"        uasts: iterable of root nodes; every node exposes `roles` (iterable of\n",
"            sortable role ids) and `children` (sequence of child nodes).\n",
"\n",
"    Returns:\n",
"        List of `(tuple(sorted(roles)), len(children))` pairs, one per node.\n",
"    \"\"\"\n",
"    from collections import deque  # local import keeps this cell self-contained\n",
"\n",
"    features = []\n",
"    for uast in uasts:\n",
"        # deque.popleft() is O(1); list.pop(0) shifts the whole queue for every node.\n",
"        queue = deque([uast])\n",
"        while queue:\n",
"            node = queue.popleft()\n",
"            queue.extend(node.children)\n",
"            # Built-in sorted() replaces the bare `sort`, which only exists because\n",
"            # `%pylab inline` star-imports numpy into the namespace.\n",
"            features.append((tuple(sorted(node.roles)), len(node.children)))\n",
"    return features"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total number of nodes : 387967\n"
]
},
{
"data": {
"text/plain": [
"[((34, 57), 9),\n",
" ((18,), 1),\n",
" ((19, 41, 42), 1),\n",
" ((19, 41, 42), 1),\n",
" ((19, 41, 42), 1),\n",
" ((1, 41, 45, 47), 2),\n",
" ((1, 41, 45, 47), 2),\n",
" ((1, 41, 45, 47), 2),\n",
" ((1, 41, 45, 47), 2),\n",
" ((19, 60), 2)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features = get_features(uasts)\n",
"print(\"total number of nodes :\", len(features))\n",
"features[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Quantization of the number of children"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# Split the (roles, number of children) pairs into two parallel lists.\n",
"list_roles = [feature[0] for feature in features]\n",
"list_nb_children = [feature[1] for feature in features]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAD8CAYAAACLrvgBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFt5JREFUeJzt3XGsXvV93/H3Z3ZJk3QJJtwxapvZbZxKDtIW4gZP6ao0\nbsCEqmYSSY224WZWvC3QpVOlYLI/qJIgmS4rLVrC5MYeJspwEE2LVZy5HqHLJg3iS0gBQxh3DgnX\nAuxgB5pFCXPy3R/Pz9mTy72+J/e59nN97/slXT3nfM/vnPM7OsIfzjm/5zmpKiRJ6uJvDbsDkqSz\nh6EhSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTNDQ5LU2eJhd2C2nX/++bVixYphd0OS\nzioPP/zwt6tqZLp28y40VqxYwejo6LC7IUlnlSTf7NLO21OSpM4MDUlSZ4aGJKkzQ0OS1JmhIUnq\nzNCQJHVmaEiSOjM0JEmdGRqSpM7m3TfCB7Fi632d2j2z7crT3BNJmpu80pAkdWZoSJI6MzQkSZ0Z\nGpKkzqYNjSQ7kxxJ8viE+u8k+XqSg0n+oK9+Y5KxJE8lubyvvr7VxpJs7auvTPJQq38+yTmt/po2\nP9aWr5iNA5YkzVyXK407gPX9hSS/BmwA/n5VvRX4ZKuvBjYCb23rfDrJoiSLgE8BVwCrgWtaW4Bb\ngFur6s3AcWBzq28Gjrf6ra2dJGmIpg2NqvoycGxC+V8B26rqB63NkVbfAOyuqh9U1TeAMeAd7W+s\nqg5V1SvAbmBDkgDvBu5p6+8Crurb1q42fQ+wrrWXJA3JTJ9pvAX4R+220X9L8sutvhR4tq/deKtN\nVX8T8J2qOjGh/hPbastfau0lSUMy0y/3LQbOA9YCvwzcneQXZq1XP6UkW4AtABdddNGwuiFJ895M\nrzTGgS9Uz1eAHwHnA4eB5X3tlrXaVPUXgXOTLJ5Qp3+dtvyNrf2rVNX2qlpTVWtGRqZ9L7okaYZm\nGhp/DvwaQJK3AOcA3wb2ABvbyKeVwCrgK8ABYFUbKXUOvYfle6qqgAeAq9t2NwH3tuk9bZ62/Eut\nvSRpSKa9PZXkLuBdwPlJxoGbgJ3AzjYM9xVgU/sH/WCSu4EngBPAdVX1w7ad64F9wCJgZ1UdbLu4\nAdid5BPAI8COVt8BfDbJGL0H8Rtn4XglSQOYNjSq6popFv3TKdrfDNw8SX0vsHeS+iF6o6sm1r8P\nvG+6/kmSzhy/ES5J6szQkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6szQkCR1\nZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktTZtKGRZGeSI+0tfROX/V6SSnJ+m0+S25KMJXk0\nySV9bTclebr9beqrvz3JY22d25Kk1c9Lsr+1359kyewcsiRpprpcadwBrJ9YTLIcuAz4Vl/5Cnrv\nBV8FbAFub23Po/ea2EvpvaXvpr4QuB34YN96J/e1Fbi/qlYB97d5SdIQTRsaVfVleu/onuhW4CNA\n9dU2AHdWz4PAuUkuBC4H9lfVsao6DuwH1rdlb6iqB9s7xu8Erurb1q42vauvLkkakhk900iyAThc\nVX89YdFS4Nm++fFWO1V9fJI6wAVV9Vybfh64YCZ9lSTNnsU/7QpJXgd8lN6tqTOiqipJTbU8yRZ6\nt8O46KKLzlS3JGnBmcmVxi8CK4G/TvIMsAz4apK/CxwGlve1XdZqp6ovm6QO8EK7fUX7PDJVh6pq\ne1Wtqao1IyMjMzgkSVIXP3VoVNVjVfV3qmpFVa2gd0vpkqp6HtgDXNtGUa0FXmq3mPYBlyVZ0h6A\nXwbsa8teTrK2jZq6Fri37WoPcHKU1aa+uiRpSLoMub0L+J/ALyUZT7L5FM33AoeAMeBPgA8BVNUx\n4OPAgfb3sVajtflMW+d/A19s9W3Ae5I8Dfx6
m5ckDdG0zzSq6ppplq/omy7guina7QR2TlIfBS6e\npP4isG66/kmSzhy/ES5J6szQkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6szQ\nkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktRZlzf37UxyJMnjfbV/l+TrSR5N8mdJzu1b\ndmOSsSRPJbm8r76+1caSbO2rr0zyUKt/Psk5rf6aNj/Wlq+YrYOWJM3MtG/uA+4A/gNwZ19tP3Bj\nVZ1IcgtwI3BDktXARuCtwM8D/zXJW9o6nwLeQ++d4geS7KmqJ4BbgFuraneS/whsBm5vn8er6s1J\nNrZ2vzXY4c6OFVvv69TumW1XnuaeSNKZNe2VRlV9GTg2ofaXVXWizT4ILGvTG4DdVfWDqvoGvfd+\nv6P9jVXVoap6BdgNbEgS4N3APW39XcBVfdva1abvAda19pKkIZmNZxr/HPhim14KPNu3bLzVpqq/\nCfhOXwCdrP/Ettryl1r7V0myJcloktGjR48OfECSpMkNFBpJ/i1wAvjc7HRnZqpqe1Wtqao1IyMj\nw+yKJM1rXZ5pTCrJbwO/Aayrqmrlw8DyvmbLWo0p6i8C5yZZ3K4m+tuf3NZ4ksXAG1t7SdKQzOhK\nI8l64CPAb1bV9/oW7QE2tpFPK4FVwFeAA8CqNlLqHHoPy/e0sHkAuLqtvwm4t29bm9r01cCX+sJJ\nkjQE015pJLkLeBdwfpJx4CZ6o6VeA+xvz6YfrKp/WVUHk9wNPEHvttV1VfXDtp3rgX3AImBnVR1s\nu7gB2J3kE8AjwI5W3wF8NskYvQfxG2fheCVJA5g2NKrqmknKOyapnWx/M3DzJPW9wN5J6ofoja6a\nWP8+8L7p+idJOnP8RrgkqTNDQ5LUmaEhSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTND\nQ5LUmaEhSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJn04ZGkp1JjiR5vK92XpL9SZ5un0taPUlu\nSzKW5NEkl/Sts6m1fzrJpr7625M81ta5Le1VgFPtQ5I0PF2uNO4A1k+obQXur6pVwP1tHuAKeu8F\nXwVsAW6HXgDQe03spfTe0ndTXwjcDnywb7310+xDkjQk04ZGVX2Z3ju6+20AdrXpXcBVffU7q+dB\n4NwkFwKXA/ur6lhVHQf2A+vbsjdU1YNVVcCdE7Y12T4kSUMy02caF1TVc236eeCCNr0UeLav3Xir\nnao+Pkn9VPt4lSRbkowmGT169OgMDkeS1MXAD8LbFULNQl9mvI+q2l5Va6pqzcjIyOnsiiQtaDMN\njRfarSXa55FWPwws72u3rNVOVV82Sf1U+5AkDclMQ2MPcHIE1Cbg3r76tW0U1VrgpXaLaR9wWZIl\n7QH4ZcC+tuzlJGvbqKlrJ2xrsn1IkoZk8XQNktwFvAs4P8k4vVFQ24C7k2wGvgm8vzXfC7wXGAO+\nB3wAoKqOJfk4cKC1+1hVnXy4/iF6I7ReC3yx/XGKfUiShmTa0Kiqa6ZYtG6StgVcN8V2dgI7J6mP\nAhdPUn9xsn1IkobHb4RLkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6szQkCR1ZmhIkjoz\nNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6myg0Ejyb5IcTPJ4kruS/GySlUkeSjKW5PNJzmlt\nX9Pmx9ryFX3bubHVn0pyeV99fauNJdk6SF8lSYObcWgkWQr8a2BNVV0MLAI2ArcAt1bVm4HjwOa2\nymbgeKvf2tqRZHVb763AeuDTSRYlWQR8CrgCWA1c09pKkoZk0NtTi4HXJlkMvA54Dng3cE9bvgu4\nqk1vaPO05evae8E3ALur6gdV9Q16r4p9R/sbq6pDVfUKsLu1lSQNyYxDo6oOA58EvkUvLF4CHga+\nU1UnWrNx
YGmbXgo829Y90dq/qb8+YZ2p6pKkIRnk9tQSev/nvxL4eeD19G4vnXFJtiQZTTJ69OjR\nYXRBkhaEQW5P/Trwjao6WlX/F/gC8E7g3Ha7CmAZcLhNHwaWA7TlbwRe7K9PWGeq+qtU1faqWlNV\na0ZGRgY4JEnSqQwSGt8C1iZ5XXs2sQ54AngAuLq12QTc26b3tHna8i9VVbX6xja6aiWwCvgKcABY\n1UZjnUPvYfmeAforSRrQ4umbTK6qHkpyD/BV4ATwCLAduA/YneQTrbajrbID+GySMeAYvRCgqg4m\nuZte4JwArquqHwIkuR7YR29k1s6qOjjT/kqSBjfj0ACoqpuAmyaUD9Eb+TSx7feB902xnZuBmyep\n7wX2DtJHSdLs8RvhkqTODA1JUmeGhiSpM0NDktSZoSFJ6szQkCR1ZmhIkjozNCRJnRkakqTODA1J\nUmeGhiSpM0NDktSZoSFJ6szQkCR1ZmhIkjozNCRJnQ0UGknOTXJPkq8neTLJP0xyXpL9SZ5un0ta\n2yS5LclYkkeTXNK3nU2t/dNJNvXV357ksbbObe21spKkIRnozX3AHwP/paqubu/xfh3wUeD+qtqW\nZCuwFbgBuILe+79XAZcCtwOXJjmP3tv/1gAFPJxkT1Udb20+CDxE7w1+64EvDtjnM2bF1vs6t31m\n25WnsSeSNDtmfKWR5I3Ar9LeAV5Vr1TVd4ANwK7WbBdwVZveANxZPQ8C5ya5ELgc2F9Vx1pQ7AfW\nt2VvqKoHq6qAO/u2JUkagkFuT60EjgL/KckjST6T5PXABVX1XGvzPHBBm14KPNu3/nirnao+Pkld\nkjQkg4TGYuAS4Paqehvwf+jdivqxdoVQA+yjkyRbkowmGT169Ojp3p0kLViDhMY4MF5VD7X5e+iF\nyAvt1hLt80hbfhhY3rf+slY7VX3ZJPVXqartVbWmqtaMjIwMcEiSpFOZcWhU1fPAs0l+qZXWAU8A\ne4CTI6A2Afe26T3AtW0U1VrgpXYbax9wWZIlbaTVZcC+tuzlJGvbqKlr+7YlSRqCQUdP/Q7wuTZy\n6hDwAXpBdHeSzcA3gfe3tnuB9wJjwPdaW6rqWJKPAwdau49V1bE2/SHgDuC19EZNnTUjpyRpPhoo\nNKrqa/SGyk60bpK2BVw3xXZ2AjsnqY8CFw/SR0nS7PEb4ZKkzgwNSVJnhoYkqTNDQ5LUmaEhSerM\n0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTNDQ5LUmaEhSerM0JAkdWZoSJI6MzQkSZ0NHBpJ\nFiV5JMlftPmVSR5KMpbk8+2tfiR5TZsfa8tX9G3jxlZ/KsnlffX1rTaWZOugfZUkDWY2rjQ+DDzZ\nN38LcGtVvRk4Dmxu9c3A8Va/tbUjyWpgI/BWYD3w6RZEi4BPAVcAq4FrWltJ0pAMFBpJlgFXAp9p\n8wHeDdzTmuwCrmrTG9o8bfm61n4DsLuqflBV36D3DvF3tL+xqjpUVa8Au1tbSdKQDHql8UfAR4Af\ntfk3Ad+pqhNtfhxY2qaXAs8CtOUvtfY/rk9YZ6r6qyTZkmQ0yejRo0cHPCRJ0lRmHBpJfgM4UlUP\nz2J/ZqSqtlfVmqpaMzIyMuzuSNK8tXiAdd8J/GaS9wI/C7wB+GPg3CSL29XEMuBwa38YWA6MJ1kM\nvBF4sa9+Uv86U9UlSUMw4yuNqrqxqpZV1Qp6D7K/VFX/BHgAuLo12wTc26b3tHna8i9VVbX6xja6\naiWwCvgKcABY1UZjndP2sWem/ZUkDW6QK42p3ADsTvIJ4BFgR6vvAD6bZAw4Ri8EqKqDSe4GngBO\nANdV1Q8BklwP7AMWATur6uBp6K8kqaNZCY2q+ivgr9r0IXojnya2+T7wvinWvxm4eZL6XmDvbPRR\nkjQ4vxEuSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTNDQ5LUma
EhSerM0JAkdXY6fntK\nM7Bi632d2j2z7crT3BNJmppXGpKkzgwNSVJnhoYkqTNDQ5LU2SDvCF+e5IEkTyQ5mOTDrX5ekv1J\nnm6fS1o9SW5LMpbk0SSX9G1rU2v/dJJNffW3J3msrXNbkgxysJKkwQxypXEC+L2qWg2sBa5LshrY\nCtxfVauA+9s8wBX0XuW6CtgC3A69kAFuAi6l9/Kmm04GTWvzwb711g/QX0nSgAZ5R/hzVfXVNv03\nwJPAUmADsKs12wVc1aY3AHdWz4PAuUkuBC4H9lfVsao6DuwH1rdlb6iqB9u7xO/s25YkaQhm5ZlG\nkhXA24CHgAuq6rm26Hnggja9FHi2b7XxVjtVfXySuiRpSAYOjSQ/B/wp8LtV9XL/snaFUIPuo0Mf\ntiQZTTJ69OjR0707SVqwBgqNJD9DLzA+V1VfaOUX2q0l2ueRVj8MLO9bfVmrnaq+bJL6q1TV9qpa\nU1VrRkZGBjkkSdIpDDJ6KsAO4Mmq+sO+RXuAkyOgNgH39tWvbaOo1gIvtdtY+4DLkixpD8AvA/a1\nZS8nWdv2dW3ftiRJQzDIb0+9E/hnwGNJvtZqHwW2AXcn2Qx8E3h/W7YXeC8wBnwP+ABAVR1L8nHg\nQGv3sao61qY/BNwBvBb4YvuTJA3JjEOjqv4HMNX3JtZN0r6A66bY1k5g5yT1UeDimfZRkjS7/Ea4\nJKkzQ0OS1JmhIUnqzNCQJHVmaEiSOjM0JEmdGRqSpM4MDUlSZ4aGJKkzQ0OS1JmhIUnqzNCQJHVm\naEiSOjM0JEmdGRqSpM4GeQmThmDF1vs6tXtm25WnuSeSFqI5f6WRZH2Sp5KMJdk67P5I0kI2p0Mj\nySLgU8AVwGrgmiSrh9srSVq45vrtqXcAY1V1CCDJbmAD8MRQe3UW8DaWpNNhrofGUuDZvvlx4NIh\n9WVemu1wMazmJs+LZstcD41OkmwBtrTZ7yZ5aoabOh/49uz0as77qY41t8zuzmd7e6fgOf0pnMHz\nMgjP6enx97o0muuhcRhY3je/rNV+QlVtB7YPurMko1W1ZtDtnA0WyrEulOOEhXOsC+U4YW4e65x+\nEA4cAFYlWZnkHGAjsGfIfZKkBWtOX2lU1Ykk1wP7gEXAzqo6OORuSdKCNadDA6Cq9gJ7z9DuBr7F\ndRZZKMe6UI4TFs6xLpTjhDl4rKmqYfdBknSWmOvPNCRJc4ih0SyUnytJ8kySx5J8LcnosPszm5Ls\nTHIkyeN9tfOS7E/ydPtcMsw+zpYpjvX3kxxu5/ZrSd47zD7OhiTLkzyQ5IkkB5N8uNXn1Xk9xXHO\nuXPq7Sl+/HMl/wt4D70vEB4ArqmqeffN8yTPAGuqat6Nc0/yq8B3gTur6uJW+wPgWFVta/8zsKSq\nbhhmP2fDFMf6+8B3q+qTw+zbbEpyIXBhVX01yd8GHgauAn6beXReT3Gc72eOnVOvNHp+/HMlVfUK\ncPLnSnQWqaovA8cmlDcAu9r0Lnr/IZ71pjjWeaeqnquqr7bpvwGepPdLEfPqvJ7iOOccQ6Nnsp8r\nmZMnbBYU8JdJHm7fpJ/vLqiq59r088AFw+zMGXB9kkfb7auz+pbNRElWAG8DHmIen9cJxwlz7Jwa\nGgvPr1TVJfR+Ofi6dptjQajevdj5fD/2duAXgX8APAf8++F2Z/Yk+TngT4HfraqX+5fNp/M6yXHO\nuXNqaPR0+rmS+aCqDrfPI8Cf0bs1N5+90O4Xn7xvfGTI/TltquqFqvphVf0I+BPmyblN8jP0/iH9\nXFV9oZXn3Xmd7Djn4jk1NHoWxM+VJHl9e8hGktcDlwGPn3qts94eYFOb3gTcO8S+nFYn/xFt/jHz\n4NwmCbADeLKq/rBv0bw6r1Md51w8p46eatpQtj/i//9cyc1D7tKsS/IL9K4uoPdrAP95Ph1nkruA\nd9H7ZdAXgJuAPwfuBi4Cvgm8v6rO+g
fIUxzru+jdxijgGeBf9N33Pysl+RXgvwOPAT9q5Y/Su98/\nb87rKY7zGubYOTU0JEmdeXtKktSZoSFJ6szQkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSps/8H\njtx6qw9lqFcAAAAASUVORK5CYII=\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7ff98516fbe0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of the children counts below 30. np/pl are referenced explicitly so the\n",
"# cell does not depend on the names injected by `%pylab inline`.\n",
"values = np.array(list_nb_children)\n",
"_, bins, _ = pl.hist(values[values < 30], bins=30)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"proportion of leaves : 0.43184858506006957\n"
]
}
],
"source": [
"print(\"proportion of leaves :\", len([x for x in list_nb_children if x==0]) / len(list_nb_children))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"distinct number of children : 30\n"
]
}
],
"source": [
"print(\"distinct number of children :\", len(set(list_nb_children)))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"partition : [0, 1, 2, 3, 4, 6, 26, 78]\n"
]
}
],
"source": [
"# quantization of the number of children\n",
"\n",
"def get_quantization(list_nb_children, fineness_of_partition=2):  # fineness must be a positive integer\n",
"    \"\"\"Compute the boundaries used to quantize the number of children.\n",
"\n",
"    Distinct children counts are consumed in increasing order. A bucket is\n",
"    closed as soon as it holds at least\n",
"    `len(list_nb_children) / (100 * fineness_of_partition)` nodes, and the last\n",
"    count put into the bucket is recorded as a boundary. The largest count is\n",
"    always the final boundary.\n",
"\n",
"    Args:\n",
"        list_nb_children: number of children of every node; left untouched.\n",
"        fineness_of_partition: positive integer, larger values give smaller buckets.\n",
"\n",
"    Returns:\n",
"        Increasing list of boundaries, or an empty list for empty input.\n",
"    \"\"\"\n",
"    if not list_nb_children:\n",
"        return []\n",
"\n",
"    nodes_in_partition = len(list_nb_children) / (100 * fineness_of_partition)\n",
"    # Count once instead of rescanning the list for every distinct value. The input\n",
"    # is no longer sorted in place: it is a parallel list to `list_roles`.\n",
"    nb_nodes_per_count = Counter(list_nb_children)\n",
"    # sorted() is required: the iteration order of a set is not guaranteed.\n",
"    unique_nb_children = sorted(nb_nodes_per_count)\n",
"    max_nb_children = unique_nb_children[-1]\n",
"    partition = []\n",
"    id_nb_children = 0\n",
"\n",
"    while True:\n",
"        nb_nodes_cumulate = 0\n",
"        while nb_nodes_cumulate < nodes_in_partition and unique_nb_children[id_nb_children] < max_nb_children:\n",
"            nb_nodes_cumulate += nb_nodes_per_count[unique_nb_children[id_nb_children]]\n",
"            id_nb_children += 1\n",
"        if unique_nb_children[id_nb_children] != max_nb_children:\n",
"            partition.append(unique_nb_children[id_nb_children - 1])\n",
"        else:\n",
"            # Reached the largest count: it closes the partition.\n",
"            partition.append(max_nb_children)\n",
"            break\n",
"    return partition\n",
"\n",
"print(\"partition :\", get_quantization(list_nb_children))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"Distribution of the nodes over the intervals of the number-of-children partition computed above:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1[ : 167543 nodes\n",
"[1, 2[ : 122345 nodes\n",
"[2, 3[ : 62102 nodes\n",
"[3, 4[ : 27518 nodes\n",
"[4, 6[ : 5387 nodes\n",
"[6, 26[ : 2874 nodes\n",
"[26, 78[ : 170 nodes\n"
]
}
],
"source": [
"partition = get_quantization(list_nb_children)\n",
"# Count every value once; the original rescanned the whole list for each integer\n",
"# of each interval and shadowed the builtin `id` with its loop variable.\n",
"nb_nodes_per_count = Counter(list_nb_children)\n",
"for lower, upper in zip(partition, partition[1:]):\n",
"    nb_nodes = sum(nb_nodes_per_count[nb_children] for nb_children in range(lower, upper))\n",
"    print(\"[{}, {}[ : {} nodes\".format(lower, upper, nb_nodes))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linear_values = np.arange(len(partition))\n",
"def stairvalue(value):\n",
"    \"\"\"Map a number of children to the index of its interval in `partition`.\n",
"\n",
"    A value equal to the largest boundary of `partition` is assigned to the\n",
"    interval just below it.\n",
"    \"\"\"\n",
"    level = linear_values[np.searchsorted(partition, value, side=\"right\") - 1]\n",
"    return level - 1 if value == max(partition) else level\n",
"\n",
"stairvalue(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge rare combinations of roles to their nearest neighbor based on the Jaccard similarity"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[((1, 18, 84, 85), 28441),\n",
" ((1, 18), 24241),\n",
" ((106,), 21252),\n",
" ((18,), 19002),\n",
" ((1, 6, 18), 18840),\n",
" ((1, 2, 18), 18688),\n",
" ((4, 18, 104), 16518),\n",
" ((1, 2, 18, 48, 84), 16461),\n",
" ((1, 18, 45, 47, 49, 84, 86), 15933),\n",
" ((18, 45, 84), 13943)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of nodes carrying each distinct combination of roles.\n",
"roles_counts = Counter(list_roles)\n",
"roles_counts.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_roles2merge(roles_counts, threshold=100):\n",
"    \"\"\"Return the role combinations that occur at most `threshold` times.\n",
"\n",
"    Args:\n",
"        roles_counts: mapping from a role combination to its number of occurrences.\n",
"        threshold: highest count for which a combination is still selected.\n",
"\n",
"    Returns:\n",
"        List of the selected role combinations.\n",
"    \"\"\"\n",
"    return [roles for roles in set(roles_counts) if roles_counts[roles] <= threshold]\n",
"\n",
"roles2merge = get_roles2merge(roles_counts)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_intersects(node_roles, bag_of_roles, excluded_roles=None):\n",
"    \"\"\"Count, for every candidate combination, the roles it shares with `node_roles`.\n",
"\n",
"    Args:\n",
"        node_roles: tuple of roles of the node looking for a neighbor.\n",
"        bag_of_roles: iterable of role combinations (tuples) to compare against.\n",
"        excluded_roles: combinations that may not be used as neighbors; defaults\n",
"            to the notebook-level `roles2merge` list.\n",
"\n",
"    Returns:\n",
"        Dict mapping each candidate that shares at least one role with\n",
"        `node_roles` to the number of its roles found in `node_roles`.\n",
"        `node_roles` itself and the excluded combinations are never keys.\n",
"    \"\"\"\n",
"    if excluded_roles is None:\n",
"        excluded_roles = roles2merge\n",
"    # Build the lookup sets once; the original concatenated a new list for every candidate.\n",
"    excluded = set(excluded_roles)\n",
"    own_roles = set(node_roles)\n",
"\n",
"    node_roles_intersect = {}\n",
"    for roles in set(bag_of_roles):\n",
"        if roles in excluded or roles == node_roles:\n",
"            continue\n",
"        nb_shared = sum(1 for role in roles if role in own_roles)\n",
"        if nb_shared:\n",
"            node_roles_intersect[roles] = nb_shared\n",
"    return node_roles_intersect\n",
"\n",
"def get_jaccard_similarities(node_roles, nearest_neighbor, val_inter):\n",
"    \"\"\"Score each candidate combination against `node_roles`.\n",
"\n",
"    The score is `val_inter` divided by the number of roles in `node_roles`\n",
"    plus the number of candidate roles absent from `node_roles`.\n",
"\n",
"    Args:\n",
"        node_roles: roles of the reference node.\n",
"        nearest_neighbor: iterable of candidate role combinations.\n",
"        val_inter: numerator applied to every candidate.\n",
"\n",
"    Returns:\n",
"        Dict mapping each candidate to its score.\n",
"    \"\"\"\n",
"    jac_similarities = {}\n",
"    for roles in nearest_neighbor:\n",
"        nb_extra_roles = sum(1 for role in roles if role not in node_roles)\n",
"        jac_similarities[roles] = val_inter / (len(node_roles) + nb_extra_roles)\n",
"    return jac_similarities"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(18, 45, 47, 49, 84, 86)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_nearest_neighbor(node_roles, bag_of_roles):\n",
"    \"\"\"Return the role combination closest to `node_roles`, or None.\n",
"\n",
"    Candidates come from `get_intersects`. Only those sharing the highest\n",
"    number of roles with `node_roles` are kept, and among them the one with\n",
"    the highest score from `get_jaccard_similarities` wins (first one on ties).\n",
"\n",
"    Args:\n",
"        node_roles: tuple of roles of the node looking for a neighbor.\n",
"        bag_of_roles: iterable of role combinations to search.\n",
"\n",
"    Returns:\n",
"        The selected combination as a sorted tuple, or None when no candidate\n",
"        shares a role with `node_roles`.\n",
"    \"\"\"\n",
"    node_roles_intersect = get_intersects(node_roles, bag_of_roles)\n",
"    if not node_roles_intersect:\n",
"        return None\n",
"\n",
"    max_inter = max(node_roles_intersect.values())\n",
"    # Built-in sorted() replaces the bare `sort` that only exists through `%pylab inline`.\n",
"    nearest_neighbors = [tuple(sorted(roles)) for roles, nb_shared in node_roles_intersect.items() if nb_shared == max_inter]\n",
"\n",
"    jac_similarities = get_jaccard_similarities(node_roles, nearest_neighbors, max_inter)\n",
"    # max() keeps the first key reaching the best score, in insertion order.\n",
"    return max(jac_similarities, key=jac_similarities.get)\n",
"\n",
"get_nearest_neighbor((1, 18, 45, 47, 49, 84, 86), roles_counts)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_filtered_features_per_uast(uasts):\n",
"    \"\"\"Build, for each UAST, its (roles, quantized number of children) features.\n",
"\n",
"    Role combinations listed in `roles2merge` are replaced by the result of\n",
"    `get_nearest_neighbor`; nodes for which it finds nothing are dropped.\n",
"    The number of children goes through `stairvalue`.\n",
"\n",
"    Args:\n",
"        uasts: iterable of root nodes exposing `roles` and `children`.\n",
"\n",
"    Returns:\n",
"        One list of features per UAST, in breadth-first node order.\n",
"    \"\"\"\n",
"    from collections import deque  # local import keeps this cell self-contained\n",
"\n",
"    rare_roles = set(roles2merge)      # O(1) membership instead of a list scan per node\n",
"    bag_of_roles = set(roles_counts)   # built once instead of once per rare node\n",
"    neighbor_cache = {}                # rare roles -> neighbor (or None), searched only once\n",
"\n",
"    filtered_features_per_uast = []\n",
"    for uast in uasts:\n",
"        uast_features = []\n",
"        queue = deque([uast])\n",
"        while queue:\n",
"            child = queue.popleft()\n",
"            queue.extend(child.children)\n",
"            roles = tuple(sorted(child.roles))\n",
"            if roles in rare_roles:\n",
"                if roles not in neighbor_cache:\n",
"                    neighbor_cache[roles] = get_nearest_neighbor(roles, bag_of_roles)\n",
"                roles = neighbor_cache[roles]\n",
"                if not roles:\n",
"                    continue  # rare combination without any neighbor: the node is dropped\n",
"            uast_features.append((roles, stairvalue(len(child.children))))\n",
"        filtered_features_per_uast.append(uast_features)\n",
"    return filtered_features_per_uast\n",
"\n",
"filtered_features_per_uast = get_filtered_features_per_uast(uasts)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filtered_features = [f for features_uast in filtered_features_per_uast for f in features_uast] "
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[((34, 57), 5),\n",
" ((18,), 1),\n",
" ((19, 41, 42), 1),\n",
" ((19, 41, 42), 1),\n",
" ((19, 41, 42), 1),\n",
" ((1, 41, 45, 47), 2),\n",
" ((1, 41, 45, 47), 2),\n",
" ((1, 41, 45, 47), 2),\n",
" ((1, 41, 45, 47), 2),\n",
" ((19, 60), 2)]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_features[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TF-IDF computation"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(((18,), 1), 40),\n",
" (((106,), 0), 34),\n",
" (((1, 18), 0), 30),\n",
" (((4, 18, 104), 2), 24),\n",
" (((18, 88, 98, 103), 0), 24),\n",
" (((1, 6, 18), 0), 23),\n",
" (((1, 18, 84, 85), 1), 21),\n",
" (((1, 2, 18, 48, 84), 0), 19),\n",
" (((18, 45, 47, 49, 84, 86, 88, 98, 103), 0), 18),\n",
" (((1, 18, 45, 47, 49, 84, 86), 0), 16)]"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Term frequencies: one Counter of feature occurrences per UAST.\n",
"features_TF = [Counter(filtered_features_per_uast[i]) for i in range(len(uasts))]\n",
"features_TF[0].most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# Document frequency: number of UASTs in which each feature occurs at least once.\n",
"# Counting over per-UAST sets avoids scanning every feature list once per distinct\n",
"# feature, and removes the bare `except:` that could hide unrelated errors.\n",
"document_counts = Counter()\n",
"for i in range(len(uasts)):\n",
"    document_counts.update(set(filtered_features_per_uast[i]))\n",
"features_DF = dict(document_counts)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Weight of a feature in a UAST: its term frequency divided by its document frequency.\n",
"# NOTE(review): this is a plain TF / DF ratio, not tf * log(N / df) -- confirm it is intended.\n",
"vocabulary = set(filtered_features)  # hoisted: the original rebuilt this set for every UAST\n",
"TFIDF_weights = []\n",
"for i in range(len(uasts)):\n",
"    TFIDF_weights.append({feature: features_TF[i][feature] / features_DF[feature] for feature in vocabulary})"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{((1, 2, 18), 0): 0.025,\n",
" ((1, 2, 18), 1): 0.0,\n",
" ((1, 2, 18, 48, 84), 0): 0.095,\n",
" ((1, 2, 18, 48, 84), 1): 0.0,\n",
" ((1, 4, 6, 18), 0): 0.0,\n",
" ((1, 4, 6, 18), 1): 0.0,\n",
" ((1, 4, 7, 18), 0): 0.0,\n",
" ((1, 4, 7, 18), 1): 0.0,\n",
" ((1, 6, 18), 0): 0.11616161616161616,\n",
" ((1, 6, 18), 1): 0.03,\n",
" ((1, 7, 18), 0): 0.010309278350515464,\n",
" ((1, 7, 18), 1): 0.0,\n",
" ((1, 18), 0): 0.15151515151515152,\n",
" ((1, 18), 1): 0.015,\n",
" ((1, 18, 19, 67, 69), 0): 0.0,\n",
" ((1, 18, 19, 67, 69), 2): 0.0,\n",
" ((1, 18, 41, 45, 47, 49), 0): 0.02040816326530612,\n",
" ((1, 18, 41, 45, 47, 49), 1): 0.025252525252525252,\n",
" ((1, 18, 45, 47, 49, 84, 86), 0): 0.08080808080808081,\n",
" ((1, 18, 45, 47, 49, 84, 86), 1): 0.0,\n",
" ((1, 18, 45, 84, 85), 0): 0.0,\n",
" ((1, 18, 45, 84, 85), 1): 0.00558659217877095,\n",
" ((1, 18, 45, 84, 85), 2): 0.011976047904191617,\n",
" ((1, 18, 45, 84, 85), 3): 0.01098901098901099,\n",
" ((1, 18, 48, 84), 1): 0.00510204081632653,\n",
" ((1, 18, 49, 50), 0): 0.0,\n",
" ((1, 18, 49, 50), 1): 0.0,\n",
" ((1, 18, 49, 50), 2): 0.0,\n",
" ((1, 18, 49, 50), 3): 0.0,\n",
" ((1, 18, 49, 84, 85, 86), 0): 0.0,\n",
" ((1, 18, 49, 84, 85, 86), 1): 0.008928571428571428,\n",
" ((1, 18, 49, 84, 86), 0): 0.013888888888888888,\n",
" ((1, 18, 49, 84, 86), 1): 0.015503875968992248,\n",
" ((1, 18, 50, 93), 0): 0.0,\n",
" ((1, 18, 50, 93), 1): 0.0,\n",
" ((1, 18, 60, 61), 0): 0.0,\n",
" ((1, 18, 60, 61), 1): 0.0,\n",
" ((1, 18, 61, 71), 0): 0.0,\n",
" ((1, 18, 67), 0): 0.011695906432748537,\n",
" ((1, 18, 67), 1): 0.0,\n",
" ((1, 18, 67, 69), 0): 0.011560693641618497,\n",
" ((1, 18, 67, 69), 1): 0.0,\n",
" ((1, 18, 84, 85), 0): 0.08080808080808081,\n",
" ((1, 18, 84, 85), 1): 0.105,\n",
" ((1, 19, 41, 100), 1): 0.0,\n",
" ((1, 19, 41, 100), 2): 0.0,\n",
" ((1, 41, 45, 47), 2): 0.025252525252525252,\n",
" ((1, 41, 45, 47), 3): 0.0,\n",
" ((1, 42, 43), 0): 0.015,\n",
" ((1, 42, 43), 1): 0.0,\n",
" ((1, 42, 44), 0): 0.0,\n",
" ((3, 4, 5, 18), 0): 0.0,\n",
" ((3, 4, 5, 18), 2): 0.00684931506849315,\n",
" ((3, 4, 5, 18, 60, 61), 2): 0.00510204081632653,\n",
" ((3, 4, 11, 15), 0): 0.005988023952095809,\n",
" ((3, 4, 11, 17), 0): 0.0,\n",
" ((3, 4, 11, 21), 0): 0.01020408163265306,\n",
" ((3, 4, 18, 35, 115), 0): 0.0,\n",
" ((3, 4, 18, 37, 115), 0): 0.0,\n",
" ((3, 4, 18, 38, 115), 0): 0.0,\n",
" ((3, 4, 18, 39, 49, 84, 86, 115), 0): 0.005649717514124294,\n",
" ((3, 4, 18, 39, 115), 0): 0.0,\n",
" ((3, 4, 19, 104), 3): 0.0,\n",
" ((3, 4, 20, 21, 116), 0): 0.005154639175257732,\n",
" ((3, 4, 20, 116), 0): 0.04081632653061224,\n",
" ((3, 4, 21, 26, 116), 0): 0.0,\n",
" ((3, 4, 21, 27, 116), 0): 0.0,\n",
" ((3, 4, 22, 116), 0): 0.0,\n",
" ((3, 4, 25, 116), 0): 0.0,\n",
" ((3, 4, 26, 116), 0): 0.0,\n",
" ((3, 4, 27, 116), 0): 0.026041666666666668,\n",
" ((3, 4, 35, 115), 0): 0.0,\n",
" ((4, 6, 18), 2): 0.0,\n",
" ((4, 6, 18), 3): 0.0,\n",
" ((4, 6, 18, 45, 84), 1): 0.0,\n",
" ((4, 6, 18, 45, 84), 2): 0.0,\n",
" ((4, 6, 18, 45, 84), 3): 0.0,\n",
" ((4, 6, 18, 49, 84, 86, 88, 98, 103), 0): 0.005649717514124294,\n",
" ((4, 6, 18, 49, 84, 86, 88, 98, 103), 1): 0.0,\n",
" ((4, 6, 18, 88, 95, 103), 0): 0.0,\n",
" ((4, 6, 18, 88, 95, 103), 3): 0.0,\n",
" ((4, 6, 18, 88, 95, 103), 4): 0.0,\n",
" ((4, 6, 18, 88, 98, 103), 0): 0.0,\n",
" ((4, 6, 18, 88, 98, 103), 1): 0.0,\n",
" ((4, 6, 18, 109), 2): 0.0,\n",
" ((4, 7, 18), 3): 0.0,\n",
" ((4, 7, 18, 45, 84), 1): 0.0,\n",
" ((4, 7, 18, 45, 84), 2): 0.0,\n",
" ((4, 7, 18, 45, 84), 3): 0.0,\n",
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 0): 0.0,\n",
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 1): 0.0,\n",
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 2): 0.008333333333333333,\n",
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 3): 0.0,\n",
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 4): 0.0,\n",
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 5): 0.0,\n",
" ((4, 7, 18, 88, 95, 103), 0): 0.0,\n",
" ((4, 7, 18, 88, 98, 103), 0): 0.0,\n",
" ((4, 7, 18, 88, 99, 103), 1): 0.0,\n",
" ((4, 7, 18, 88, 99, 103), 2): 0.0,\n",
" ((4, 7, 18, 88, 99, 103), 3): 0.0,\n",
" ((4, 7, 18, 88, 99, 103), 4): 0.0,\n",
" ((4, 7, 18, 88, 99, 103), 5): 0.0,\n",
" ((4, 18), 3): 0.005847953216374269,\n",
" ((4, 18, 45, 47, 49, 84, 86), 2): 0.0,\n",
" ((4, 18, 45, 47, 49, 84, 86), 3): 0.0,\n",
" ((4, 18, 49, 84, 86), 3): 0.005649717514124294,\n",
" ((4, 18, 50, 93), 3): 0.0,\n",
" ((4, 18, 60, 61), 3): 0.06565656565656566,\n",
" ((4, 18, 93, 102), 3): 0.0,\n",
" ((4, 18, 104), 2): 0.12,\n",
" ((4, 18, 104), 3): 0.0,\n",
" ((6, 18, 45, 84), 1): 0.0,\n",
" ((6, 18, 45, 84), 2): 0.005154639175257732,\n",
" ((6, 18, 45, 84), 3): 0.0,\n",
" ((6, 18, 88, 98, 103), 0): 0.044444444444444446,\n",
" ((6, 18, 88, 98, 103), 1): 0.0,\n",
" ((6, 18, 88, 99, 103), 1): 0.0,\n",
" ((6, 18, 88, 99, 103), 2): 0.005780346820809248,\n",
" ((6, 18, 88, 99, 103), 3): 0.0,\n",
" ((6, 18, 88, 99, 103), 4): 0.0,\n",
" ((6, 18, 88, 99, 103), 5): 0.0,\n",
" ((6, 18, 109), 2): 0.017142857142857144,\n",
" ((7, 18), 1): 0.0707070707070707,\n",
" ((7, 18), 2): 0.0,\n",
" ((7, 18), 3): 0.0,\n",
" ((7, 18, 45, 84), 1): 0.00510204081632653,\n",
" ((7, 18, 45, 84), 2): 0.025,\n",
" ((7, 18, 45, 84), 3): 0.015463917525773196,\n",
" ((7, 18, 45, 84), 4): 0.0,\n",
" ((7, 18, 45, 84), 5): 0.0,\n",
" ((7, 18, 67, 92), 2): 0.0,\n",
" ((7, 18, 88, 92, 103), 0): 0.005988023952095809,\n",
" ((7, 18, 88, 92, 103), 1): 0.0072992700729927005,\n",
" ((7, 18, 88, 92, 103), 2): 0.01639344262295082,\n",
" ((7, 18, 88, 92, 103), 3): 0.0,\n",
" ((7, 18, 88, 92, 103), 4): 0.0,\n",
" ((7, 18, 88, 92, 103), 5): 0.0,\n",
" ((7, 18, 88, 93, 103), 0): 0.012121212121212121,\n",
" ((7, 18, 88, 93, 103), 2): 0.0,\n",
" ((7, 18, 88, 93, 103), 4): 0.0,\n",
" ((7, 18, 88, 93, 103), 5): 0.0,\n",
" ((7, 18, 88, 93, 103), 6): 0.0,\n",
" ((7, 18, 88, 95, 103), 0): 0.0,\n",
" ((7, 18, 88, 98, 103), 0): 0.032520325203252036,\n",
" ((7, 18, 88, 99, 103), 2): 0.0,\n",
" ((7, 18, 88, 99, 103), 3): 0.012658227848101266,\n",
" ((7, 18, 88, 99, 103), 4): 0.0,\n",
" ((7, 18, 88, 99, 103), 5): 0.0,\n",
" ((7, 18, 109), 2): 0.015306122448979591,\n",
" ((11, 18, 60, 61, 109), 2): 0.0,\n",
" ((11, 18, 60, 61, 109), 3): 0.005988023952095809,\n",
" ((11, 18, 60, 61, 109), 4): 0.0,\n",
" ((18,), 1): 0.20202020202020202,\n",
" ((18,), 2): 0.0,\n",
" ((18,), 3): 0.0,\n",
" ((18, 19, 45, 67, 69, 84), 1): 0.0,\n",
" ((18, 19, 45, 67, 69, 84), 2): 0.0,\n",
" ((18, 45, 47, 49, 67, 84, 86, 92), 2): 0.0,\n",
" ((18, 45, 47, 49, 84, 86), 1): 0.006211180124223602,\n",
" ((18, 45, 47, 49, 84, 86), 2): 0.0,\n",
" ((18, 45, 47, 49, 84, 86), 3): 0.0,\n",
" ((18, 45, 47, 49, 84, 86), 4): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 0): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 1): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 2): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 3): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 4): 0.011363636363636364,\n",
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 5): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 95, 103), 0): 0.020202020202020204,\n",
" ((18, 45, 47, 49, 84, 86, 88, 95, 103), 5): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 98, 103), 0): 0.09090909090909091,\n",
" ((18, 45, 47, 49, 84, 86, 88, 99, 103), 1): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 99, 103), 2): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 88, 99, 103), 4): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 109), 1): 0.0,\n",
" ((18, 45, 47, 49, 84, 86, 109), 2): 0.03954802259887006,\n",
" ((18, 45, 48, 84), 1): 0.0,\n",
" ((18, 45, 48, 84), 2): 0.0,\n",
" ((18, 45, 48, 84), 3): 0.0,\n",
" ((18, 45, 48, 84), 4): 0.0,\n",
" ((18, 45, 49, 50, 84), 1): 0.0,\n",
" ((18, 45, 49, 84, 86), 2): 0.008771929824561403,\n",
" ((18, 45, 49, 84, 86), 3): 0.0,\n",
" ((18, 45, 50, 84, 93), 3): 0.0,\n",
" ((18, 45, 60, 61, 84), 2): 0.006944444444444444,\n",
" ((18, 45, 60, 61, 84), 3): 0.015873015873015872,\n",
" ((18, 45, 60, 61, 84), 4): 0.0,\n",
" ((18, 45, 67, 84), 1): 0.0,\n",
" ((18, 45, 67, 84), 2): 0.0,\n",
" ((18, 45, 67, 84), 3): 0.0,\n",
" ((18, 45, 67, 84), 4): 0.0,\n",
" ((18, 45, 84), 1): 0.01020408163265306,\n",
" ((18, 45, 84), 2): 0.0707070707070707,\n",
" ((18, 45, 84), 3): 0.04142011834319527,\n",
" ((18, 45, 84), 4): 0.005988023952095809,\n",
" ((18, 45, 84), 5): 0.0,\n",
" ((18, 48, 84, 88, 98, 103), 0): 0.006134969325153374,\n",
" ((18, 48, 84, 88, 98, 103), 1): 0.0,\n",
" ((18, 48, 84, 109), 2): 0.00847457627118644,\n",
" ((18, 49, 50, 88, 92, 103), 0): 0.0,\n",
" ((18, 49, 50, 88, 92, 103), 2): 0.0,\n",
" ((18, 49, 50, 88, 92, 103), 4): 0.0,\n",
" ((18, 49, 50, 88, 92, 103), 5): 0.0,\n",
" ((18, 49, 50, 88, 98, 103), 0): 0.0,\n",
" ((18, 49, 84, 86, 88, 98, 103), 0): 0.005917159763313609,\n",
" ((18, 49, 84, 86, 88, 98, 103), 1): 0.00909090909090909,\n",
" ((18, 50, 88, 92, 93, 103), 2): 0.0,\n",
" ((18, 50, 88, 92, 93, 103), 3): 0.0,\n",
" ((18, 50, 88, 93, 98, 103), 0): 0.0,\n",
" ((18, 50, 88, 93, 98, 103), 1): 0.0,\n",
" ((18, 50, 88, 93, 98, 103), 2): 0.0,\n",
" ((18, 67, 69, 88, 99, 103), 2): 0.0,\n",
" ((18, 67, 69, 88, 99, 103), 3): 0.0,\n",
" ((18, 67, 69, 88, 99, 103), 4): 0.0,\n",
" ((18, 67, 70, 109), 2): 0.0,\n",
" ((18, 67, 70, 109), 3): 0.0,\n",
" ((18, 88, 93, 98, 102, 103), 0): 0.0,\n",
" ((18, 88, 93, 98, 102, 103), 4): 0.0,\n",
" ((18, 88, 95, 103), 0): 0.03,\n",
" ((18, 88, 95, 103), 1): 0.0,\n",
" ((18, 88, 98, 103), 0): 0.12244897959183673,\n",
" ((18, 88, 98, 103), 1): 0.006060606060606061,\n",
" ((18, 88, 99, 103), 0): 0.0,\n",
" ((18, 88, 99, 103), 2): 0.0,\n",
" ((18, 88, 99, 103), 3): 0.0,\n",
" ((18, 88, 99, 103), 4): 0.0,\n",
" ((18, 88, 99, 103), 5): 0.0,\n",
" ((18, 109), 1): 0.075,\n",
" ((18, 109), 2): 0.005952380952380952,\n",
" ((18, 109), 5): 0.0,\n",
" ((19, 41, 42), 1): 0.015463917525773196,\n",
" ((19, 41, 42), 2): 0.0,\n",
" ((19, 41, 42), 3): 0.0,\n",
" ((19, 41, 42), 4): 0.0,\n",
" ((19, 60), 2): 0.05555555555555555,\n",
" ((19, 60), 3): 0.030612244897959183,\n",
" ((19, 67, 70), 3): 0.011560693641618497,\n",
" ((19, 67, 70), 4): 0.0,\n",
" ((19, 71), 2): 0.0,\n",
" ((19, 73), 0): 0.0,\n",
" ((19, 73), 1): 0.0,\n",
" ((19, 74), 0): 0.008771929824561403,\n",
" ((19, 74), 1): 0.0,\n",
" ((19, 78), 0): 0.0,\n",
" ((19, 78), 1): 0.015306122448979591,\n",
" ((19, 79, 80), 1): 0.0,\n",
" ((19, 79, 80), 2): 0.0,\n",
" ((19, 79, 80), 3): 0.0,\n",
" ((19, 79, 80), 4): 0.0,\n",
" ((19, 79, 80), 5): 0.0,\n",
" ((19, 79, 81), 3): 0.0,\n",
" ((19, 79, 81), 4): 0.0,\n",
" ((19, 79, 81), 5): 0.0,\n",
" ((19, 82), 0): 0.0,\n",
" ((19, 82), 1): 0.0,\n",
" ((19, 82), 2): 0.0,\n",
" ((19, 83), 1): 0.0,\n",
" ((19, 83), 2): 0.0,\n",
" ((19, 87), 0): 0.0,\n",
" ((19, 87), 1): 0.0,\n",
" ((34, 57), 4): 0.0,\n",
" ((34, 57), 5): 0.016129032258064516,\n",
" ((34, 57), 6): 0.0,\n",
" ((41, 45, 46), 1): 0.006944444444444444,\n",
" ((41, 45, 46), 2): 0.0,\n",
" ((41, 45, 46), 3): 0.0,\n",
" ((41, 45, 46), 4): 0.0,\n",
" ((41, 45, 46), 5): 0.020618556701030927,\n",
" ((41, 45, 46), 6): 0.0,\n",
" ((41, 45, 49, 50, 109), 1): 0.0,\n",
" ((41, 45, 49, 50, 109), 2): 0.0,\n",
" ((41, 45, 49, 50, 109), 3): 0.0,\n",
" ((41, 45, 49, 109), 0): 0.0,\n",
" ((41, 45, 49, 109), 1): 0.01020408163265306,\n",
" ((41, 45, 49, 109), 2): 0.010638297872340425,\n",
" ((41, 45, 49, 109), 3): 0.005988023952095809,\n",
" ((41, 45, 49, 109), 4): 0.0,\n",
" ((41, 45, 49, 109), 5): 0.0,\n",
" ((41, 45, 84, 109), 1): 0.0,\n",
" ((41, 46, 100), 1): 0.0,\n",
" ((41, 46, 100), 3): 0.0,\n",
" ((41, 46, 100), 4): 0.0,\n",
" ((41, 46, 100), 5): 0.0,\n",
" ((41, 46, 100), 6): 0.0,\n",
" ((41, 52, 100), 1): 0.0,\n",
" ((45, 47, 49, 84), 1): 0.0,\n",
" ((46, 60, 62), 1): 0.050505050505050504,\n",
" ((46, 60, 62), 2): 0.030303030303030304,\n",
" ((46, 60, 62), 3): 0.0,\n",
" ((46, 60, 62), 4): 0.005208333333333333,\n",
" ((46, 60, 62), 5): 0.0,\n",
" ((46, 60, 63), 1): 0.02040816326530612,\n",
" ((46, 60, 63), 2): 0.005917159763313609,\n",
" ((46, 60, 63), 3): 0.0,\n",
" ((46, 60, 63), 4): 0.01098901098901099,\n",
" ((46, 60, 63), 5): 0.0,\n",
" ((46, 67), 1): 0.005847953216374269,\n",
" ((46, 67), 2): 0.0,\n",
" ((46, 67), 3): 0.0,\n",
" ((46, 67), 4): 0.006944444444444444,\n",
" ((46, 67), 5): 0.0,\n",
" ((46, 71), 1): 0.0,\n",
" ((46, 71), 2): 0.0,\n",
" ((46, 71), 3): 0.0,\n",
" ((46, 71), 4): 0.0,\n",
" ((46, 71), 5): 0.0,\n",
" ((49, 84, 86, 108), 1): 0.009259259259259259,\n",
" ((49, 84, 86, 108), 2): 0.0,\n",
" ((106,), 0): 0.17,\n",
" ((108,), 1): 0.035,\n",
" ((108,), 2): 0.02857142857142857,\n",
" ((108,), 3): 0.0,\n",
" ((108,), 4): 0.0,\n",
" ((108,), 5): 0.005050505050505051}"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TFIDF_weights[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Internal types extraction"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_internal_types(node):\n",
"    \"\"\"Collect the internal type of every node, breadth-first.\n",
"\n",
"    Args:\n",
"        node: a single root node, or an iterable of root nodes. Nodes expose\n",
"            `internal_type` and `children`.\n",
"\n",
"    Returns:\n",
"        List of internal types, one per visited node.\n",
"    \"\"\"\n",
"    from collections import deque  # local import keeps this cell self-contained\n",
"\n",
"    # The argument is now honoured; the original ignored it and read the global `uasts`.\n",
"    roots = [node] if hasattr(node, \"children\") else node\n",
"    bag_of_internal_types = []\n",
"    for uast in roots:\n",
"        queue = deque([uast])\n",
"        while queue:\n",
"            child = queue.popleft()\n",
"            queue.extend(child.children)\n",
"            bag_of_internal_types.append(child.internal_type)\n",
"    return bag_of_internal_types"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Name', 106009),\n",
" ('Attribute', 40977),\n",
" ('Call', 28867),\n",
" ('Str', 26104),\n",
" ('NoopLine', 21135),\n",
" ('Assign', 16518),\n",
" ('PreviousNoops', 13493),\n",
" ('Expr', 13400),\n",
" ('If.body', 8257),\n",
" ('If', 8257)]"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bag_of_internal_types = get_internal_types(uasts)\n",
"internal_types_counts = Counter(bag_of_internal_types)\n",
"internal_types_counts.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment