immuntasir/Finding the Functions.ipynb

## Finding the Functions.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import re\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder that holds one sub-directory per cloned repository\n",
    "REPO_DIR_PARENT = \"../../data/package_popularity/numpy/clones/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_lines_from_file(filename):\n",
    "    \"\"\"Return the lines of `filename` with trailing whitespace stripped.\n",
    "\n",
    "    Leading whitespace is kept, so indentation survives; the newline and\n",
    "    any other right-hand whitespace on each line is removed.\n",
    "    \"\"\"\n",
    "    with open(filename, \"r\") as handle:\n",
    "        return [raw_line.rstrip() for raw_line in handle]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This function will return the (shorthand, full_form) pairs\n",
    "def parse_import_statement(statement):\n",
    "    \"\"\"Parse a single-name import line into a (shorthand, full_form) pair.\n",
    "\n",
    "    Recognised forms, each optionally followed by an inline `#` comment:\n",
    "        import X              -> (None, 'X')\n",
    "        import X as Y         -> ('Y', 'X')\n",
    "        from X import Z       -> (None, 'X.Z')\n",
    "        from X import Z as Y  -> ('Y', 'X.Z')\n",
    "\n",
    "    Anything else (multi-name or parenthesised imports, or ordinary code\n",
    "    that merely contains the word 'import') returns (None, None).\n",
    "    \"\"\"\n",
    "    # Drop an inline comment so that `import numpy as np  # noqa` still\n",
    "    # parses and the comment text never leaks into the shorthand\n",
    "    statement = statement.split('#', 1)[0].strip()\n",
    "\n",
    "    shorthand = None\n",
    "    if \" as \" in statement:\n",
    "        statement, _, alias = statement.partition(\" as \")\n",
    "        alias = alias.strip()\n",
    "        # A usable shorthand is exactly one identifier; `b as c, d as e`\n",
    "        # is a multi-name import and cannot be described by a single pair\n",
    "        if not alias.isidentifier():\n",
    "            return None, None\n",
    "        shorthand = alias\n",
    "\n",
    "    words = statement.split()\n",
    "\n",
    "    if len(words) == 4 and words[0] == 'from' and words[2] == 'import':\n",
    "        return shorthand, '.'.join([words[1], words[3]])\n",
    "    if len(words) == 2 and words[0] == 'import':\n",
    "        return shorthand, words[1]\n",
    "    return None, None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_imported_instances(lines):\n",
    "    \"\"\"Collect the numpy imports of a file, keyed by the name the code uses.\n",
    "\n",
    "    For every single-name numpy import found in `lines`, the returned dict\n",
    "    maps the name that import makes usable in the file to its full dotted\n",
    "    numpy path:\n",
    "        import numpy as np            -> {'np': 'numpy'}\n",
    "        import numpy.random           -> {'numpy.random': 'numpy.random'}\n",
    "        from numpy import abc         -> {'abc': 'numpy.abc'}\n",
    "        from numpy import abc as xyz  -> {'xyz': 'numpy.abc'}\n",
    "    So, when we encounter `xyz` in the code, we know it means numpy.abc.\n",
    "    \"\"\"\n",
    "    ret_dict = dict()\n",
    "\n",
    "    for line in lines:\n",
    "        if \"import\" not in line:\n",
    "            continue\n",
    "\n",
    "        shorthand, inst = parse_import_statement(line)\n",
    "        # Keep only imports rooted at the numpy package\n",
    "        if inst is None or inst.split('.')[0] != 'numpy' or '(' in inst:\n",
    "            continue\n",
    "\n",
    "        if shorthand is None:\n",
    "            if line.split()[0] == 'from':\n",
    "                # `from numpy import abc` binds the bare name `abc`, not\n",
    "                # `numpy.abc`, so that is what calls in the file will use\n",
    "                shorthand = inst.rsplit('.', 1)[-1]\n",
    "            else:\n",
    "                # `import numpy.random` is used through its full dotted path\n",
    "                shorthand = inst\n",
    "        ret_dict[shorthand] = inst\n",
    "\n",
    "    return ret_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maps a fully-qualified call such as 'numpy.array()' to the number of\n",
    "# times it was seen across all scanned files\n",
    "function_count = defaultdict(int)\n",
    "\n",
    "for subdir in os.listdir(REPO_DIR_PARENT):\n",
    "    # I am excluding the numpy codebase from my search\n",
    "    if subdir == 'numpy_numpy':\n",
    "        continue\n",
    "\n",
    "    repo_dir = os.path.join(REPO_DIR_PARENT, subdir)\n",
    "    if not os.path.isdir(repo_dir):\n",
    "        # Stray files beside the clones (e.g. .DS_Store) are not repositories\n",
    "        continue\n",
    "\n",
    "    for entry in os.listdir(repo_dir):\n",
    "        filename = os.path.join(repo_dir, entry)\n",
    "        if not os.path.isfile(filename):\n",
    "            continue\n",
    "\n",
    "        all_lines = get_lines_from_file(filename)\n",
    "        instances = get_imported_instances(all_lines)\n",
    "\n",
    "        # Build one compiled pattern per shorthand, once per file.\n",
    "        #   (?<![\\w.])    the shorthand must start a name: `np` must not match\n",
    "        #                 inside `inputs` or as an attribute in `obj.np`\n",
    "        #   re.escape     the dots in e.g. `numpy.random` are literal dots\n",
    "        #   ((?:\\.\\w+)*)  optional attribute chain such as `.random.rand`\n",
    "        #   \\s*\\(         the opening parenthesis of the call itself\n",
    "        call_patterns = [\n",
    "            (full_form, re.compile(r'(?<![\\w.])' + re.escape(shorthand) + r'((?:\\.\\w+)*)\\s*\\('))\n",
    "            for shorthand, full_form in instances.items()\n",
    "        ]\n",
    "\n",
    "        for line in all_lines:\n",
    "            # End position of every call already counted on this line, so a\n",
    "            # call reachable through two imports is only counted once\n",
    "            counted_calls = set()\n",
    "            for full_form, pattern in call_patterns:\n",
    "                # finditer counts every call on the line, not just the first\n",
    "                for match in pattern.finditer(line):\n",
    "                    if match.end() in counted_calls:\n",
    "                        continue\n",
    "                    counted_calls.add(match.end())\n",
    "                    function_count[full_form + match.group(1) + '()'] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>numpy.array()</th>\n",
       "      <td>10114</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.arange()</th>\n",
       "      <td>4889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.zeros()</th>\n",
       "      <td>3280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.ones()</th>\n",
       "      <td>1966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.testing.assert_array_equal()</th>\n",
       "      <td>1759</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.dtype()</th>\n",
       "      <td>1468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.random.uniform()</th>\n",
       "      <td>1246</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.asarray()</th>\n",
       "      <td>1230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.empty()</th>\n",
       "      <td>1226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.testing.assert_equal()</th>\n",
       "      <td>1209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.linspace()</th>\n",
       "      <td>1149</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.all()</th>\n",
       "      <td>1102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.sum()</th>\n",
       "      <td>1096</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.random.randint()</th>\n",
       "      <td>1069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.random.rand()</th>\n",
       "      <td>1063</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.allclose()</th>\n",
       "      <td>1038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.random.random()</th>\n",
       "      <td>939</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.testing.assert_almost_equal()</th>\n",
       "      <td>900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.dot()</th>\n",
       "      <td>868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numpy.testing.assert_allclose()</th>\n",
       "      <td>857</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Count\n",
       "numpy.array()                        10114\n",
       "numpy.arange()                        4889\n",
       "numpy.zeros()                         3280\n",
       "numpy.ones()                          1966\n",
       "numpy.testing.assert_array_equal()    1759\n",
       "numpy.dtype()                         1468\n",
       "numpy.random.uniform()                1246\n",
       "numpy.asarray()                       1230\n",
       "numpy.empty()                         1226\n",
       "numpy.testing.assert_equal()          1209\n",
       "numpy.linspace()                      1149\n",
       "numpy.all()                           1102\n",
       "numpy.sum()                           1096\n",
       "numpy.random.randint()                1069\n",
       "numpy.random.rand()                   1063\n",
       "numpy.allclose()                      1038\n",
       "numpy.random.random()                  939\n",
       "numpy.testing.assert_almost_equal()    900\n",
       "numpy.dot()                            868\n",
       "numpy.testing.assert_allclose()        857"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the 20 most frequently called functions, highest count first\n",
    "count_table = pd.DataFrame(function_count, index=['Count']).T\n",
    "count_table.sort_values(by='Count', ascending=False).head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import os\n",
	"import re\n",
	"from collections import defaultdict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Root folder that holds one sub-directory per cloned repository\n",
	"REPO_DIR_PARENT = \"../../data/package_popularity/numpy/clones/\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_lines_from_file(filename):\n",
	"    \"\"\"Return the lines of `filename` with trailing whitespace stripped.\n",
	"\n",
	"    Leading whitespace is kept, so indentation survives; the newline and\n",
	"    any other right-hand whitespace on each line is removed.\n",
	"    \"\"\"\n",
	"    with open(filename, \"r\") as handle:\n",
	"        return [raw_line.rstrip() for raw_line in handle]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# This function will return the (shorthand, full_form) pairs\n",
	"def parse_import_statement(statement):\n",
	"    \"\"\"Parse a single-name import line into a (shorthand, full_form) pair.\n",
	"\n",
	"    Recognised forms, each optionally followed by an inline `#` comment:\n",
	"        import X              -> (None, 'X')\n",
	"        import X as Y         -> ('Y', 'X')\n",
	"        from X import Z       -> (None, 'X.Z')\n",
	"        from X import Z as Y  -> ('Y', 'X.Z')\n",
	"\n",
	"    Anything else (multi-name or parenthesised imports, or ordinary code\n",
	"    that merely contains the word 'import') returns (None, None).\n",
	"    \"\"\"\n",
	"    # Drop an inline comment so that `import numpy as np  # noqa` still\n",
	"    # parses and the comment text never leaks into the shorthand\n",
	"    statement = statement.split('#', 1)[0].strip()\n",
	"\n",
	"    shorthand = None\n",
	"    if \" as \" in statement:\n",
	"        statement, _, alias = statement.partition(\" as \")\n",
	"        alias = alias.strip()\n",
	"        # A usable shorthand is exactly one identifier; `b as c, d as e`\n",
	"        # is a multi-name import and cannot be described by a single pair\n",
	"        if not alias.isidentifier():\n",
	"            return None, None\n",
	"        shorthand = alias\n",
	"\n",
	"    words = statement.split()\n",
	"\n",
	"    if len(words) == 4 and words[0] == 'from' and words[2] == 'import':\n",
	"        return shorthand, '.'.join([words[1], words[3]])\n",
	"    if len(words) == 2 and words[0] == 'import':\n",
	"        return shorthand, words[1]\n",
	"    return None, None"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_imported_instances(lines):\n",
	"    \"\"\"Collect the numpy imports of a file, keyed by the name the code uses.\n",
	"\n",
	"    For every single-name numpy import found in `lines`, the returned dict\n",
	"    maps the name that import makes usable in the file to its full dotted\n",
	"    numpy path:\n",
	"        import numpy as np            -> {'np': 'numpy'}\n",
	"        import numpy.random           -> {'numpy.random': 'numpy.random'}\n",
	"        from numpy import abc         -> {'abc': 'numpy.abc'}\n",
	"        from numpy import abc as xyz  -> {'xyz': 'numpy.abc'}\n",
	"    So, when we encounter `xyz` in the code, we know it means numpy.abc.\n",
	"    \"\"\"\n",
	"    ret_dict = dict()\n",
	"\n",
	"    for line in lines:\n",
	"        if \"import\" not in line:\n",
	"            continue\n",
	"\n",
	"        shorthand, inst = parse_import_statement(line)\n",
	"        # Keep only imports rooted at the numpy package\n",
	"        if inst is None or inst.split('.')[0] != 'numpy' or '(' in inst:\n",
	"            continue\n",
	"\n",
	"        if shorthand is None:\n",
	"            if line.split()[0] == 'from':\n",
	"                # `from numpy import abc` binds the bare name `abc`, not\n",
	"                # `numpy.abc`, so that is what calls in the file will use\n",
	"                shorthand = inst.rsplit('.', 1)[-1]\n",
	"            else:\n",
	"                # `import numpy.random` is used through its full dotted path\n",
	"                shorthand = inst\n",
	"        ret_dict[shorthand] = inst\n",
	"\n",
	"    return ret_dict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Maps a fully-qualified call such as 'numpy.array()' to the number of\n",
	"# times it was seen across all scanned files\n",
	"function_count = defaultdict(int)\n",
	"\n",
	"for subdir in os.listdir(REPO_DIR_PARENT):\n",
	"    # I am excluding the numpy codebase from my search\n",
	"    if subdir == 'numpy_numpy':\n",
	"        continue\n",
	"\n",
	"    repo_dir = os.path.join(REPO_DIR_PARENT, subdir)\n",
	"    if not os.path.isdir(repo_dir):\n",
	"        # Stray files beside the clones (e.g. .DS_Store) are not repositories\n",
	"        continue\n",
	"\n",
	"    for entry in os.listdir(repo_dir):\n",
	"        filename = os.path.join(repo_dir, entry)\n",
	"        if not os.path.isfile(filename):\n",
	"            continue\n",
	"\n",
	"        all_lines = get_lines_from_file(filename)\n",
	"        instances = get_imported_instances(all_lines)\n",
	"\n",
	"        # Build one compiled pattern per shorthand, once per file.\n",
	"        #   (?<![\\w.])    the shorthand must start a name: `np` must not match\n",
	"        #                 inside `inputs` or as an attribute in `obj.np`\n",
	"        #   re.escape     the dots in e.g. `numpy.random` are literal dots\n",
	"        #   ((?:\\.\\w+)*)  optional attribute chain such as `.random.rand`\n",
	"        #   \\s*\\(         the opening parenthesis of the call itself\n",
	"        call_patterns = [\n",
	"            (full_form, re.compile(r'(?<![\\w.])' + re.escape(shorthand) + r'((?:\\.\\w+)*)\\s*\\('))\n",
	"            for shorthand, full_form in instances.items()\n",
	"        ]\n",
	"\n",
	"        for line in all_lines:\n",
	"            # End position of every call already counted on this line, so a\n",
	"            # call reachable through two imports is only counted once\n",
	"            counted_calls = set()\n",
	"            for full_form, pattern in call_patterns:\n",
	"                # finditer counts every call on the line, not just the first\n",
	"                for match in pattern.finditer(line):\n",
	"                    if match.end() in counted_calls:\n",
	"                        continue\n",
	"                    counted_calls.add(match.end())\n",
	"                    function_count[full_form + match.group(1) + '()'] += 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Count</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>numpy.array()</th>\n",
	" <td>10114</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.arange()</th>\n",
	" <td>4889</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.zeros()</th>\n",
	" <td>3280</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.ones()</th>\n",
	" <td>1966</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.testing.assert_array_equal()</th>\n",
	" <td>1759</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.dtype()</th>\n",
	" <td>1468</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.random.uniform()</th>\n",
	" <td>1246</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.asarray()</th>\n",
	" <td>1230</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.empty()</th>\n",
	" <td>1226</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.testing.assert_equal()</th>\n",
	" <td>1209</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.linspace()</th>\n",
	" <td>1149</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.all()</th>\n",
	" <td>1102</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.sum()</th>\n",
	" <td>1096</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.random.randint()</th>\n",
	" <td>1069</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.random.rand()</th>\n",
	" <td>1063</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.allclose()</th>\n",
	" <td>1038</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.random.random()</th>\n",
	" <td>939</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.testing.assert_almost_equal()</th>\n",
	" <td>900</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.dot()</th>\n",
	" <td>868</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>numpy.testing.assert_allclose()</th>\n",
	" <td>857</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Count\n",
	"numpy.array() 10114\n",
	"numpy.arange() 4889\n",
	"numpy.zeros() 3280\n",
	"numpy.ones() 1966\n",
	"numpy.testing.assert_array_equal() 1759\n",
	"numpy.dtype() 1468\n",
	"numpy.random.uniform() 1246\n",
	"numpy.asarray() 1230\n",
	"numpy.empty() 1226\n",
	"numpy.testing.assert_equal() 1209\n",
	"numpy.linspace() 1149\n",
	"numpy.all() 1102\n",
	"numpy.sum() 1096\n",
	"numpy.random.randint() 1069\n",
	"numpy.random.rand() 1063\n",
	"numpy.allclose() 1038\n",
	"numpy.random.random() 939\n",
	"numpy.testing.assert_almost_equal() 900\n",
	"numpy.dot() 868\n",
	"numpy.testing.assert_allclose() 857"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Show the 20 most frequently called functions, highest count first\n",
	"count_table = pd.DataFrame(function_count, index=['Count']).T\n",
	"count_table.sort_values(by='Count', ascending=False).head(20)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}