Skip to content

Instantly share code, notes, and snippets.

@immuntasir
Created October 3, 2020 14:04
Show Gist options
  • Save immuntasir/15aa310d3e5245aae2e98b17ba2face1 to your computer and use it in GitHub Desktop.
Save immuntasir/15aa310d3e5245aae2e98b17ba2face1 to your computer and use it in GitHub Desktop.
Finding the most frequently used functions
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import re\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"REPO_DIR_PARENT = '../../data/package_popularity/numpy/clones/'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_lines_from_file (filename):\n",
" lines = []\n",
" with open(filename, \"r\") as f:\n",
" for line in f:\n",
" lines.append(line.rstrip())\n",
" return lines"
]
},
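{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Parse a single import statement and return one (shorthand, full_form) pair.\n",
"# shorthand is None when the statement has no `as` alias; both values are None\n",
"# when the statement is not in one of the two recognised import forms.\n",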
"def parse_import_statement (statement):\n",
" statement = statement.rstrip()\n",
" if \" as \" in statement:\n",
" splitted = statement.split(\" as \")\n",
" statement = splitted[0].rstrip()\n",
" shorthand = splitted[1].rstrip()\n",
" else: \n",
" shorthand = None\n",
" \n",
" words = statement.split()\n",
" \n",
" if len(words) == 4 and words[0] == 'from' and words[2] == 'import':\n",
" return shorthand, '.'.join([words[1], words[3]])\n",
" elif len(words) == 2 and words[0] == 'import':\n",
" return shorthand, words[1]\n",
" else:\n",
" return None, None"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def get_imported_instances (lines):\n",
" # This dictionary maps each alias (or, when there is no alias, the full dotted name)\n",
" # to the full numpy path it refers to.\n",
" # For a statement such as -> from numpy import abc as xyz\n",
" # we add the key-value pair ret_dict['xyz'] = 'numpy.abc'\n",
" # So, when we encounter xyz in the code, we know that it means numpy.abc\n",
" ret_dict = dict()\n",
" \n",
" for line in lines:\n",
" if \"import\" in line:\n",
" shorthand, inst = parse_import_statement(line)\n",
" if inst != None and inst.split('.')[0] == 'numpy' and '(' not in inst:\n",
" if shorthand == None:\n",
" shorthand = inst\n",
" ret_dict[shorthand] = inst\n",
" \n",
" return ret_dict"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"function_count = defaultdict(lambda: 0)\n",
"\n",
"for subdir in os.listdir(REPO_DIR_PARENT):\n",
" # I am excluding the numpy codebase from my search\n",
" if subdir == 'numpy_numpy':\n",
" continue\n",
" \n",
" for file in os.listdir(os.path.join(REPO_DIR_PARENT, subdir)):\n",
" filename = os.path.join(REPO_DIR_PARENT, subdir, file)\n",
" all_lines = get_lines_from_file(filename)\n",
" instances = get_imported_instances(all_lines)\n",
" \n",
" for line in all_lines:\n",
" for shorthand in instances.keys():\n",
" search_str = shorthand + '.*?\\('\n",
" full_form = instances[shorthand]\n",
" \n",
" regex_res = re.search(search_str, line)\n",
"\n",
" if regex_res is not None:\n",
" function_full_form = (regex_res[0] + ')').replace(shorthand, full_form)\n",
" function_count[function_full_form] += 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>numpy.array()</th>\n",
" <td>10114</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.arange()</th>\n",
" <td>4889</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.zeros()</th>\n",
" <td>3280</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.ones()</th>\n",
" <td>1966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.testing.assert_array_equal()</th>\n",
" <td>1759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.dtype()</th>\n",
" <td>1468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.random.uniform()</th>\n",
" <td>1246</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.asarray()</th>\n",
" <td>1230</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.empty()</th>\n",
" <td>1226</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.testing.assert_equal()</th>\n",
" <td>1209</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.linspace()</th>\n",
" <td>1149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.all()</th>\n",
" <td>1102</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.sum()</th>\n",
" <td>1096</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.random.randint()</th>\n",
" <td>1069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.random.rand()</th>\n",
" <td>1063</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.allclose()</th>\n",
" <td>1038</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.random.random()</th>\n",
" <td>939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.testing.assert_almost_equal()</th>\n",
" <td>900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.dot()</th>\n",
" <td>868</td>\n",
" </tr>\n",
" <tr>\n",
" <th>numpy.testing.assert_allclose()</th>\n",
" <td>857</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Count\n",
"numpy.array() 10114\n",
"numpy.arange() 4889\n",
"numpy.zeros() 3280\n",
"numpy.ones() 1966\n",
"numpy.testing.assert_array_equal() 1759\n",
"numpy.dtype() 1468\n",
"numpy.random.uniform() 1246\n",
"numpy.asarray() 1230\n",
"numpy.empty() 1226\n",
"numpy.testing.assert_equal() 1209\n",
"numpy.linspace() 1149\n",
"numpy.all() 1102\n",
"numpy.sum() 1096\n",
"numpy.random.randint() 1069\n",
"numpy.random.rand() 1063\n",
"numpy.allclose() 1038\n",
"numpy.random.random() 939\n",
"numpy.testing.assert_almost_equal() 900\n",
"numpy.dot() 868\n",
"numpy.testing.assert_allclose() 857"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the 20 functions with the highest count\n",
"pd.DataFrame(function_count, index=['Count']).transpose().sort_values(by='Count', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment