-
-
Save Melevir/5754a1b553eb11839238e43734d0eb79 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ast | |
import os | |
import collections | |
from nltk import pos_tag | |
def flat(_list): | |
""" [(1,2), (3,4)] -> [1, 2, 3, 4]""" | |
return sum([list(item) for item in _list], []) | |
def is_verb(word): | |
if not word: | |
return False | |
pos_info = pos_tag([word]) | |
return pos_info[0][1] == 'VB' | |
Path = '' | |
def get_trees(_path, with_filenames=False, with_file_content=False): | |
filenames = [] | |
trees = [] | |
path= Path | |
for dirname, dirs, files in os.walk(path, topdown=True): | |
for file in files: | |
if file.endswith('.py'): | |
filenames.append(os.path.join(dirname, file)) | |
if len(filenames) == 100: | |
break | |
print('total %s files' % len(filenames)) | |
for filename in filenames: | |
with open(filename, 'r', encoding='utf-8') as attempt_handler: | |
main_file_content = attempt_handler.read() | |
try: | |
tree = ast.parse(main_file_content) | |
except SyntaxError as e: | |
print(e) | |
tree = None | |
if with_filenames: | |
if with_file_content: | |
trees.append((filename, main_file_content, tree)) | |
else: | |
trees.append((filename, tree)) | |
else: | |
trees.append(tree) | |
print('trees generated') | |
return trees | |
def get_all_names(tree): | |
return [node.id for node in ast.walk(tree) if isinstance(node, ast.Name)] | |
def get_verbs_from_function_name(function_name): | |
return [word for word in function_name.split('_') if is_verb(word)] | |
def get_all_words_in_path(path): | |
trees = [t for t in get_trees(path) if t] | |
function_names = [f for f in flat([get_all_names(t) for t in trees]) if not (f.startswith('__') and f.endswith('__'))] | |
def split_snake_case_name_to_words(name): | |
return [n for n in name.split('_') if n] | |
return flat([split_snake_case_name_to_words(function_name) for function_name in function_names]) | |
def get_top_verbs_in_path(path, top_size=10): | |
global Path | |
Path = path | |
trees = [t for t in get_trees(None) if t] | |
fncs = [f for f in flat([[node.name.lower() for node in ast.walk(t) if isinstance(node, ast.FunctionDef)] for t in trees]) if not (f.startswith('__') and f.endswith('__'))] | |
print('functions extracted') | |
verbs = flat([get_verbs_from_function_name(function_name) for function_name in fncs]) | |
return collections.Counter(verbs).most_common(top_size) | |
def get_top_functions_names_in_path(path, top_size=10): | |
t = get_trees(path) | |
nms = [f for f in flat([[node.name.lower() for node in ast.walk(t) if isinstance(node, ast.FunctionDef)] for t in t]) if not (f.startswith('__') and f.endswith('__'))] | |
return collections.Counter(nms).most_common(top_size) | |
wds = [] | |
projects = [ | |
'django', | |
'flask', | |
'pyramid', | |
'reddit', | |
'requests', | |
'sqlalchemy', | |
] | |
for project in projects: | |
path = os.path.join('.', project) | |
wds += get_top_verbs_in_path(path) | |
top_size = 200 | |
print('total %s words, %s unique' % (len(wds), len(set(wds)))) | |
for word, occurence in collections.Counter(wds).most_common(top_size): | |
print(word, occurence) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment