Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Uses the Python AST module to build a dict/list representation of only modules, functions, and classes, and then pulls keywords from their docstrings
import ast
import sys
import pprint
import re
# Human-readable labels for the AST node types recorded by parse_tree().
type_lookup = {
    ast.Module: 'Module',
    ast.FunctionDef: 'Function',
    ast.ClassDef: 'Class',
}

# Matches runs of non-word characters and underscores; tokenize() uses it to
# strip punctuation from a term.  Written as a raw string so that "\W" reaches
# the regex engine instead of being parsed as an (invalid) string escape.
pattern = re.compile(r'[\W_]+')

# Terms ignored when extracting keywords.  The empty string is listed so that
# a term consisting only of punctuation (empty after stripping) is dropped.
stopwords = [
    'the', 'of', 'and', 'to', 'in', 'be', 'will', 'for', 'on', 'is',
    'with', 'by', 'as', 'this', 'are', 'from', 'that', 'or',
    'at', 'been', 'an', 'was', 'were', 'have', 'has', 'it', '',
]
def parse_tree(node):
    """
    Recursively describe ``node`` and the definitions directly in its body.

    Returns a dict with the keys:

    * ``type``     -- label looked up in ``type_lookup`` for ``type(node)``
    * ``name``     -- ``node.name`` when the node has one, otherwise the
                      base name (without extension) of this script's file
    * ``doc``      -- the result of ``ast.get_docstring(node)``
    * ``children`` -- list of dicts of the same shape, one for each module,
                      function or class node found in ``node.body``

    Only statements that sit directly in ``node.body`` are considered; every
    other kind of statement is skipped.

    Raises ``KeyError`` when ``type(node)`` is not a key of ``type_lookup``.
    """
    import os.path

    if hasattr(node, 'name'):
        name = node.name
    else:
        # Nodes without a name (ast.Module) fall back to a file-derived name.
        # basename + splitext keeps directory components and dots elsewhere
        # in the path from leaking into (or truncating) the name.
        # NOTE(review): this is the name of the running script, not of the
        # file that was parsed -- confirm that this is what is wanted.
        name = os.path.splitext(os.path.basename(__file__))[0]

    tree = dict(
        type=type_lookup[type(node)],
        name=name,
        doc=ast.get_docstring(node),
        children=[],
    )

    for child in node.body:
        if not isinstance(child, (ast.Module, ast.FunctionDef, ast.ClassDef)):
            continue
        tree['children'].append(parse_tree(child))

    return tree
def tokenize(text):
    """
    Take a string and tokenize it into terms.

    The text is lower-cased and split on whitespace; each piece then has the
    characters matched by ``pattern`` removed.  Pieces that end up empty or
    that appear in ``stopwords`` are discarded.  Terms are returned in their
    original order and may repeat.

    Returns an empty list when ``text`` is empty or ``None``.
    """
    output = []
    if not text:
        return output

    # split() without a separator breaks on any run of whitespace, so words
    # separated by a newline or tab in a multi-line docstring are not fused
    # into one term once the punctuation is stripped.
    for term in text.lower().split():
        term = pattern.sub('', term)
        if not term or term in stopwords:
            continue
        output.append(term)

    return output
def get_terms(node):
    """
    Collect the unique docstring terms of ``node`` and all its descendants.

    ``node`` is a dict with a ``doc`` entry (passed to ``tokenize``) and a
    ``children`` entry (a list of dicts of the same shape).  The node's own
    terms come first, followed by terms contributed by each child in order.
    Every term appears once, at the position of its first occurrence.
    """
    terms = []
    seen = set()  # membership test in O(1) instead of scanning ``terms``

    def _add(candidates):
        # Append each not-yet-seen candidate, preserving encounter order.
        for term in candidates:
            if term not in seen:
                seen.add(term)
                terms.append(term)

    _add(tokenize(node['doc']))
    for child in node['children']:
        _add(get_terms(child))

    return terms
def main():
    """
    Parse the Python file named by the first command-line argument and
    pretty-print its definition tree followed by its docstring terms.

    Exits with a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s <python-source-file>' % sys.argv[0])

    path = sys.argv[1]

    # Read raw bytes so ast.parse applies the source file's own encoding
    # declaration rather than the locale default; the with-block guarantees
    # the file is closed even if reading fails.
    with open(path, 'rb') as f:
        source = f.read()

    pt = ast.parse(source, filename=path)
    tree = parse_tree(pt)
    terms = get_terms(tree)

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(tree)
    pp.pprint(terms)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment