Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Uses the Python AST module to build a dict/list representation of only modules, functions, and classes, and then pulls keywords from their docstrings
import ast
import os
import pprint
import re
import sys
# Maps the AST node classes this script indexes to the human-readable label
# stored under the 'type' key of each parse_tree() result.
type_lookup = {
    ast.Module: 'Module',
    ast.FunctionDef: 'Function',
    ast.ClassDef: 'Class',
}

# Matches runs of non-word characters and underscores; tokenize() uses it to
# strip punctuation from each term.  A raw string is required here: '\W' in a
# plain string literal is an invalid escape sequence.
pattern = re.compile(r'[\W_]+')

# Terms that tokenize() discards.  The empty string is listed so that words
# reduced to nothing by punctuation stripping are discarded as well.
stopwords = [
    'the', 'of', 'and', 'to', 'in', 'be', 'will', 'for', 'on', 'is',
    'with', 'by', 'as', 'this', 'are', 'from', 'that', 'or',
    'at', 'been', 'an', 'was', 'were', 'have', 'has', 'it', '',
]
def parse_tree(node):
    """
    Recursively build a dict describing a Module, Class, or Function node.

    Only definitions that appear directly in a node's body are visited;
    definitions nested inside other statements (``if``, ``try``, loops, ...)
    are skipped because those statements are not descended into.

    Arguments:
        node: an ``ast`` node whose type is a key of ``type_lookup``.

    Returns:
        A dict with the keys ``type`` (label from ``type_lookup``), ``name``,
        ``doc`` (the docstring, or None when there is none) and ``children``
        (a list of dicts of the same shape, in source order).

    Raises:
        KeyError: if the type of ``node`` is not a key of ``type_lookup``.
    """
    if hasattr(node, 'name'):
        name = node.name
    else:
        # Module nodes carry no name, so fall back to the path of this
        # script minus its extension.  splitext() removes only the final
        # extension; splitting on the first '.' truncated paths such as
        # './script.py' or '/home/john.doe/script.py'.
        # NOTE(review): this names the module after the running script, not
        # after the file that was parsed -- confirm that is intended.
        name = os.path.splitext(__file__)[0]

    tree = dict(
        type=type_lookup[type(node)],
        name=name,
        doc=ast.get_docstring(node),
        children=[],
    )

    for child in node.body:
        # Filter on the same table used for the label lookup above so the
        # two can never disagree about which node types are supported.
        if type(child) in type_lookup:
            tree['children'].append(parse_tree(child))

    return tree
def tokenize(text):
    """
    Split text into lower-cased terms with punctuation and stopwords removed.

    Arguments:
        text: the string to tokenize; None or '' yields an empty list.

    Returns:
        A list of terms in order of appearance (duplicates are kept).
    """
    if not text:
        return []

    output = []
    # split() with no argument breaks on any run of whitespace.  Docstrings
    # are usually multi-line, and splitting on ' ' alone glued the last word
    # of one line to the first word of the next once the newline between
    # them was stripped as a non-word character.
    for word in text.lower().split():
        term = pattern.sub('', word)
        # stopwords contains '', so punctuation-only words are dropped here.
        if term not in stopwords:
            output.append(term)
    return output
def get_terms(node):
    """
    Collect the unique docstring terms of a node and all of its descendants.

    Arguments:
        node: a dict as produced by parse_tree(), providing the keys 'doc'
            and 'children'.

    Returns:
        A list of terms without duplicates, in order of first appearance:
        the node's own terms first, then those of each child in turn.
    """
    terms = []
    # Set mirror of `terms`: constant-time membership tests while the list
    # preserves first-seen order.
    seen = set()

    def add(candidates):
        # Append each not-yet-seen candidate.  Checking one term at a time
        # also removes repeats *within* a single candidate list, which a
        # filter-then-extend approach lets through.
        for term in candidates:
            if term not in seen:
                seen.add(term)
                terms.append(term)

    add(tokenize(node['doc']))
    for child in node['children']:
        add(get_terms(child))
    return terms
def main():
    """
    Parse the Python file named on the command line, then pretty-print its
    definition tree followed by the terms found in its docstrings.

    Exits with a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits non-zero.
        sys.exit('usage: %s <python-file>' % sys.argv[0])

    path = sys.argv[1]
    # The with-block closes the file even when reading raises.
    with open(path, 'r') as f:
        source = f.read()

    pt = ast.parse(source, filename=path)
    tree = parse_tree(pt)
    terms = get_terms(tree)

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(tree)
    pp.pprint(terms)


if __name__ == '__main__':
    main()
@ianlivingstone
Copy link
Author

ianlivingstone commented Jul 14, 2011

Just a note, this was a proof of concept that you could use the Python parser to index terms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment