Skip to content

Instantly share code, notes, and snippets.

@ianlivingstone
Created July 9, 2011 18:05
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ianlivingstone/1073806 to your computer and use it in GitHub Desktop.
Save ianlivingstone/1073806 to your computer and use it in GitHub Desktop.
Uses the Python AST module to build a dict/list representation of only modules/functions/classes and then pulls keywords from their docstrings
import ast
import sys
import pprint
import re
# Maps the AST node classes that get indexed to the label stored in each
# tree entry's 'type' field.
type_lookup = {
    ast.Module: 'Module',
    ast.FunctionDef: 'Function',
    ast.ClassDef: 'Class',
}

# Matches runs of non-word characters and underscores; tokenize() removes
# these from every term.  Written as a raw string so that '\W' reaches the
# regex engine as-is instead of being an invalid string escape.
pattern = re.compile(r'[\W_]+')

# Terms that are never indexed.  The empty string is listed so that a term
# reduced to nothing by `pattern` is discarded by the same membership test.
stopwords = [
    'the', 'of', 'and', 'to', 'in', 'be', 'will', 'for', 'on', 'is',
    'with', 'by', 'as', 'this', 'are', 'from', 'that', 'or',
    'at', 'been', 'an', 'was', 'were', 'have', 'has', 'it', '',
]
def parse_tree(node):
    """
    Recursively describe a module, class or function node as a nested dict.

    Only the statements directly in ``node.body`` are inspected, and only
    those whose exact type is a key of ``type_lookup`` are recorded, so
    definitions nested inside ``if``/``try``/``with`` blocks are skipped.

    Returns a dict with the keys:
      type     -- label looked up in ``type_lookup``
      name     -- ``node.name``; a node without a name (e.g. ast.Module)
                  falls back to this script's ``__file__`` up to its
                  first '.'
      doc      -- result of ``ast.get_docstring(node)`` (None when absent)
      children -- list of dicts of the same shape for nested definitions

    Raises KeyError when ``node`` itself is not a type in ``type_lookup``.
    """
    kind = type_lookup[type(node)]

    if hasattr(node, 'name'):
        name = node.name
    else:
        # NOTE: this is the path of the running script, not of the file
        # that was parsed; parse_tree is not told the parsed file's name.
        name = __file__.split('.')[0]

    # Filter on type_lookup itself rather than a second hard-coded list so
    # the recursive call can never hit a type the lookup does not know.
    children = [
        parse_tree(child)
        for child in node.body
        if type(child) in type_lookup
    ]

    return dict(
        type=kind,
        name=name,
        doc=ast.get_docstring(node),
        children=children,
    )
def tokenize(text):
    """
    Split a string into lowercase index terms.

    The text is lowercased and split on whitespace; every character
    matched by ``pattern`` is then removed from each piece, and pieces
    that end up empty or that appear in ``stopwords`` are dropped.

    Returns a list of terms in their original order (duplicates kept);
    an empty list when ``text`` is None or empty.
    """
    if not text:
        return []

    output = []
    # split() with no argument breaks on any run of whitespace.  Splitting
    # on ' ' alone left newlines inside a piece, and since `pattern` strips
    # them, the last word of one line was fused with the first of the next.
    for word in text.lower().split():
        term = pattern.sub('', word)
        if not term or term in stopwords:
            continue
        output.append(term)
    return output
def get_terms(node):
    """
    Collect the distinct terms found in a tree entry and its descendants.

    ``node`` is a dict as produced by parse_tree: its 'doc' value is
    tokenized first, then each entry of 'children' is processed
    recursively in order.

    Returns a list in which every term appears once, at the position of
    its first occurrence.
    """
    candidates = tokenize(node['doc'])
    for child in node['children']:
        candidates.extend(get_terms(child))

    # Order-preserving de-duplication; the set gives constant-time
    # membership tests instead of rescanning the result list per term.
    seen = set()
    terms = []
    for term in candidates:
        if term not in seen:
            seen.add(term)
            terms.append(term)
    return terms
def main():
    """
    Parse the Python file named by the first command-line argument and
    pretty-print its definition tree followed by its list of terms.

    Exits with a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s <python-source-file>' % sys.argv[0])

    path = sys.argv[1]
    # The context manager closes the file as soon as it has been read.
    with open(path, 'r') as f:
        source = f.read()

    pt = ast.parse(source, filename=path)
    tree = parse_tree(pt)
    terms = get_terms(tree)

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(tree)
    pp.pprint(terms)


if __name__ == '__main__':
    main()
@ianlivingstone
Copy link
Author

Just a note, this was a proof of concept that you could use the Python parser to index terms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment