Skip to content

Instantly share code, notes, and snippets.

@salt-die
Last active May 31, 2021 12:22
Show Gist options
  • Save salt-die/8ae39f4914ef1d603f3705a43941557c to your computer and use it in GitHub Desktop.
Save salt-die/8ae39f4914ef1d603f3705a43941557c to your computer and use it in GitHub Desktop.
Measuring AST similarity for plagiarism detection.
from ast import iter_child_nodes, parse, Load, Store, Delete
from igraph import Graph
class EnumDict(dict):
def __init__(self):
self.types = [ ]
def __missing__(self, key):
self[key] = len(self.types)
self.types.append(key)
return self[key]
TYPE_ENUM = EnumDict() # AST node type -> int
CONTEXTS = Load, Store, Delete
def sources_to_igraph(*sources, skip_contexts=True):
for source in sources:
graph = Graph(directed=True)
tree = parse(source)
_ast_visit(graph, tree, skip_contexts)
yield graph
def _ast_visit(graph, node, skip_contexts):
vertex = graph.add_vertex(label=TYPE_ENUM[type(node).__name__])
for child in iter_child_nodes(node):
if skip_contexts and isinstance(child, CONTEXTS):
continue
descendent = _ast_visit(graph, child, skip_contexts)
graph.add_edge(vertex, descendent)
return vertex
if __name__ == '__main__':
# Plotting AST
from igraph import plot
g = next(sources_to_igraph('for i in range(5): print(i)'))
plot(g, target='astgraph.png', vertex_label=[TYPE_ENUM.types[i] for i in g.vs['label']], vertex_label_dist=2, margin=40)
# Kernels
from graphkernels.kernels import CalculateWLKernel as WLKernel
entry_1 = 'for i in range(5): print(i)'
entry_2 = 'for k in range(20): print(k + "!")'
entry_3 = 'for i, k in enumerate(range(20)): print(i, k)'
graphs = list( sources_to_igraph(entry_1, entry_2, entry_3) )
k = WLKernel(graphs)
print(k)
# [[ 88. 50. 56.]
# [ 50. 96. 52.]
# [ 56. 52. 192.]]
@salt-die
Copy link
Author

astgraph

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment