Skip to content

Instantly share code, notes, and snippets.

@kleem
Last active February 22, 2023 09:52
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kleem/6ab92f48ef961da271ab to your computer and use it in GitHub Desktop.
Save kleem/6ab92f48ef961da271ab to your computer and use it in GitHub Desktop.
WordNet noun graph

This experiment converts an SQL version of WordNet 3.0 into a graph, using the python library graph-tool. In order to create a taxonomical structure, only noun synsets, hyponym links and hypernym links are considered.

The result of the conversion is saved as GraphML, then rendered as the following hairball:

WordNet 3.0 taxonomy as a graph

Since the graph can be considered a tangled tree, i.e. a tree in which some nodes have multiple parents, two untangled versions (using longest and shortest paths) are also provided as GraphML. Only a few links are lost (about 2%), making the tree a good approximation of the noun taxonomy graph.

from graph_tool.all import *
g = load_graph('wnen30_noun_taxonomy.graphml.xml.gz')
graph_draw(g,
output_size=(2000, 2000),
output='wnen30_noun_taxonomy.png',
vertex_fill_color=(0.9,0.9,0.9,1),
edge_pen_width=1.2
)
g = load_graph('wnen30_noun_tree_shortest.graphml.xml.gz')
# use the same layout for the two trees
layout = sfdp_layout(g)
graph_draw(g,
pos=layout,
output_size=(2000, 2000),
output='wnen30_noun_tree_shortest.png',
vertex_fill_color=(0.9,0.9,0.9,1),
edge_pen_width=1.2
)
g = load_graph('wnen30_noun_tree_longest.graphml.xml.gz')
graph_draw(g,
pos=layout,
output_size=(2000, 2000),
output='wnen30_noun_tree_longest.png',
vertex_fill_color=(0.9,0.9,0.9,1),
edge_pen_width=1.2
)
# read from WordNet 3.0 MySQL database
import MySQLdb
import MySQLdb.cursors
def get_cursor():
return MySQLdb.connect(
host='opendata',
db='wordnet_en_3.0',
user='read',
passwd='read',
cursorclass=MySQLdb.cursors.DictCursor,
use_unicode=True,
charset='utf8'
).cursor()
# generate a python-graph-tools Graph
from graph_tool.all import *
g = Graph()
# define properties for synset nodes
synsetid = g.new_vertex_property('int32_t')
pos = g.new_vertex_property('string')
definition = g.new_vertex_property('string')
# store properties into the graph
g.vertex_properties['synsetid'] = synsetid
g.vertex_properties['pos'] = pos
g.vertex_properties['definition'] = definition
# maintain a map from synsetids to vertexes
synset_map = {}
print 'Retrieving synsets...'
# for each synset with pos=n, create a vertex
c = get_cursor()
c.execute("""SELECT * FROM synsets WHERE pos='n'""")
for i, row in enumerate(c.fetchall()):
print 'Synsets added to the graph: %d\r' % (i+1),
v = g.add_vertex()
synset_map[row['synsetid']] = v
synsetid[v] = row['synsetid']
pos[v] = row['pos']
definition[v] = row['definition']
print
print 'Retrieving semlinks...'
# for each semlink, create an edge if both ends are found in the synset map (noun synsets)
c = get_cursor()
c.execute("""(SELECT synset1id AS parent, synset2id AS child FROM semlinks WHERE linkid IN (2,4))
UNION DISTINCT
(SELECT synset2id AS parent, synset1id AS child FROM semlinks WHERE linkid IN (1,3))""")
i = 0
for row in c.fetchall():
print 'Semlinks added to the graph: %d\r' % (i+1),
try:
e = g.add_edge(synset_map[row['parent']], synset_map[row['child']])
i += 1
except KeyError:
print
print 'Link between %d and %d skipped' % (row['parent'], row['child'])
print
print 'Saving the graph...'
g.save('wnen30_noun_taxonomy.graphml.xml.gz')
# LOAD
from graph_tool.all import *
print 'Loading graph...'
g = load_graph('wnen30_noun_taxonomy.graphml.xml.gz')
# read properties for synset nodes
synsetid = g.vertex_properties['synsetid']
pos = g.vertex_properties['pos']
definition = g.vertex_properties['definition']
# COMPUTE
print 'Computing topological sort...'
topological_order = topological_sort(g)
# print topological_order
print 'Computing longest distances from root (entity synset is vertex 0)...'
print 'Assigning initial distances...'
longest_dist = g.new_vertex_property('int32_t')
for v in g.vertices():
longest_dist[v] = -2147483648 # minimum int32_t number as -Inf ???
longest_dist[g.vertex(0)] = 0 # root is initialized to 0
from itertools import imap
# unweighted topological sort (all edges have weight=1)
# adapted from http://www.geeksforgeeks.org/find-longest-path-directed-acyclic-graph/
for u in imap(g.vertex, reversed(topological_order)):
# print 'Considering node %d (current distance %d)' % (u, longest_dist[u])
for v in u.out_neighbours():
# print ' Comparing with node %d (current distance %d)' % (v, longest_dist[v])
if longest_dist[v] < longest_dist[u]+1:
longest_dist[v] = longest_dist[u]+1
# print ' Updated distance to %d' % (longest_dist[v])
print 'Creating the longest path tree...'
# all edges have to lead to a node with depth equal to longest_dist, otherwise they are part of a not-maximal path
# in the case of a tie, an already seen node must not be taken twice
from itertools import izip
retain_longest = g.new_edge_property('bool')
already_visited = set()
def walk(node, depth):
for edge, child in izip(node.out_edges(), node.out_neighbours()):
if longest_dist[child] == depth and child not in already_visited:
retain_longest[edge] = 1
already_visited.add(child)
walk(child, depth+1)
else:
retain_longest[edge] = 0
tree = walk(g.vertex(0), 1)
# EXPORT
print 'Saving the graph (retained links are marked with "tree_link" = 1)...'
g.edge_properties['tree_link'] = retain_longest
g.save('wnen30_noun_tree_longest.graphml.xml.gz')
# LOAD
from graph_tool.all import *
print 'Loading graph...'
g = load_graph('wnen30_noun_taxonomy.graphml.xml.gz')
# read properties for synset nodes
synsetid = g.vertex_properties['synsetid']
pos = g.vertex_properties['pos']
definition = g.vertex_properties['definition']
# COMPUTE
print 'Computing shortest distances from root (entity synset is vertex 0)...'
shortest_dist = shortest_distance(g, source=g.vertex(0), directed=None)
print 'Creating the shortest path tree...'
# all edges have to lead to a node with depth equal to shortest_dist, otherwise they are part of a not-minimal path
# in the case of a tie, an already seen node must not be taken twice
from itertools import izip
retain_shortest = g.new_edge_property('bool')
already_visited = set()
def walk(node, depth):
for edge, child in izip(node.out_edges(), node.out_neighbours()):
if shortest_dist[child] == depth and child not in already_visited:
retain_shortest[edge] = 1
already_visited.add(child)
walk(child, depth+1)
else:
retain_shortest[edge] = 0
tree = walk(g.vertex(0), 1)
# EXPORT
print 'Saving the graph (retained links are marked with "tree_link" = 1)...'
g.edge_properties['tree_link'] = retain_shortest
g.save('wnen30_noun_tree_shortest.graphml.xml.gz')
View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

@ezequias
Copy link

Is there any program could I open the taxonomy?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment