Skip to content

Instantly share code, notes, and snippets.

@barrbrain
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barrbrain/8f32260a89c76a2e2eff to your computer and use it in GitHub Desktop.
Save barrbrain/8f32260a89c76a2e2eff to your computer and use it in GitHub Desktop.
Generate English vocabulary guide from eXtended WordNet; requires python-igraph
#!/usr/bin/python
import xml.sax
from igraph import Graph
from collections import defaultdict
class XWN(xml.sax.ContentHandler):
    """SAX handler that collects gloss entries and ranks them via PageRank.

    Every ``<gloss>`` element parsed becomes one dict in ``synSets`` with the
    keys ``synString``, ``synonyms``, ``gloss`` and ``glosses``.  Words are
    stored as ``"<POS>:<word>"`` strings, where ``<POS>`` is the coarse tag
    returned by ``filter_pos``.

    Typical use: feed one or more documents through a SAX parser that has
    this object as content handler, call ``computeEdges()`` once, then call
    ``computeGlossary()``.
    """

    # (long tag, short tag) pairs; the short tag doubles as the prefix that
    # is matched and as the normalised value that is returned.
    _POS_CLASSES = (
        ('VERB', 'VB'),
        ('NOUN', 'NN'),
        ('ADJ', 'JJ'),
        ('ADV', 'RB'),
    )

    def __init__(self):
        # Explicit base-class call (rather than super()) so this works
        # whether ContentHandler is an old-style or a new-style class.
        xml.sax.ContentHandler.__init__(self)
        self.graph = Graph(directed=True)
        self.synSets = []       # one dict per finished <gloss>
        self.currentTag = ""    # innermost element currently open, or ""
        self.synString = ""     # text of the current <synonymSet>
        self.gloss = ""         # text of the current <text>
        self.glosses = []       # "POS:word" for each <wf> in the gloss
        self.pos = ""           # 'pos' attribute of the current <gloss>
        self.wf_pos = ""        # 'pos' attribute of the current <wf>
        self._wf_text = ""      # text accumulated for the current <wf>

    def startElement(self, tag, attributes):
        """Record the opened tag and reset the state that belongs to it.

        Raises KeyError if a ``<gloss>`` or ``<wf>`` has no ``pos`` attribute.
        """
        self.currentTag = tag
        if tag == "gloss":
            self.synString = ""
            self.gloss = ""
            self.glosses = []
            self.pos = attributes['pos']
        elif tag == "synonymSet":
            # Start from scratch so chunks can be appended in characters().
            self.synString = ""
        elif tag == "wf":
            self.wf_pos = attributes['pos']
            self._wf_text = ""

    def filter_pos(self, pos):
        """Collapse a part-of-speech tag to 'VB', 'NN', 'JJ' or 'RB'.

        Tags that match none of the four classes are returned unchanged.
        """
        for long_tag, short_tag in self._POS_CLASSES:
            if pos == long_tag or pos.startswith(short_tag):
                return short_tag
        return pos

    def characters(self, content):
        """Accumulate character data for the element that is currently open.

        A SAX parser is allowed to deliver one text node in several calls,
        so every chunk is appended here and the value is only consumed in
        ``endElement``.
        """
        if self.currentTag == "synonymSet":
            self.synString += content
        elif self.currentTag == "text":
            self.gloss += content
        elif self.currentTag == "wf":
            self._wf_text += content

    def endElement(self, tag):
        """Finalise a ``<wf>`` word or a complete ``<gloss>`` entry."""
        if tag == "wf":
            # An empty <wf> contributes no word.
            if self._wf_text:
                self.glosses.append(
                    self.filter_pos(self.wf_pos) + ':' + self._wf_text)
                self._wf_text = ""
        elif tag == "gloss":
            pos = self.filter_pos(self.pos)
            self.synSets.append({
                'synString': self.synString,
                'synonyms': set(
                    pos + ':' + s for s in self.synString.split(', ')),
                'gloss': self.gloss.strip(),
                'glosses': set(self.glosses)})
        # Text that follows a closing tag belongs to no tracked element.
        self.currentTag = ""

    def computeEdges(self):
        """Build the word graph from the collected entries.

        Only words that occur in at least one gloss are kept.  Each kept
        word becomes a vertex, and every remaining synonym of an entry gets
        an edge to every word used in that entry's gloss.  Entries left
        without synonyms or without gloss words are dropped from
        ``synSets``.

        Vertex ids are taken from the enumeration order of ``glosswords``,
        so this is meant to be called once, on the freshly created graph.
        """
        self.glosswords = set()
        for entry in self.synSets:
            self.glosswords.update(entry['glosses'])

        for entry in self.synSets:
            entry['synonyms'] &= self.glosswords
            # Split only on the first ':' so that a word which itself
            # contains a colon is not truncated.
            entry['synString'] = ', '.join(
                s.split(':', 1)[1] for s in entry['synonyms'])
        self.synSets = [
            entry for entry in self.synSets
            if entry['synonyms'] and entry['glosses']]

        index = {}
        for i, word in enumerate(self.glosswords):
            self.graph.add_vertex(name=word)
            index[word] = i

        edges = set()
        for entry in self.synSets:
            targets = [index[x] for x in entry['glosses']]
            for word in entry['synonyms']:
                source = index[word]
                edges.update((source, target) for target in targets)
        self.graph.add_edges(edges)

    def computeGlossary(self):
        """Return the entries as ``"synonyms: gloss\\n"`` lines, best first.

        An entry's score is the smallest PageRank among its gloss words and
        synonyms.  Lines are sorted by descending score, and every entry
        whose score equals the lowest score present is left out.

        ``computeEdges()`` must have been called first; ``glosswords`` must
        not be modified in between, because ranks are matched to words by
        iteration order.
        """
        word_rank = dict(zip(self.glosswords, self.graph.pagerank()))
        glossary = [
            ("%s: %s\n" % (entry['synString'], entry['gloss']),
             min(word_rank[w] for w in entry['glosses'] | entry['synonyms']))
            for entry in self.synSets]
        glossary.sort(key=lambda g: g[1], reverse=True)
        return [g[0] for g in glossary if g[1] > glossary[-1][1]]
if __name__ == "__main__":
    # Parse the four part-of-speech files (expected in the current working
    # directory) into a single handler, rank the collected entries and
    # write the result to dict.txt.
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    xwn = XWN()
    parser.setContentHandler(xwn)
    for source in ("adj.xml", "adv.xml", "noun.xml", "verb.xml"):
        parser.parse(source)
    xwn.computeEdges()
    # The context manager guarantees the file is flushed and closed; the
    # previous code referenced f.close without calling it.
    with open("dict.txt", "w") as f:
        f.writelines(xwn.computeGlossary())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment