Skip to content

Instantly share code, notes, and snippets.

@erickrf
Last active October 23, 2018 20:53
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save erickrf/15739831760ff261c6d0d39c2937ed6a to your computer and use it in GitHub Desktop.
Save erickrf/15739831760ff261c6d0d39c2937ed6a to your computer and use it in GitHub Desktop.
Functions to access the OpenWordnetPT graph
# -*- coding: utf-8 -*-
'''
Functions to read the OpenWordnetPT from RDF files and provide
access to it.
'''
import rdflib
from six.moves import cPickle
# Namespaces of the OpenWordnet-PT (own-pt) and NomLex-PT RDF schemas.
ownns = rdflib.Namespace('https://w3id.org/own-pt/wn30/schema/')
nomlexns = rdflib.Namespace('https://w3id.org/own-pt/nomlex/schema/')

# RDF classes declared by the wordnet schema.
word_type = ownns['Word']
word_sense_type = ownns['WordSense']

# Predicates used by the functions below to walk the graph.
type_pred = rdflib.RDF.type
# word node -> lexical form literal
lexical_form_predicate = ownns['lexicalForm']
# word sense node -> word node
word_pred = ownns['word']
# synset node -> word sense node
contains_sense_pred = ownns['containsWordSense']
# nominalization node -> verb / noun word node
nomlex_verb_pred = nomlexns['verb']
nomlex_noun_pred = nomlexns['noun']

# Module-wide wordnet graph; stays None until load_wordnet() succeeds.
_wn_graph = None
def load_wordnet(path, force=False):
    """
    Load the wordnet graph from the given path and keep it module-wide.

    A call to this function is necessary before using the other ones in
    this module.

    :param path: path to either a .nt or a .pickle file. A pickled file
        should contain a previously serialized wordnet graph.
    :param force: if True, reloads the file even if a graph has been
        previously loaded.
    :raises ValueError: if the file extension is neither .nt nor .pickle
    """
    global _wn_graph

    if not force and _wn_graph is not None:
        # a graph is already loaded and no reload was requested
        return

    # Build the graph in a local variable and publish it only after loading
    # succeeded. Assigning an empty Graph to the module variable before
    # parsing would leave that empty graph installed if parsing fails, and
    # later calls without force=True would then silently do nothing.
    if path.endswith('.nt'):
        graph = rdflib.Graph()
        graph.parse(path, format='nt')
    elif path.endswith('.pickle'):
        # NOTE: unpickling can execute arbitrary code; only load pickle
        # files that come from a trusted source.
        with open(path, 'rb') as f:
            graph = cPickle.load(f)
    else:
        raise ValueError('Wordnet file extension is neither .nt or .pickle')

    _wn_graph = graph
def find_synonyms(word):
    """
    Collect the words of every synset that contains the given word.

    :param word: unicode string
    :return: a set of unicode strings; empty if the word has no synsets
    """
    words_per_synset = (get_synset_words(synset)
                        for synset in find_synsets(word))
    # merge the word lists of all synsets into a single set
    return set().union(*words_per_synset)
def are_synonyms(word1, word2):
    """
    Tell whether two words have at least one synset in common.

    :return: True if some synset contains both words, False otherwise
    """
    return not find_synsets(word1).isdisjoint(find_synsets(word2))
def get_word_node(word):
    """
    Look up the own-pt RDF node whose lexical form is the given word.

    :param word: unicode string
    :return: the word node, or None if no node has this lexical form
    """
    # the lookup uses a literal tagged with the 'pt' language
    literal = rdflib.Literal(word, lang='pt')
    return _wn_graph.value(subject=None,
                           predicate=lexical_form_predicate,
                           object=literal)
def word_node_to_string(word_node):
    """
    Convert an own-pt word node into the string of its lexical form.

    :param word_node: RDF node of a word
    :return: the lexical form of the node, converted to a Python value
    """
    # any=False asks rdflib to reject nodes with more than one lexical form
    return _wn_graph.value(subject=word_node,
                           predicate=lexical_form_predicate,
                           any=False).toPython()
def find_synsets(word):
    """
    Find all synsets containing the given word in the loaded graph.

    :param word: unicode string
    :return: a set of synsets (rdflib objects). It is empty if the word is
        not in the wordnet
    """
    node = get_word_node(word)
    found = set()

    if node is not None:
        # word nodes are linked to word sense nodes, which in turn are
        # linked to the synsets that contain them
        for sense in _wn_graph.subjects(word_pred, node):
            found.update(_wn_graph.subjects(contains_sense_pred, sense))

    return found
def get_synset_words(synset):
    """
    List the words belonging to a synset.

    :param synset: synset node (rdflib object)
    :return: a list of strings, one per word sense of the synset
    """
    # a synset has many word senses; each word sense points to a Word node
    # (looked up with any=False) and each Word has a lexical form
    return [
        word_node_to_string(_wn_graph.value(sense, word_pred, any=False))
        for sense in _wn_graph.objects(synset, contains_sense_pred)
    ]
def find_nominalizations(word):
    """
    Find the nominalizations of the given verb.

    :param word: the verb, as a unicode string
    :return: a list of possible nominalizations, as strings. It is empty
        if the word is not in the wordnet
    """
    verb_node = get_word_node(word)
    if verb_node is None:
        return []

    # a nominalization object links a verb to its noun
    return [
        word_node_to_string(_wn_graph.value(nom, nomlex_noun_pred))
        for nom in _wn_graph.subjects(nomlex_verb_pred, verb_node)
    ]
@nathanshartmann
Copy link

Feature request: add a function that counts how many hypernyms and hyponyms a word has.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment