Skip to content

Instantly share code, notes, and snippets.

@sstults
Created November 11, 2015 02:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sstults/715267e06c4d77c37e06 to your computer and use it in GitHub Desktop.
Save sstults/715267e06c4d77c37e06 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python -x
import pysolr
import sys
from nltk.corpus import wordnet as wn
class Indexer:
    """Load WordNet noun synsets into a Solr index in batches.

    Each synset becomes one Solr document holding its name plus the names of
    its hypernyms, hyponyms, member holonyms, and lemmas.
    """

    def __init__(self, args):
        """Create the document buffer and the Solr client.

        Args:
            args: Parsed command-line options. The attributes ``url``,
                ``batch_size`` and ``delete`` are read from it.
        """
        self.args = args
        self.doc_buffer = []
        self.doc_count = 0
        # The timeout must be passed by keyword: the second positional
        # parameter of pysolr.Solr is the decoder, not the timeout.
        self.solr = pysolr.Solr(args.url, timeout=100)

    def add_doc(self, solrdoc):
        """Queue one document and flush the buffer once a batch is full.

        Args:
            solrdoc: Dictionary describing a single Solr document.
        """
        self.doc_buffer.append(solrdoc)
        self.doc_count += 1
        if len(self.doc_buffer) >= self.args.batch_size:
            # post() also empties the buffer, so no reset is needed here.
            self.post()

    def post(self):
        """Send the buffered documents to Solr, clear the buffer, report."""
        # Skip the request when nothing is buffered (for example the final
        # flush after an exact multiple of batch_size documents).
        if self.doc_buffer:
            self.solr.add(self.doc_buffer)
        self.doc_buffer = []
        self.status()

    def status(self):
        """Write the running document count on a single refreshing line."""
        sys.stdout.write("Added %d records\r" % self.doc_count)
        sys.stdout.flush()

    @staticmethod
    def make_doc_from_synset(synset):
        """Build the Solr document for one synset.

        Args:
            synset: Object exposing ``name()``, ``hypernyms()``,
                ``hyponyms()``, ``member_holonyms()`` and ``lemmas()``.

        Returns:
            A dict with the synset name under ``id`` and lists of related
            names under ``hypernyms``, ``hyponyms``, ``holonyms`` and
            ``lemmas``.
        """
        return {
            'id': synset.name(),
            'hypernyms': [x.name() for x in synset.hypernyms()],
            'hyponyms': [x.name() for x in synset.hyponyms()],
            'holonyms': [x.name() for x in synset.member_holonyms()],
            'lemmas': [x.name() for x in synset.lemmas()],
        }

    def load_all_synsets(self):
        """Index every noun synset, optionally wiping the index first."""
        if self.args.delete:
            # Delete-by-query needs the ``q`` keyword; a bare positional
            # argument is treated as a document id.
            self.solr.delete(q='*:*')
        for synset in wn.all_synsets('n'):
            self.add_doc(self.make_doc_from_synset(synset))
        self.post()  # add the remainder of docs
        # Commit explicitly so the documents become searchable even when
        # the client does not commit on every add.
        self.solr.commit()
        print()
        print("Done")
if __name__ == "__main__":
    import argparse
    import textwrap

    # Command-line entry point: parse the options, then index all synsets.
    usage_example = textwrap.dedent('''
        Example: \n\n
        ./wordnet-indexer.py -d -b 1000
        ''')

    cli = argparse.ArgumentParser(
        prog='wordnet-indexer',
        description='Loads some WordNet into Solr',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_example,
    )
    cli.add_argument(
        '-u', '--url',
        action='store',
        default='http://localhost:8983/solr/gettingstarted',
        help='Solr base url',
    )
    cli.add_argument(
        '-d', '--delete',
        action='store_true',
        help='Delete all docs first',
    )
    cli.add_argument(
        '-b', '--batch_size',
        action='store',
        type=int,
        default=1000,
        help='Solr update batch size',
    )

    options = cli.parse_args()
    indexer = Indexer(options)
    indexer.load_all_synsets()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment