Skip to content

Instantly share code, notes, and snippets.

@TMPxyz
Created March 27, 2022 12:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TMPxyz/8def28453f80dd6f58203551a12983a3 to your computer and use it in GitHub Desktop.
Save TMPxyz/8def28453f80dd6f58203551a12983a3 to your computer and use it in GitHub Desktop.
Python: Use Whoosh to index WordNet and search its fields
# %%
import os

import nltk
from nltk.corpus import wordnet2021 as wn
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from tqdm.notebook import tqdm
from whoosh.qparser import QueryParser
from IPython.display import display
# Handle to the Whoosh index; None until an index is created or opened below.
ix = None
# %% [markdown]
# ## Create Index
# %% [markdown]
# ### Prepare schema and index dir
# %%
# create_in() expects the target directory to exist already, so make sure it
# does before building the index (a no-op when the directory is present).
os.makedirs('wn_index', exist_ok=True)

# Schema of the index: one document per WordNet synset, every field stored so
# that it can be read back from search hits.
schema = Schema(
    sskey=ID(stored=True),
    lemma_names=ID(stored=True),
    # Lemma names for the 'cmn' language, indexed as 1- to 4-character
    # n-grams so that partial words can be matched at query time.
    cmn_lemma_names=NGRAMWORDS(1, 4, stored=True),
    definitions=TEXT(stored=True),
    pos=ID(stored=True),
)
# NOTE(review): create_in() appears to start a fresh index in the directory;
# confirm that overwriting an existing index on re-run is intended.
ix = create_in('wn_index', schema)
# %%
# Materialise the synsets into a list so that tqdm receives a sized sequence
# rather than a bare iterator.
all_ss = list(wn.all_synsets())
# %% [markdown]
# ### Write index
# %%
# Use the writer as a context manager: it commits when the loop finishes and
# cancels (releasing the index write lock) if adding any document raises,
# instead of leaving an uncommitted writer open.
with ix.writer() as writer:
    for ss in tqdm(all_ss):
        writer.add_document(
            sskey=ss.name(),
            # NOTE(review): a list of names is passed to an ID field here;
            # confirm this is indexed/stored the way the queries expect.
            lemma_names=ss.lemma_names(),
            # Space-joined so that each 'cmn' lemma is a separate word for
            # the n-gram field.
            cmn_lemma_names=' '.join(ss.lemma_names('cmn')),
            definitions=ss.definition(),
            pos=ss.pos(),
        )
# %% [markdown]
# ## Query
# %%
# Reuse the index created above when there is one; otherwise open the
# on-disk index read-only.
if ix is None:
    ix = open_dir('wn_index', readonly=True)

# The searcher is closed automatically when the with-block exits.
with ix.searcher() as searcher:
    # Parse the query text against the 'cmn' lemma-name field.
    parser = QueryParser("cmn_lemma_names", ix.schema)
    query = parser.parse("氯化")
    results = searcher.search(query)
    # Show each hit with the notebook's rich display.
    for hit in results:
        display(hit)
# %% [markdown]
# ## Close
# %%
# Close the index if one is open, then clear the handle so that the query
# cell re-opens it on the next run.
if ix is not None:
    ix.close()
ix = None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment