Skip to content

Instantly share code, notes, and snippets.

@sdockray
Created May 6, 2020 13:13
Show Gist options
  • Save sdockray/1efd892641473405ae3e21d89ae7d968 to your computer and use it in GitHub Desktop.
Two functions for indexing a page
# spaCy supplies the NLP pipeline used below for named-entity and noun-chunk extraction.
import spacy
# Load the small English model once at import time; both indexing functions reuse it.
nlp = spacy.load('en_core_web_sm')
def add_to_index(index, content, file_name, page_num):
    """Record named entities found on one page into a nested occurrence index.

    The index is mutated in place and has the shape
    ``{entity_text: {file_name: {page_num: count}}}``.

    Args:
        index: dict to update in place.
        content: raw page text; run through ``filter_text()`` first
            (defined elsewhere in the project — presumably cleans the text).
        file_name: identifier of the source document.
        page_num: page number within that document.
    """
    doc = nlp(filter_text(content))
    # Set gives O(1) label membership; the original also built an unused
    # `the_entities` list, which is removed here.
    interesting_labels = {'PERSON', 'WORK_OF_ART', 'LOC', 'GPE'}
    for entity in doc.ents:
        if entity.label_ in interesting_labels:
            # Create the nested dicts on first sight, then bump the page count.
            pages = index.setdefault(entity.text, {}).setdefault(file_name, {})
            pages[page_num] = pages.get(page_num, 0) + 1
def add_to_phrasebook(index, content, file_name, page_num):
    """Record noun chunks found on one page into a nested occurrence index.

    Mirrors ``add_to_index`` but indexes every noun chunk instead of a
    filtered set of named entities. The index is mutated in place and has
    the shape ``{chunk_text: {file_name: {page_num: count}}}``.

    Args:
        index: dict to update in place.
        content: raw page text; run through ``filter_text()`` first
            (defined elsewhere in the project — presumably cleans the text).
        file_name: identifier of the source document.
        page_num: page number within that document.
    """
    doc = nlp(filter_text(content))
    for chunk in doc.noun_chunks:
        # Create the nested dicts on first sight, then bump the page count
        # (same pattern as add_to_index, kept consistent).
        pages = index.setdefault(chunk.text, {}).setdefault(file_name, {})
        pages[page_num] = pages.get(page_num, 0) + 1
# Shared index built up by the functions above: text -> file -> page -> count.
index = {}
# Usage: for every document, extract each page's content into a list, then call
# the functions above with this (initially empty) index, the page content, the
# document's filename, and the page number — that incrementally builds the index.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment