Created
May 6, 2020 13:13
-
-
Save sdockray/1efd892641473405ae3e21d89ae7d968 to your computer and use it in GitHub Desktop.
Two functions for indexing a page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy

# Load the small English spaCy pipeline once at import time; the indexing
# functions in this file reuse this module-level `nlp` object on every call.
nlp = spacy.load('en_core_web_sm')
def add_to_index(index, content, file_name, page_num):
    """Count named-entity mentions on one page and record them in *index*.

    The page text is passed through ``filter_text`` and then the
    module-level spaCy pipeline ``nlp``. Every entity labelled PERSON,
    WORK_OF_ART, LOC or GPE increments
    ``index[entity_text][file_name][page_num]`` by one.

    Args:
        index: Nested dict, mutated in place, shaped
            ``{entity_text: {file_name: {page_num: count}}}``.
        content: Text of a single page.
        file_name: Key identifying the document the page belongs to.
        page_num: Key identifying the page within that document.

    Returns:
        None. The result is the mutation of *index*.
    """
    # NOTE(review): filter_text is not defined in this snippet; it has to be
    # provided elsewhere in the module before this function is called.
    doc = nlp(filter_text(content))
    interesting_entities = ('PERSON', 'WORK_OF_ART', 'LOC', 'GPE')
    for entity in doc.ents:
        if entity.label_ not in interesting_entities:
            continue
        # Create the nested levels on first sight, then bump the page count.
        pages = index.setdefault(entity.text, {}).setdefault(file_name, {})
        pages[page_num] = pages.get(page_num, 0) + 1
def add_to_phrasebook(index, content, file_name, page_num):
    """Count noun-chunk occurrences on one page and record them in *index*.

    The page text is passed through ``filter_text`` and then the
    module-level spaCy pipeline ``nlp``. Every noun chunk found increments
    ``index[chunk_text][file_name][page_num]`` by one.

    Args:
        index: Nested dict, mutated in place, shaped
            ``{chunk_text: {file_name: {page_num: count}}}``.
        content: Text of a single page.
        file_name: Key identifying the document the page belongs to.
        page_num: Key identifying the page within that document.

    Returns:
        None. The result is the mutation of *index*.
    """
    # NOTE(review): filter_text is not defined in this snippet; it has to be
    # provided elsewhere in the module before this function is called.
    doc = nlp(filter_text(content))
    for chunk in doc.noun_chunks:
        # Create the nested levels on first sight, then bump the page count.
        pages = index.setdefault(chunk.text, {}).setdefault(file_name, {})
        pages[page_num] = pages.get(page_num, 0) + 1
# Starts empty; the indexing functions defined in this file fill it in place.
index = {}
# Usage: for every document, extract each page's content into a list,
# then call the functions above with this index, the page content, the
# file name of the document, and the page number.
# Each call adds that page's counts, so the index is built up incrementally.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment