sdockray/example.py

## example.py
import spacy
nlp = spacy.load('en_core_web_sm')

def add_to_index(index, content, file_name, page_num):
    """Count the named entities found in one page of text.

    Passes ``content`` through ``filter_text`` and the module-level spaCy
    pipeline ``nlp``, then records every entity labelled PERSON,
    WORK_OF_ART, LOC or GPE in ``index``.

    Args:
        index: Nested dict, mutated in place, shaped
            ``{entity_text: {file_name: {page_num: count}}}``.
        content: Text of the page to analyse.
        file_name: Key identifying the document the page belongs to.
        page_num: Key identifying the page within that document.

    Returns:
        None. ``index`` is updated in place.
    """
    # NOTE(review): filter_text is not defined in the visible part of this
    # file -- confirm it is provided elsewhere before calling.
    doc = nlp(filter_text(content))
    interesting_entities = {'PERSON', 'WORK_OF_ART', 'LOC', 'GPE'}
    for entity in doc.ents:
        if entity.label_ not in interesting_entities:
            continue
        # setdefault creates the missing levels the first time an
        # entity/file pair is seen and returns the existing dict otherwise.
        pages = index.setdefault(entity.text, {}).setdefault(file_name, {})
        pages[page_num] = pages.get(page_num, 0) + 1


def add_to_phrasebook(index, content, file_name, page_num):
    """Tally every noun chunk of one page into ``index``.

    The page text is passed through ``filter_text`` and the module-level
    spaCy pipeline ``nlp``; each noun chunk's text is then counted under
    ``index[chunk_text][file_name][page_num]``.

    Args:
        index: Nested dict, mutated in place, shaped
            ``{chunk_text: {file_name: {page_num: count}}}``.
        content: Text of the page to analyse.
        file_name: Key identifying the document the page belongs to.
        page_num: Key identifying the page within that document.

    Returns:
        None. ``index`` is updated in place.
    """
    parsed = nlp(filter_text(content))
    for noun_chunk in parsed.noun_chunks:
        phrase = noun_chunk.text
        per_file = index.setdefault(phrase, {})
        per_page = per_file.setdefault(file_name, {})
        # Start from zero the first time this page is seen for the phrase.
        per_page[page_num] = per_page.get(page_num, 0) + 1

index = {}
# Usage: for every document, extract the text of each page into a list,
# then call the functions above once per page, passing this index (empty at
# the start), the page's text, the file name of the document, and the page
# number. Each call adds that page's counts, so the index is built up
# incrementally.
# NOTE(review): both functions write into whichever dict they are given;
# presumably entities and phrases are meant to go into separate dicts --
# confirm.
	import spacy
	nlp = spacy.load('en_core_web_sm')

	def add_to_index(index, content, file_name, page_num):
	    """Count the named entities found in one page of text.

	    Passes ``content`` through ``filter_text`` and the spaCy pipeline
	    ``nlp``, then records every entity labelled PERSON, WORK_OF_ART,
	    LOC or GPE in ``index``.

	    Args:
	        index: Nested dict, mutated in place, shaped
	            ``{entity_text: {file_name: {page_num: count}}}``.
	        content: Text of the page to analyse.
	        file_name: Key identifying the document the page belongs to.
	        page_num: Key identifying the page within that document.

	    Returns:
	        None. ``index`` is updated in place.
	    """
	    # NOTE(review): filter_text is not defined in the visible part of
	    # this file -- confirm it is provided elsewhere before calling.
	    doc = nlp(filter_text(content))
	    interesting_entities = {'PERSON', 'WORK_OF_ART', 'LOC', 'GPE'}
	    for entity in doc.ents:
	        if entity.label_ not in interesting_entities:
	            continue
	        # setdefault creates the missing levels the first time an
	        # entity/file pair is seen and returns the existing dict otherwise.
	        pages = index.setdefault(entity.text, {}).setdefault(file_name, {})
	        pages[page_num] = pages.get(page_num, 0) + 1


	def add_to_phrasebook(index, content, file_name, page_num):
	    """Tally every noun chunk of one page into ``index``.

	    The page text is passed through ``filter_text`` and the spaCy
	    pipeline ``nlp``; each noun chunk's text is then counted under
	    ``index[chunk_text][file_name][page_num]``.

	    Args:
	        index: Nested dict, mutated in place, shaped
	            ``{chunk_text: {file_name: {page_num: count}}}``.
	        content: Text of the page to analyse.
	        file_name: Key identifying the document the page belongs to.
	        page_num: Key identifying the page within that document.

	    Returns:
	        None. ``index`` is updated in place.
	    """
	    # NOTE(review): filter_text is not defined in the visible part of
	    # this file -- confirm it is provided elsewhere before calling.
	    doc = nlp(filter_text(content))
	    for chunk in doc.noun_chunks:
	        pages = index.setdefault(chunk.text, {}).setdefault(file_name, {})
	        # Start from zero the first time this page is seen for the chunk.
	        pages[page_num] = pages.get(page_num, 0) + 1

	index = {}
	# Usage: for every document, extract the text of each page into a list,
	# then call the functions above once per page, passing this index (empty
	# at the start), the page's text, the file name of the document, and the
	# page number. Each call adds that page's counts, so the index is built
	# up incrementally.