# justindavies/extract_ners.py

## extract_ners.py
from elasticsearch import Elasticsearch
import spacy
import os
import json
from pymongo import MongoClient
from spacy.pipeline import EntityRuler
import hashlib
import inflection

def set_custom_boundaries(doc):
    """Mark the token after each ";", "•", "●" or "." as a sentence start.

    The last token of ``doc`` is never inspected, so the ``+ 1`` lookup always
    stays inside the document.  ``doc`` is modified in place and returned,
    which is the shape a pipeline component needs.
    """
    boundary_marks = (";", "•", "●", ".")
    for tok in doc[:-1]:
        if tok.text in boundary_marks:
            doc[tok.i + 1].is_sent_start = True

    return doc


# Elasticsearch client; the connection target comes from the ES environment
# variable (os.environ[...] raises KeyError when it is not set).
es = Elasticsearch(os.environ["ES"])

# Base pipeline: spaCy's small English model.
nlp = spacy.load("en_core_web_sm")

# Insert the custom boundary rule, then a sentencizer, ahead of the "ner"
# component.
# NOTE(review): both are positioned relative to "ner" only.  If the model's
# parser runs before them, spaCy 2.x may refuse writes to is_sent_start on an
# already-parsed doc; spaCy's own example registers such a component with
# before="parser" -- confirm the intended position.
nlp.add_pipe(set_custom_boundaries, before="ner")
nlp.add_pipe(nlp.create_pipe('sentencizer'), before="ner")

# MongoDB client; the connection string comes from the DB environment variable.
client = MongoClient(os.environ['DB'])
db = client.fundy

# Collection read below to build the EntityRuler patterns.
training = db.training


# Short entity-type codes stored in the training collection, mapped to the
# entity labels used in the EntityRuler patterns.
ENTITY_LABELS = {
    "person": "PERSON",
    "org": "ORG",
    "law": "LAW",
    "product": "PRODUCT",
    "gpes": "GPE",
    "woa": "WORK_OF_ART",
    "ignore": "IGNORE",
    "money": "MONEY",
    "fac": "FAC",
}


def _build_rule(record):
    """Turn one training record into an EntityRuler pattern, or ``None``.

    Args:
        record: Mapping with an ``entitytype`` code and an ``entity`` string.

    Returns:
        ``{"label": ..., "pattern": [{"LOWER": word}, ...]}`` with one
        lower-cased token pattern per whitespace-separated word, or ``None``
        when the record cannot produce a usable rule: unknown or missing
        ``entitytype``, or an ``entity`` that is missing, not a string, or
        blank.
    """
    code = record.get("entitytype")
    # Unknown codes are skipped explicitly.  Previously they silently reused
    # the label left over from the preceding record.
    label = ENTITY_LABELS.get(code) if isinstance(code, str) else None
    entity = record.get("entity")
    if label is None or not isinstance(entity, str):
        return None

    token_patterns = [{"LOWER": word.lower()} for word in entity.split()]
    if not token_patterns:
        # A blank entity names nothing; emitting a pattern for it could only
        # match whitespace tokens.
        return None
    return {"label": label, "pattern": token_patterns}


# Patterns handed to the EntityRuler, in the order the collection returns them.
RULES = []
for ele in training.find({}):
    rule = _build_rule(ele)
    if rule is not None:
        RULES.append(rule)


# Fetch the statistical NER component and register an extra "OTHER" label.
# NOTE(review): nothing in this file trains or emits "OTHER" -- confirm it is
# still needed.
ner = nlp.get_pipe("ner")

ner.add_label("OTHER")

# Rule-based matcher fed with the patterns collected in RULES.
# overwrite_ents=True asks the ruler to replace overlapping entities that are
# already set on the doc.
ruler = EntityRuler(nlp, overwrite_ents=True)

ruler.add_patterns(RULES)
# Run the ruler after the statistical NER component.
nlp.add_pipe(ruler, after="ner")

# Alternative placement kept for reference (ruler ahead of the NER component):
# nlp.add_pipe(ruler, before="ner")

# Alternative kept for reference (sentencizer appended at the end):
# nlp.add_pipe(nlp.create_pipe('sentencizer'))

# SHA-224 digests of every sentence seen so far.  Shared across query_docs
# calls so a sentence repeated in several documents is printed only once; a
# set keeps the membership test constant-time.
hashes = set()


def query_docs(from_page, page_size=10):
    """Print the new sentences found in one page of 10-K risk sections.

    Searches the ``item_1a`` index for documents whose ``form`` matches
    "10-K" and whose ``date`` is 2019 or later, runs each hit's ``risks``
    text through ``nlp`` and prints one JSON object per sentence that is
    longer than 10 characters (after stripping) and has not been seen before:
    ``{"text": ...}``, plus ``"labels": [[start, end, label], ...]`` when the
    sentence contains entities other than ``IGNORE``.  Label offsets are
    character positions relative to the start of the sentence text.

    Args:
        from_page: Zero-based page index.
        page_size: Number of hits per page (defaults to the original 10).
    """
    import sys  # local import: only used to report skipped hits

    res = es.search(index="item_1a", body={
        "_source": ["risks", "date", "form"],
        "size": page_size,
        # "from" is a hit offset, not a page number.  Passing the page index
        # straight through made consecutive pages overlap by all but one hit.
        # NOTE(review): Elasticsearch limits from + size (10,000 by default).
        "from": from_page * page_size,
        "query": {
            "bool": {
                "must": [
                    {"match": {"form": "10-K"}},
                    {"range": {"date": {"gte": "2019", "format": "yyyy"}}},
                ]
            }
        },
    })

    sentences = []

    for hit in res["hits"]["hits"]:
        try:
            source = hit["_source"]
            # The match query is followed by an exact comparison, so only
            # hits whose form is exactly "10-K" are processed.
            if source["form"] != "10-K":
                continue

            flattened = inflection.transliterate(source["risks"]).replace("\n", " ")
            for sent in nlp(flattened).sents:
                text = str(sent)
                digest = hashlib.sha224(text.encode()).hexdigest()
                if digest in hashes:
                    continue

                if len(text.strip()) > 10:
                    # Offsets are taken relative to `sent`, the span whose
                    # text is emitted, so they always index into `text`.
                    labels = [
                        [
                            ent.start_char - sent.start_char,
                            ent.end_char - sent.start_char,
                            ent.label_,
                        ]
                        for ent in sent.ents
                        if ent.label_ != "IGNORE"
                    ]
                    record = {"text": text}
                    if labels:
                        record["labels"] = labels
                    sentences.append(record)

                hashes.add(digest)
        except Exception as exc:
            # Best effort: one bad hit must not stop the run, but say so on
            # stderr instead of dropping it silently (stdout stays pure JSON).
            print("skipping hit %s: %r" % (hit.get("_id"), exc), file=sys.stderr)
            continue

    for sentence in sentences:
        print(json.dumps(sentence))

# Request pages 0 through 998 in order; query_docs prints its results as it goes.
for x in range(999):
    query_docs(x)
	from elasticsearch import Elasticsearch
	import spacy
	import os
	import json
	from pymongo import MongoClient
	from spacy.pipeline import EntityRuler
	import hashlib
	import inflection

	def set_custom_boundaries(doc):
		"""Mark the token after each ";", "•", "●" or "." as a sentence start.

		The last token of ``doc`` is never inspected, so the ``+ 1`` lookup
		always stays inside the document.  ``doc`` is modified in place and
		returned.
		"""
		boundary_marks = (";", "•", "●", ".")
		for tok in doc[:-1]:
			if tok.text in boundary_marks:
				doc[tok.i + 1].is_sent_start = True

		return doc


	# Elasticsearch client; the connection target comes from the ES environment
	# variable (os.environ[...] raises KeyError when it is not set).
	es = Elasticsearch(os.environ["ES"])

	# Base pipeline: spaCy's small English model.
	nlp = spacy.load("en_core_web_sm")

	# Insert the custom boundary rule, then a sentencizer, ahead of "ner".
	# NOTE(review): if the model's parser runs before these components, spaCy
	# 2.x may refuse writes to is_sent_start -- confirm the intended position.
	nlp.add_pipe(set_custom_boundaries, before="ner")
	nlp.add_pipe(nlp.create_pipe('sentencizer'), before="ner")

	# MongoDB client; the connection string comes from the DB environment variable.
	client = MongoClient(os.environ['DB'])
	db = client.fundy

	# Collection read below to build the EntityRuler patterns.
	training = db.training


	# Short entity-type codes stored in the training collection, mapped to the
	# entity labels used in the EntityRuler patterns.
	ENTITY_LABELS = {
		"person": "PERSON",
		"org": "ORG",
		"law": "LAW",
		"product": "PRODUCT",
		"gpes": "GPE",
		"woa": "WORK_OF_ART",
		"ignore": "IGNORE",
		"money": "MONEY",
		"fac": "FAC",
	}


	def _build_rule(record):
		"""Turn one training record into an EntityRuler pattern, or ``None``.

		Args:
			record: Mapping with an ``entitytype`` code and an ``entity`` string.

		Returns:
			``{"label": ..., "pattern": [{"LOWER": word}, ...]}`` with one
			lower-cased token pattern per whitespace-separated word, or
			``None`` when the ``entitytype`` is unknown or missing, or the
			``entity`` is missing, not a string, or blank.
		"""
		code = record.get("entitytype")
		# Unknown codes are skipped explicitly rather than inheriting the
		# label of the preceding record.
		label = ENTITY_LABELS.get(code) if isinstance(code, str) else None
		entity = record.get("entity")
		if label is None or not isinstance(entity, str):
			return None

		token_patterns = [{"LOWER": word.lower()} for word in entity.split()]
		if not token_patterns:
			# A blank entity names nothing; skip it.
			return None
		return {"label": label, "pattern": token_patterns}


	# Patterns handed to the EntityRuler, in the order the collection returns them.
	RULES = []
	for ele in training.find({}):
		rule = _build_rule(ele)
		if rule is not None:
			RULES.append(rule)



	# Fetch the statistical NER component and register an extra "OTHER" label.
	# NOTE(review): nothing in this file trains or emits "OTHER" -- confirm it
	# is still needed.
	ner = nlp.get_pipe("ner")

	ner.add_label("OTHER")

	# Rule-based matcher fed with the patterns collected in RULES.
	# overwrite_ents=True asks the ruler to replace overlapping entities that
	# are already set on the doc.
	ruler = EntityRuler(nlp, overwrite_ents=True)

	ruler.add_patterns(RULES)
	# Run the ruler after the statistical NER component.
	nlp.add_pipe(ruler, after="ner")

	# Alternative placement kept for reference (ruler ahead of the NER component):
	# nlp.add_pipe(ruler, before="ner")

	# Alternative kept for reference (sentencizer appended at the end):
	# nlp.add_pipe(nlp.create_pipe('sentencizer'))

	# SHA-224 digests of every sentence seen so far, shared across query_docs
	# calls so a repeated sentence is printed only once.
	hashes = set()


	def query_docs(from_page, page_size=10):
		"""Print the new sentences found in one page of 10-K risk sections.

		Searches the ``item_1a`` index for documents whose ``form`` matches
		"10-K" and whose ``date`` is 2019 or later, runs each hit's ``risks``
		text through ``nlp`` and prints one JSON object per sentence that is
		longer than 10 characters (after stripping) and has not been seen
		before: ``{"text": ...}``, plus ``"labels": [[start, end, label], ...]``
		when the sentence contains entities other than ``IGNORE``.  Label
		offsets are character positions relative to the sentence text.

		Args:
			from_page: Zero-based page index.
			page_size: Number of hits per page (defaults to the original 10).
		"""
		import sys  # local import: only used to report skipped hits

		res = es.search(index="item_1a", body={
			"_source": ["risks", "date", "form"],
			"size": page_size,
			# "from" is a hit offset, not a page number; scale by the page size
			# so consecutive pages do not overlap.
			# NOTE(review): Elasticsearch limits from + size (10,000 by default).
			"from": from_page * page_size,
			"query": {
				"bool": {
					"must": [
						{"match": {"form": "10-K"}},
						{"range": {"date": {"gte": "2019", "format": "yyyy"}}},
					]
				}
			},
		})

		sentences = []

		for hit in res["hits"]["hits"]:
			try:
				source = hit["_source"]
				# Only hits whose form is exactly "10-K" are processed.
				if source["form"] != "10-K":
					continue

				flattened = inflection.transliterate(source["risks"]).replace("\n", " ")
				for sent in nlp(flattened).sents:
					text = str(sent)
					digest = hashlib.sha224(text.encode()).hexdigest()
					if digest in hashes:
						continue

					if len(text.strip()) > 10:
						# Offsets are relative to `sent`, the span whose text
						# is emitted, so they always index into `text`.
						labels = [
							[
								ent.start_char - sent.start_char,
								ent.end_char - sent.start_char,
								ent.label_,
							]
							for ent in sent.ents
							if ent.label_ != "IGNORE"
						]
						record = {"text": text}
						if labels:
							record["labels"] = labels
						sentences.append(record)

					hashes.add(digest)
			except Exception as exc:
				# Best effort: skip the bad hit, but report it on stderr so
				# stdout stays pure JSON.
				print("skipping hit %s: %r" % (hit.get("_id"), exc), file=sys.stderr)
				continue

		for sentence in sentences:
			print(json.dumps(sentence))

	# Request pages 0 through 998 in order; query_docs prints its results as it goes.
	for x in range(0, 999):
		query_docs(x)