Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract NER
from elasticsearch import Elasticsearch
import spacy
import os
import json
from pymongo import MongoClient
from spacy.pipeline import EntityRuler
import hashlib
import inflection
def set_custom_boundaries(doc):
    """spaCy pipeline component: mark the token following ';', '•', '●'
    or '.' as a sentence start, so bullet-style 10-K risk sections are
    split into individual sentences.

    doc -- spacy.tokens.Doc; returned unchanged except for the
    ``is_sent_start`` flags it sets.
    """
    # The original repeated the identical assignment in four separate
    # if-statements; a single membership test is equivalent.
    for token in doc[:-1]:
        if token.text in {";", "•", "●", "."}:
            doc[token.i + 1].is_sent_start = True
    return doc
# Elasticsearch client; connection string is taken from the ES env var.
es = Elasticsearch(os.environ["ES"])
# Small English model.  Both custom components are inserted *before* the
# "ner" component so entity spans respect the adjusted sentence starts.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(set_custom_boundaries, before="ner")
nlp.add_pipe(nlp.create_pipe('sentencizer'), before="ner")
# MongoDB connection (DB env var) holding hand-labelled entity rules in
# database "fundy", collection "training".
client = MongoClient(os.environ['DB'])
db = client.fundy
training = db.training
# Map the Mongo "entitytype" values onto spaCy entity labels.
_ENTITY_LABELS = {
    "person": "PERSON",
    "org": "ORG",
    "law": "LAW",
    "product": "PRODUCT",
    "gpes": "GPE",
    "woa": "WORK_OF_ART",
    "ignore": "IGNORE",
    "money": "MONEY",
    "fac": "FAC",
}

# EntityRuler patterns built from the training collection.
RULES = []
for ele in training.find({}):
    # Bug fix: the original if-chain never reset `entitytype`, so a row
    # with an unrecognised type silently reused the label of the previous
    # row.  Unknown types are now skipped explicitly.
    entitytype = _ENTITY_LABELS.get(ele["entitytype"])
    if entitytype is None:
        continue
    try:
        tokens = ele["entity"].split()
        if tokens:
            # Multi-token entity: one case-insensitive token pattern each.
            patterns = [{"LOWER": t.lower()} for t in tokens]
        else:
            # Whitespace-only / empty entity string: single lowered pattern.
            patterns = [{"LOWER": ele["entity"].lower()}]
        RULES.append({"label": entitytype, "pattern": patterns})
    except Exception:
        # Best-effort: skip malformed rows (e.g. missing "entity" field).
        continue
# Extend the statistical NER with an extra label and layer the rule-based
# EntityRuler on top of it.
ner = nlp.get_pipe("ner")
ner.add_label("OTHER")
# overwrite_ents=True: rule matches replace overlapping statistical spans.
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(RULES)
# Running the ruler *after* "ner" lets the hand-written rules win.
nlp.add_pipe(ruler, after="ner")
# nlp.add_pipe(ruler, before="ner")
# nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Hashes of sentences already emitted, for de-duplication across pages.
# A set gives O(1) membership tests (was a list: O(n) per sentence).
hashes = set()


def query_docs(from_page):
    """Fetch one page of 10-K "risks" sections from Elasticsearch, run the
    NER pipeline over them, and print one JSON object per unique sentence.

    from_page -- value passed to Elasticsearch's "from" parameter (a
                 document offset into the result set; page size is 10).

    Side effects: prints JSON lines to stdout and records sentence hashes
    in the module-level ``hashes`` set.
    """
    res = es.search(index="item_1a", body={
        "_source": ["risks", "date", "form"],
        "size": 10,
        "from": from_page,
        "query": {
            "bool": {
                "must": [
                    {"match": {"form": "10-K"}},
                    {"range": {"date": {"gte": "2019", "format": "yyyy"}}},
                ]
            }
        },
    })
    sentences = []
    for doc in res["hits"]["hits"]:
        try:
            # Redundant with the query filter, but kept as a guard.
            if doc["_source"]["form"] == "10-K":
                # Transliterate to ASCII and flatten newlines before NER.
                raw = inflection.transliterate(doc["_source"]["risks"])
                docner = nlp(raw.replace("\n", " "))
                for sent in docner.sents:
                    labels = []
                    for ent in sent.ents:
                        if str(ent.label_) != "IGNORE":
                            # Character offsets relative to sentence start.
                            labels.append([
                                ent.start_char - ent.sent.start_char,
                                ent.end_char - ent.sent.start_char,
                                ent.label_,
                            ])
                    text = str(sent)
                    digest = hashlib.sha224(text.encode()).hexdigest()
                    # Emit each sufficiently long sentence only once.
                    if digest not in hashes and len(text.strip()) > 10:
                        if labels:
                            sentences.append({"text": text, "labels": labels})
                        else:
                            sentences.append({"text": text})
                        hashes.add(digest)
        except Exception:
            # Best-effort: skip any document that fails parsing/NER.
            continue
    for sentence in sentences:
        print(json.dumps(sentence))
# Page through the index 10 documents at a time.  Bug fix: the original
# passed the bare loop index to Elasticsearch's "from" parameter, which is
# a *document* offset, so consecutive requests overlapped by 9 documents
# (masked only by the hash de-duplication).  Multiplying by the page size
# makes each call fetch a fresh page.
for page in range(0, 999):
    query_docs(page * 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment