Skip to content

Instantly share code, notes, and snippets.

@justindavies
Last active May 19, 2020 00:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save justindavies/2562a6d32190dc2980472753f55fc47e to your computer and use it in GitHub Desktop.
Extract NER
from elasticsearch import Elasticsearch
import spacy
import os
import json
from pymongo import MongoClient
from spacy.pipeline import EntityRuler
import hashlib
import inflection
def set_custom_boundaries(doc):
    """Mark sentence starts after custom boundary characters.

    The default sentencizer misses the semicolon/bullet separators common
    in 10-K risk-factor sections, so the token following any of them is
    explicitly flagged as a sentence start.  Added to the pipeline before
    "ner" so entity spans respect the corrected sentence boundaries.

    Args:
        doc: a spaCy ``Doc`` (any token sequence exposing ``text``, ``i``
            and a writable ``is_sent_start`` works).

    Returns:
        The same ``doc``, mutated in place.
    """
    # Characters treated as sentence terminators in this corpus.
    boundary_chars = {";", "•", "●", "."}
    # Skip the final token: it has no successor to flag.
    for token in doc[:-1]:
        if token.text in boundary_chars:
            doc[token.i + 1].is_sent_start = True
    return doc
# --- External services, configured via environment variables ---
# Elasticsearch holds the filings (index "item_1a", queried below).
es = Elasticsearch(os.environ["ES"])
nlp = spacy.load("en_core_web_sm")
# Custom boundary marking must run before "ner" so that entity extraction
# sees the corrected sentence starts.
nlp.add_pipe(set_custom_boundaries, before="ner")
nlp.add_pipe(nlp.create_pipe('sentencizer'), before="ner")
# MongoDB collection of curated entities; each document carries an
# "entity" string and an "entitytype" tag (consumed in the loop below).
client = MongoClient(os.environ['DB'])
db = client.fundy
training = db.training
# Mongo "entitytype" values mapped onto spaCy entity labels.
_ENTITY_LABELS = {
    "person": "PERSON",
    "org": "ORG",
    "law": "LAW",
    "product": "PRODUCT",
    "gpes": "GPE",
    "woa": "WORK_OF_ART",
    "ignore": "IGNORE",
    "money": "MONEY",
    "fac": "FAC",
}


def build_rules(entities):
    """Convert training documents into EntityRuler patterns.

    Each document must carry an "entitytype" key (see ``_ENTITY_LABELS``)
    and an "entity" string; multi-word entities become one case-insensitive
    token pattern per word.  Documents with an unknown type or a malformed
    "entity" field are skipped.  (The original if-chain reused the previous
    iteration's label for unknown types — a latent mislabeling bug.)

    Args:
        entities: iterable of dicts (e.g. a pymongo cursor).

    Returns:
        list of ``{"label": ..., "pattern": [...]}`` dicts for EntityRuler.
    """
    rules = []
    for ele in entities:
        label = _ENTITY_LABELS.get(ele.get("entitytype"))
        if label is None:
            # Unknown type: skip instead of inheriting the last label.
            continue
        try:
            words = ele["entity"].split()
            if words:
                pattern = [{"LOWER": w.lower()} for w in words]
            else:
                # Whitespace-only entity string: fall back to the raw value.
                pattern = [{"LOWER": ele["entity"].lower()}]
            rules.append({"label": label, "pattern": pattern})
        except (KeyError, AttributeError):
            # Missing or non-string "entity" field: skip this document.
            continue
    return rules


RULES = build_rules(training.find({}))
ner = nlp.get_pipe("ner")
# Register an extra label on the statistical NER component.
ner.add_label("OTHER")
# The rule-based matcher runs after the statistical NER and overwrites its
# entities, so the hand-curated patterns always take precedence.
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(RULES)
nlp.add_pipe(ruler, after="ner")
# nlp.add_pipe(ruler, before="ner")
# nlp.add_pipe(nlp.create_pipe('sentencizer'))
# SHA-224 digests of sentences already emitted, used to de-duplicate the
# boilerplate text that recurs across filings.  A set gives O(1) membership
# (the original list made every check O(n)).  Grows for the process
# lifetime — acceptable for a one-shot extraction script.
hashes = set()


def query_docs(from_page):
    """Fetch one page of 10-K risk sections, run NER, and print sentences.

    Pulls ten documents from the ``item_1a`` index starting at offset
    ``from_page`` (10-K forms dated 2019 or later), splits each risk
    section into sentences, attaches character-offset entity labels
    (entities labelled IGNORE are dropped), and prints every previously
    unseen sentence as a JSON line:
    ``{"text": ..., "labels": [[start, end, label], ...]}`` — labels
    omitted when the sentence has no entities.

    Args:
        from_page: Elasticsearch ``from`` value — a document offset,
            not a page number.
    """
    res = es.search(index="item_1a", body={
        "_source": ["risks", "date", "form"],
        "size": 10,
        "from": from_page,
        "query": {
            "bool": {
                "must": [
                    {"match": {"form": "10-K"}},
                    {"range": {"date": {"gte": "2019", "format": "yyyy"}}},
                ]
            }
        },
    })

    sentences = []
    for hit in res["hits"]["hits"]:
        try:
            if hit["_source"]["form"] != "10-K":
                continue
            # Transliterate to ASCII and collapse newlines before tagging.
            risks = inflection.transliterate(hit["_source"]["risks"])
            docner = nlp(risks.replace("\n", " "))
            for sent in docner.sents:
                # Entity offsets are made sentence-relative.
                labels = [
                    [ent.start_char - ent.sent.start_char,
                     ent.end_char - ent.sent.start_char,
                     ent.label_]
                    for ent in sent.ents
                    if str(ent.label_) != "IGNORE"
                ]
                text = str(sent)
                digest = hashlib.sha224(text.encode()).hexdigest()
                if digest in hashes:
                    continue
                hashes.add(digest)
                # Very short fragments are hashed (so they stay skipped)
                # but never emitted.
                if len(text.strip()) > 10:
                    if labels:
                        sentences.append({"text": text, "labels": labels})
                    else:
                        sentences.append({"text": text})
        except Exception:
            # Deliberate best-effort: a malformed filing (missing fields,
            # bad encoding) is skipped rather than aborting the whole run.
            continue

    for sentence in sentences:
        print(json.dumps(sentence))
# Each call retrieves ten documents, so the Elasticsearch offset must
# advance in steps of ten.  The original stepped by one, re-fetching nine
# of every ten documents per page (the sentence-hash de-dup hid the
# duplication but wasted ~10x the work).
for page in range(999):
    query_docs(page * 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment