@cristiano74
Last active January 31, 2019 06:35
apply_model_ner: a quick-and-dirty way to apply a spaCy NER model to a JSONL file (prodigy.ai)
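For reference, the function below assumes a Prodigy-style JSONL file in which each line has at least a 'text' field and a 'meta' dict containing 'topic', 'industry', 'pos' and 'key' keys. The field names are inferred from the code; the values in this sketch are made up:

# hypothetical input line, matching the fields the code reads
{"text": "Example sentence about cloud storage.", "meta": {"topic": "storage", "industry": "tech", "pos": 1, "key": "cloud storage"}}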
def apply_model_ner(source, spacy_model):
    """Apply a spaCy NER model to a Prodigy JSONL file and return a DataFrame of entities.

    source:      path to a JSONL file, e.g. "./data/T_4_slot_1.jsonl"
    spacy_model: path to a trained spaCy model, e.g. "./model_T_2_1"

    Example: apply_model_ner("./data/T_4_slot_1.jsonl", "./model_T_2_1")
    """
    import copy

    import pandas as pd
    import spacy
    from prodigy.components.loaders import JSONL
    from prodigy.util import set_hashes

    stream = JSONL(source)
    nlp = spacy.load(spacy_model)

    # Run the model over every example and attach the predicted entities
    # as Prodigy-style spans to a copy of the original task.
    tasks = []
    texts = ((eg['text'], eg) for eg in stream)
    for doc, eg in nlp.pipe(texts, as_tuples=True):
        task = copy.deepcopy(eg)
        spans = []
        for ent in doc.ents:
            spans.append({
                'token_start': ent.start,
                'token_end': ent.end - 1,
                'start': ent.start_char,
                'end': ent.end_char,
                'text': ent.text,
                'label': ent.label_,
                'source': spacy_model,
            })
        task['spans'] = spans
        task = set_hashes(task)
        tasks.append(task)

    # Flatten the spans into parallel lists of entity text, label and metadata.
    tokens_flat, labels_flat, meta_flat = [], [], []
    for eg in tasks:
        for span in eg.get('spans', []):
            tokens_flat.append(eg['text'][span['start']:span['end']])
            labels_flat.append(span['label'])
            meta_flat.append(eg['meta'])

    # Pull the individual meta fields out into their own columns.
    topic_flat = [m['topic'] for m in meta_flat]
    industry_flat = [m['industry'] for m in meta_flat]
    pos_flat = [m['pos'] for m in meta_flat]
    keyword_flat = [m['key'] for m in meta_flat]

    New_df = pd.DataFrame({
        'entity': tokens_flat,
        'label': labels_flat,
        'meta': meta_flat,
        'topic': topic_flat,
        'industry': industry_flat,
        'pos': pos_flat,
        'key': keyword_flat,
    })
    # Keep only the first occurrence of each entity string.
    New_df = New_df.drop_duplicates(subset=['entity'], keep="first")
    return New_df
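A minimal usage sketch, using the paths from the docstring (the output filename is hypothetical). The returned DataFrame has one row per unique predicted entity string:

df = apply_model_ner("./data/T_4_slot_1.jsonl", "./model_T_2_1")
print(df[['entity', 'label', 'topic']].head())
df.to_csv("entities_T_4_slot_1.csv", index=False)  # optional: persist the results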