Last active
April 15, 2021 10:17
-
-
Save kristjan-eljand/4d19bedeb54fadf1fca4de6377c00b38 to your computer and use it in GitHub Desktop.
Named Entity Recognition using Estonian text as an input and pre-trained English models for NER
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Create the Named Entity Recognition pipeline.
#    No model is named here, so the library's default model for the
#    "ner" task is used.
ner = pipeline(task="ner")
# The pipeline reports each token's class as an encoded tag: "O" for
# "not a name", otherwise a "B-"/"I-" prefix followed by the entity type.
# This table maps every tag to a readable Estonian description.
# NOTE: the miscellaneous type is spelled "MISC" in the tag names; with the
# shorter "MIS" spelling the lookup would fail for such entities.
classes_est = {
    "O": "Ei ole nimi",
    "B-MISC": "Nime algus kohe pärast teist nimeüksust",
    "I-MISC": "Muu üksus",
    "B-PER": "Inimese nime algus kohe pärast teise inimese nime",
    "I-PER": "Inimene",
    "B-ORG": "Organisatsiooni nime algus kohe pärast teise organisatsiooni nime",
    "I-ORG": "Organisatsioon",
    "B-LOC": "Asukoha nime algus kohe pärast teist asukohta",
    "I-LOC": "Asukoht",
}
# 2. Estonian input text in which the named entities are searched for
sentence_est = """
E-Lab on Eesti Energia IT osakonda kuuluv uurimis- ja arendusüksus.
Hando Sutter on Eesti Energia tegevjuht.
"""

# 3. Translate the Estonian text to English with the pre-defined
#    'translate' function and keep the text of the first result
sentence_eng = translate(
    sentence_est,
    EST_TO_ENG,
)[0]["translation_text"]
print("Est-to-Eng translation of original sentence:\n", sentence_eng)
# 4. Function for NER and reformatting the results
def ner_fun(sentence):
    """Run NER on ``sentence`` and return grouped, readably labelled entities.

    Every token-level prediction produced by the module-level ``ner``
    pipeline has its encoded ``'entity'`` tag replaced by the description
    found in ``classes_est``. The relabelled predictions are then merged
    with ``ner.group_entities`` so that pieces of the same name end up in
    one item.

    Args:
        sentence: Text to analyse; it is passed unchanged to ``ner``.

    Returns:
        The result of ``ner.group_entities``; the calling script reads the
        ``'word'`` and ``'entity_group'`` keys of each item.
    """
    relabelled = [
        # Copy each prediction instead of mutating it, and fall back to the
        # raw tag when it has no entry in the mapping (avoids a KeyError).
        {**ent, 'entity': classes_est.get(ent['entity'], ent['entity'])}
        for ent in ner(sentence)
    ]
    # Group together strings that belong to the same entity
    return ner.group_entities(relabelled)
# Recognise the entities in the translated (English) sentence
output = ner_fun(sentence_eng)

# 5. Show each entity that was found together with its group label
print("Named entities that were found:\n")
for entity in output:
    print("{}: {}".format(entity['word'], entity['entity_group']))

# Output:
# Est-to-Eng translation of original sentence:
# E-Lab is a research and development unit belonging to the IT department of Eesti Energia.
# Hando Sutter is the CEO of Eesti Energia.
#
# Named entities that were found:
# E - Lab: Organisatsioon
# Eesti Energia: Organisatsioon
# Hando Sutter: Inimene
# Eesti Energia: Organisatsioon
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment