Last active
July 20, 2022 16:43
-
-
Save wesslen/25f8f694ce82934d74912f873785b7a1 to your computer and use it in GitHub Desktop.
Example: using a tagtog TSV synonym dictionary as an entity sub-type in spaCy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 | Bulbasaur | Fushigidane | |
---|---|---|---|
2 | Ivysaur | Fushigisou | |
3 | Venusaur | Fushigibana | |
4 | Charmander | Hitokage | |
5 | Charmeleon | Lizardo |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Baseline pipeline: a rule-based entity recognizer (a trained NER model would
# serve equally well) that tags exactly five Pokemon names as POKEMON.
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# One single-token pattern per character, compared against the lower-cased
# token text so the match is case-insensitive.
patterns = [
    {"label": "POKEMON", "pattern": [{"LOWER": name}]}
    for name in ("bulbasaur", "ivysaur", "venusaur", "charmander", "charmeleon")
]
ruler.add_patterns(patterns)

doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")
print([(ent.text, ent.label_) for ent in doc.ents])

# Next, load a `.tsv` file holding a synonym for each Pokemon character name.
import csv

# Five example rows taken from
# https://raw.githubusercontent.com/tagtogorg/tagtog-doc/master/assets/dictionaries/pokemondict.tsv
file_name = "pokemondict.tsv"
def convert_tsv_dict(file_name):
    """Load a tab-separated dictionary file into a name -> synonym mapping.

    Each row is expected to have at least three tab-separated columns. The
    second column becomes the key and the third column the value, e.g. the
    row ``1<TAB>Bulbasaur<TAB>Fushigidane`` yields
    ``{"Bulbasaur": "Fushigidane"}``. If a key occurs more than once, the
    last row wins.

    Args:
        file_name: Path of the ``.tsv`` file to read.

    Returns:
        A dict mapping column-2 values to column-3 values.

    Raises:
        OSError: If the file cannot be opened.
    """
    types = {}
    # newline="" is what the csv module asks for so that embedded newlines in
    # quoted fields are handled correctly; utf-8 keeps non-ASCII names intact
    # regardless of the platform's default encoding.
    with open(file_name, encoding="utf-8", newline="") as file:
        rd = csv.reader(file, delimiter="\t", quotechar='"')
        for row in rd:
            # Blank lines come back as [] and truncated lines have fewer than
            # three fields; skip them instead of failing with IndexError.
            if len(row) < 3:
                continue
            types[row[1]] = row[2]
    return types
# Name -> synonym lookup table, loaded once from the TSV file.
POKEMON_SUBTYPES = convert_tsv_dict(file_name)

# We can now add a sub-type attribute on entities that carries the synonym.
from spacy.tokens import Span

# Register the custom `span._.pokemon_subtypes` extension (None until set).
# Span.set_extension raises ValueError if the name is already registered, so
# check first; this keeps the script re-runnable in a live session/notebook.
if not Span.has_extension("pokemon_subtypes"):
    Span.set_extension("pokemon_subtypes", default=None)

from spacy.tokens import Doc
from spacy.language import Language
@Language.component("pokemon_subtypes")
def find_dates(doc: Doc) -> Doc:
    """Attach the synonym from POKEMON_SUBTYPES to every POKEMON entity.

    Registered as the pipeline component ``"pokemon_subtypes"``. For each
    entity in ``doc.ents`` labelled ``POKEMON`` the custom attribute
    ``ent._.pokemon_subtypes`` is set to the matching value from
    ``POKEMON_SUBTYPES``, or ``None`` when there is no match. Entities with
    any other label are left untouched.

    NOTE: the function name does not describe what it does; it is kept
    unchanged so existing references to it keep working.

    Args:
        doc: The document whose entities should be annotated.

    Returns:
        The same ``doc``, as required of a pipeline component.
    """
    for ent in doc.ents:
        if ent.label_ != "POKEMON":
            continue
        subtype = POKEMON_SUBTYPES.get(ent.text)
        if subtype is None:
            # The entity patterns match on LOWER, so the entity text may be
            # cased differently from the dictionary key ("bulbasaur" vs
            # "Bulbasaur"). Fall back to a case-insensitive comparison.
            wanted = ent.text.lower()
            subtype = next(
                (
                    value
                    for key, value in POKEMON_SUBTYPES.items()
                    if key.lower() == wanted
                ),
                None,
            )
        ent._.pokemon_subtypes = subtype
    return doc
# Add the new component to the end of the pipeline. add_pipe raises if a
# component with this name is already present, so only add it when missing;
# this makes the snippet safe to re-run without restarting the kernel.
if "pokemon_subtypes" not in nlp.pipe_names:
    nlp.add_pipe("pokemon_subtypes")

doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")
print([(ent.text, ent.label_, ent._.pokemon_subtypes) for ent in doc.ents])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment