Skip to content

Instantly share code, notes, and snippets.

@wesslen
Last active July 20, 2022 16:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wesslen/25f8f694ce82934d74912f873785b7a1 to your computer and use it in GitHub Desktop.
Save wesslen/25f8f694ce82934d74912f873785b7a1 to your computer and use it in GitHub Desktop.
tagtog tsv synonym sub_type in spacy example
1 Bulbasaur Fushigidane
2 Ivysaur Fushigisou
3 Venusaur Fushigibana
4 Charmander Hitokage
5 Charmeleon Lizardo
# Assume we already have a rule-based entity recognizer built from token patterns (it could equally be a trained NER model). It labels only five Pokemon character names as POKEMON.
from spacy.lang.en import English
# Start from a blank English pipeline and attach an entity ruler to it.
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# One single-token pattern per character name, each matched against the
# token's LOWER attribute and labelled POKEMON.
patterns = [
    {"label": "POKEMON", "pattern": [{"LOWER": name}]}
    for name in ("bulbasaur", "ivysaur", "venusaur", "charmander", "charmeleon")
]
ruler.add_patterns(patterns)

# Run the pipeline on a sample sentence and print (text, label) for each entity.
doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")
print([(span.text, span.label_) for span in doc.ents])
# Let's now load a `.tsv` file with synonyms for each of the pokemon character names.
import csv
# Five example rows taken from https://raw.githubusercontent.com/tagtogorg/tagtog-doc/master/assets/dictionaries/pokemondict.tsv
# Path of the tab-separated dictionary file that is loaded further down.
file_name = "pokemondict.tsv"
def convert_tsv_dict(file_name):
    """
    Convert a tab-separated dictionary file into a ``dict``.

    Every usable row contributes one entry that maps the row's second
    column to its third column; the first column is ignored. When the same
    second-column value occurs more than once, the last row wins.

    Args:
        file_name: Path of the TSV file to read.

    Returns:
        A ``dict`` mapping second-column values to third-column values.
        Blank lines and rows with fewer than three columns are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" lets the csv module handle line endings itself, and the
    # explicit encoding keeps the result independent of the platform default.
    with open(file_name, encoding="utf-8", newline="") as file:
        rows = csv.reader(file, delimiter="\t", quotechar='"')
        # Skip blank or truncated rows instead of failing with IndexError.
        return {row[1]: row[2] for row in rows if len(row) >= 3}
# Module-level lookup table built from the TSV file: keys come from the
# file's second column and values from its third column.
POKEMON_SUBTYPES = convert_tsv_dict(file_name)
# We can now add a custom attribute to each POKEMON entity that stores the synonym looked up in this dictionary.
from spacy.tokens import Span
# Register the custom ``span._.pokemon_subtypes`` attribute (default ``None``).
# The guard makes this safe to re-run in the same session: registering an
# extension name that already exists raises unless ``force=True`` is passed.
if not Span.has_extension("pokemon_subtypes"):
    Span.set_extension("pokemon_subtypes", default=None)
from spacy.tokens import Doc
from spacy.language import Language
@Language.component("pokemon_subtypes")
def find_dates(doc: Doc) -> Doc:
    """Store the dictionary synonym on every POKEMON entity of ``doc``.

    For each entity labelled ``POKEMON`` the entity text is looked up in
    ``POKEMON_SUBTYPES`` and the result is written to
    ``ent._.pokemon_subtypes``. The exact text is tried first; if that
    misses, a case-insensitive lookup is attempted, because the entity
    patterns match on the lower-cased token and can therefore tag text
    whose casing differs from the dictionary key. Entities without any
    match get ``None``.

    NOTE(review): the function name does not describe what it does; it is
    kept unchanged because the pipeline refers to the component by its
    registered name, "pokemon_subtypes".

    Args:
        doc: The document whose entities are annotated.

    Returns:
        The same ``doc``, as pipeline components are expected to do.
    """
    folded_lookup = None  # built lazily, only if an exact lookup misses
    for ent in doc.ents:
        if ent.label_ != "POKEMON":
            continue
        subtype = POKEMON_SUBTYPES.get(ent.text)
        if subtype is None:
            if folded_lookup is None:
                folded_lookup = {
                    name.casefold(): synonym
                    for name, synonym in POKEMON_SUBTYPES.items()
                }
            subtype = folded_lookup.get(ent.text.casefold())
        ent._.pokemon_subtypes = subtype
    return doc
# Add the new component to the pipeline.
# The guard makes this safe to re-run without restarting the kernel: adding a
# component whose name is already in the pipeline raises an error.
if "pokemon_subtypes" not in nlp.pipe_names:
    nlp.add_pipe("pokemon_subtypes")

# Re-run the sample sentence; each entity now also carries its synonym.
doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")
print([(ent.text, ent.label_, ent._.pokemon_subtypes) for ent in doc.ents])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment