wesslen/pokemondict.tsv

## pokemondict.tsv

          
1	Bulbasaur	Fushigidane
2	Ivysaur	Fushigisou
3	Venusaur	Fushigibana
4	Charmander	Hitokage
5	Charmeleon	Lizardo

## spacy_synonym_subtype.py
# Assume we have an existing pattern matching rule-based entity (could also be a trained NER). This entity only identifies five different Pokemon characters as POKEMON.

from spacy.lang.en import English

# Blank English pipeline whose only component (so far) is a rule-based entity ruler.
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# One single-token pattern per known Pokemon; LOWER compares against the
# lowercased token text, so the match ignores capitalisation.
patterns = [
    {"label": "POKEMON", "pattern": [{"LOWER": name}]}
    for name in ("bulbasaur", "ivysaur", "venusaur", "charmander", "charmeleon")
]
ruler.add_patterns(patterns)

# Quick check: list every entity the ruler finds in a sample sentence.
doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")
print([(span.text, span.label_) for span in doc.ents])

# Let's now load a `.tsv` file with synonyms for each of the pokemon character names.

import csv

# Five example rows taken from https://raw.githubusercontent.com/tagtogorg/tagtog-doc/master/assets/dictionaries/pokemondict.tsv
# Path of the tab-separated synonym dictionary that is loaded further down.
file_name = "pokemondict.tsv"

def convert_tsv_dict(file_name):
    """
    Load a tab-separated dictionary file into a ``{name: synonym}`` mapping.

    Each usable row is expected to look like ``<id>\t<name>\t<synonym>``;
    the second column becomes the key and the third column the value.
    Rows with fewer than three columns (blank lines, truncated rows) are
    skipped instead of raising ``IndexError``.  If a name occurs more than
    once, the last row wins.

    Args:
        file_name: Path of the ``.tsv`` file to read.

    Returns:
        dict mapping the value of column 2 to the value of column 3.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" is what the csv module requires so it can handle line endings
    # itself; the explicit encoding keeps the result independent of the
    # platform's locale default.
    with open(file_name, newline="", encoding="utf-8") as file:
        rd = csv.reader(file, delimiter="\t", quotechar='"')
        # csv.reader yields [] for an empty line, so a length check covers
        # both blank and truncated rows.
        return {row[1]: row[2] for row in rd if len(row) >= 3}

# Lookup table: Pokemon name (2nd TSV column) -> synonym (3rd TSV column),
# built once when this script runs.
POKEMON_SUBTYPES = convert_tsv_dict(file_name)

# We can now add a new sub-type entity that uses it as a synonym replacement.

from spacy.tokens import Span

# Register the global span._.pokemon_subtypes extension (None until a pipeline
# component fills it in). set_extension raises if the name is already
# registered, so check first to keep this safe to re-run in the same session.
if not Span.has_extension('pokemon_subtypes'):
    Span.set_extension('pokemon_subtypes', default=None)

from spacy.tokens import Doc
from spacy.language import Language

@Language.component("pokemon_subtypes")
def find_dates(doc: Doc) -> Doc:
    """
    Attach the synonym from POKEMON_SUBTYPES to every POKEMON entity.

    Registered in the pipeline under the name "pokemon_subtypes" (the Python
    function name is kept as-is for compatibility).  For each entity labelled
    POKEMON, ``ent._.pokemon_subtypes`` is set to the matching synonym, or to
    ``None`` when the name is not in the table.

    The lookup tries the exact entity text first and then falls back to a
    case-insensitive comparison, so an entity such as "bulbasaur" that was
    matched by a lowercase-based pattern still resolves against a table whose
    keys are capitalised.

    Args:
        doc: The document whose entities have already been set.

    Returns:
        The same ``doc``, with the extension filled in on POKEMON entities.
    """
    for ent in doc.ents:
        if ent.label_ != "POKEMON":
            continue
        subtype = POKEMON_SUBTYPES.get(ent.text)
        if subtype is None:
            # Exact key missed: retry ignoring case (linear scan, only on a miss).
            wanted = ent.text.lower()
            subtype = next(
                (syn for name, syn in POKEMON_SUBTYPES.items() if name.lower() == wanted),
                None,
            )
        ent._.pokemon_subtypes = subtype
    return doc

# Append the new component to the pipeline so it runs after the entity ruler.
# add_pipe raises if a component with this name is already present, so check
# pipe_names first; this makes the step safe to re-run without restarting the kernel.
if "pokemon_subtypes" not in nlp.pipe_names:
    nlp.add_pipe("pokemon_subtypes")

doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")

# Each entity is now reported together with the synonym attached by the component.
print([(ent.text, ent.label_, ent._.pokemon_subtypes) for ent in doc.ents])
1	Bulbasaur	Fushigidane
2	Ivysaur	Fushigisou
3	Venusaur	Fushigibana
4	Charmander	Hitokage
5	Charmeleon	Lizardo
	# Assume we have an existing pattern matching rule-based entity (could also be a trained NER). This entity only identifies five different Pokemon characters as POKEMON.

	from spacy.lang.en import English

	# Blank English pipeline whose only component (so far) is a rule-based entity ruler.
	nlp = English()
	ruler = nlp.add_pipe("entity_ruler")

	# One single-token pattern per known Pokemon; LOWER compares against the
	# lowercased token text, so the match ignores capitalisation.
	patterns = [
	    {"label": "POKEMON", "pattern": [{"LOWER": name}]}
	    for name in ("bulbasaur", "ivysaur", "venusaur", "charmander", "charmeleon")
	]
	ruler.add_patterns(patterns)

	# Quick check: list every entity the ruler finds in a sample sentence.
	doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")
	print([(span.text, span.label_) for span in doc.ents])

	# Let's now load a `.tsv` file with synonyms for each of the pokemon character names.

	import csv

	# Five example rows taken from https://raw.githubusercontent.com/tagtogorg/tagtog-doc/master/assets/dictionaries/pokemondict.tsv
	# Path of the tab-separated synonym dictionary that is loaded further down.
	file_name = "pokemondict.tsv"

	def convert_tsv_dict(file_name):
	    """
	    Load a tab-separated dictionary file into a ``{name: synonym}`` mapping.

	    Each usable row is expected to look like ``<id>\t<name>\t<synonym>``;
	    the second column becomes the key and the third column the value.
	    Rows with fewer than three columns (blank lines, truncated rows) are
	    skipped instead of raising ``IndexError``.  If a name occurs more than
	    once, the last row wins.

	    Args:
	        file_name: Path of the ``.tsv`` file to read.

	    Returns:
	        dict mapping the value of column 2 to the value of column 3.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    # newline="" is what the csv module requires so it can handle line endings
	    # itself; the explicit encoding keeps the result independent of the
	    # platform's locale default.
	    with open(file_name, newline="", encoding="utf-8") as file:
	        rd = csv.reader(file, delimiter="\t", quotechar='"')
	        # csv.reader yields [] for an empty line, so a length check covers
	        # both blank and truncated rows.
	        return {row[1]: row[2] for row in rd if len(row) >= 3}

	# Lookup table: Pokemon name (2nd TSV column) -> synonym (3rd TSV column),
	# built once when this script runs.
	POKEMON_SUBTYPES = convert_tsv_dict(file_name)

	# We can now add a new sub-type entity that uses it as a synonym replacement.

	from spacy.tokens import Span

	# Register the global span._.pokemon_subtypes extension (None until a pipeline
	# component fills it in). set_extension raises if the name is already
	# registered, so check first to keep this safe to re-run in the same session.
	if not Span.has_extension('pokemon_subtypes'):
	    Span.set_extension('pokemon_subtypes', default=None)

	from spacy.tokens import Doc
	from spacy.language import Language

	@Language.component("pokemon_subtypes")
	def find_dates(doc: Doc) -> Doc:
	    """
	    Attach the synonym from POKEMON_SUBTYPES to every POKEMON entity.

	    Registered in the pipeline under the name "pokemon_subtypes" (the Python
	    function name is kept as-is for compatibility).  For each entity labelled
	    POKEMON, ``ent._.pokemon_subtypes`` is set to the matching synonym, or to
	    ``None`` when the name is not in the table.

	    The lookup tries the exact entity text first and then falls back to a
	    case-insensitive comparison, so an entity such as "bulbasaur" that was
	    matched by a lowercase-based pattern still resolves against a table whose
	    keys are capitalised.

	    Args:
	        doc: The document whose entities have already been set.

	    Returns:
	        The same ``doc``, with the extension filled in on POKEMON entities.
	    """
	    for ent in doc.ents:
	        if ent.label_ != "POKEMON":
	            continue
	        subtype = POKEMON_SUBTYPES.get(ent.text)
	        if subtype is None:
	            # Exact key missed: retry ignoring case (linear scan, only on a miss).
	            wanted = ent.text.lower()
	            subtype = next(
	                (syn for name, syn in POKEMON_SUBTYPES.items() if name.lower() == wanted),
	                None,
	            )
	        ent._.pokemon_subtypes = subtype
	    return doc

	# Append the new component to the pipeline so it runs after the entity ruler.
	# add_pipe raises if a component with this name is already present, so check
	# pipe_names first; this makes the step safe to re-run without restarting the kernel.
	if "pokemon_subtypes" not in nlp.pipe_names:
	    nlp.add_pipe("pokemon_subtypes")

	doc = nlp("Bulbasaur is a small, quadrupedal amphibian Pokémon that has blue-green skin with darker patches.")

	# Each entity is now reported together with the synonym attached by the component.
	print([(ent.text, ent.label_, ent._.pokemon_subtypes) for ent in doc.ents])