Skip to content

Instantly share code, notes, and snippets.

@sanusart
Last active August 29, 2019 20:44
Show Gist options
  • Save sanusart/464c48aec3e912c842ee11b7689a67bf to your computer and use it in GitHub Desktop.
Save sanusart/464c48aec3e912c842ee11b7689a67bf to your computer and use it in GitHub Desktop.
Create training data #spacy #nlp
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
# Token patterns registered under the "GADGET" rule.
# pattern1: the token "iphone" followed by the token "x" (compared on LOWER).
# pattern2: the token "iphone", optionally followed by an all-digit token
#           (the "?" operator makes that second token optional).
pattern1 = [
    {"LOWER": "iphone"},
    {"LOWER": "x"},
]
pattern2 = [
    {"LOWER": "iphone"},
    {"IS_DIGIT": True, "OP": "?"},
]

# The matcher is built on the pipeline's vocab so both share one vocabulary.
nlp = English()
matcher = Matcher(nlp.vocab)
# Second positional argument is the (unused) on_match callback slot.
matcher.add("GADGET", None, *[pattern1, pattern2])
# Build training examples of the form (text, {"entities": [...]}) by running
# the rule-based matcher over every text.
# NOTE(review): TEXTS is not defined in the visible code; presumably an
# iterable of raw strings set up earlier -- confirm before running.
TRAINING_DATA = []
for doc in nlp.pipe(TEXTS):
    # Match on the doc and turn each (match_id, start, end) hit into a span.
    spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
    # The optional-digit pattern also fires on the bare "iphone" token inside
    # "iphone x" / "iphone 8", so hits can overlap. Entity annotations must
    # not overlap, so keep only the longest non-overlapping spans.
    spans = spacy.util.filter_spans(spans)
    # Get (start character, end character, label) tuples of the matches.
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, annotations) tuple and store it.
    TRAINING_DATA.append((doc.text, {"entities": entities}))
# Structure of the resulting training data: a list of
# (text, {"entities": [(start_char, end_char, label), ...]}) tuples.
# NOTE: this literal rebinds TRAINING_DATA; it only illustrates the shape.
TRAINING_DATA = [
    # Characters 20-28 of the text are "iPhone X".
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]}),
    # Texts with no match still appear, with an empty entity list.
    ("Examples without labels are also needed", {"entities": []}),
    # And many more examples...
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment