sanusart/create_training_data.py

## create_training_data.py
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# Blank English pipeline: only the tokenizer is needed for rule-based matching.
nlp = English()
matcher = Matcher(nlp.vocab)

# Token patterns describing the gadget mentions to label.
# pattern1: a token whose lowercase form is "iphone" followed by "x".
# pattern2: "iphone" optionally followed by a single all-digit token.
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
# Register both patterns under one key using the list-of-patterns signature.
# The older `add(key, None, *patterns)` form is rejected by spaCy 3, while
# this form is accepted by spaCy 2.2.2+ as well.
matcher.add("GADGET", [pattern1, pattern2])

# Collects one (text, {"entities": [...]}) tuple per processed document.
TRAINING_DATA = []

# NOTE: TEXTS is not defined in the visible part of this file; it has to be an
# iterable of raw strings that exists before this loop runs.
for doc in nlp.pipe(TEXTS):
    # Turn every (match_id, start, end) match into a Span of the doc.
    candidate_spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
    # Both patterns can fire on the same tokens ("iPhone" and "iPhone X"),
    # which would produce overlapping entity offsets. filter_spans keeps the
    # longest span of each overlapping group so the annotations never conflict.
    spans = spacy.util.filter_spans(candidate_spans)
    # Character offsets plus label, the shape expected for entity annotations.
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Texts without any match are kept too, with an empty entity list.
    TRAINING_DATA.append((doc.text, {"entities": entities}))

# Structure of the resulting training data: a list of (text, annotations)
# tuples where annotations["entities"] holds (start_char, end_char, label)
# triples. Note that this literal rebinds TRAINING_DATA.
TRAINING_DATA = [
    # The trailing commas matter: without one, the first tuple would be
    # *called* with the second as its arguments and raise a TypeError.
    ("How to preorder the iPhone X", {'entities': [(20, 28, 'GADGET')]}),
    ("Examples without labels are also needed", {'entities': []}),
    # And many more examples...
]

## mvfzf
 hghj
	import spacy
	from spacy.matcher import Matcher
	from spacy.lang.en import English

	# Blank English pipeline: only the tokenizer is needed for rule-based matching.
	nlp = English()
	matcher = Matcher(nlp.vocab)

	# Token patterns describing the gadget mentions to label.
	# pattern1: a token whose lowercase form is "iphone" followed by "x".
	# pattern2: "iphone" optionally followed by a single all-digit token.
	pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
	pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
	# Register both patterns under one key using the list-of-patterns signature.
	# The older `add(key, None, *patterns)` form is rejected by spaCy 3, while
	# this form is accepted by spaCy 2.2.2+ as well.
	matcher.add("GADGET", [pattern1, pattern2])

	# Collects one (text, {"entities": [...]}) tuple per processed document.
	TRAINING_DATA = []

	# NOTE: TEXTS is not defined in the visible part of this file; it has to be an
	# iterable of raw strings that exists before this loop runs.
	for doc in nlp.pipe(TEXTS):
	    # The loop body must be indented under the `for`; left flush with it,
	    # the snippet does not parse.
	    # Turn every (match_id, start, end) match into a Span of the doc.
	    candidate_spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
	    # Both patterns can fire on the same tokens ("iPhone" and "iPhone X"),
	    # which would produce overlapping entity offsets. filter_spans keeps the
	    # longest span of each overlapping group so the annotations never conflict.
	    spans = spacy.util.filter_spans(candidate_spans)
	    # Character offsets plus label, the shape expected for entity annotations.
	    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
	    # Texts without any match are kept too, with an empty entity list.
	    TRAINING_DATA.append((doc.text, {"entities": entities}))

	# Structure of the resulting training data: a list of (text, annotations)
	# tuples where annotations["entities"] holds (start_char, end_char, label)
	# triples. Note that this literal rebinds TRAINING_DATA.
	TRAINING_DATA = [
	    # The trailing commas matter: without one, the first tuple would be
	    # *called* with the second as its arguments and raise a TypeError.
	    ("How to preorder the iPhone X", {'entities': [(20, 28, 'GADGET')]}),
	    ("Examples without labels are also needed", {'entities': []}),
	    # And many more examples...
	]