@Hironsan
Created November 9, 2022 12:34
import spacy
from spacy.tokens import DocBin
from spacy_partial_tagger.tokenizer import CharacterTokenizer

text = "Selegiline - induced postural hypotension in Parkinson's disease: a longitudinal study on the effects of drug withdrawal."

# Dictionary-style patterns for the entities we want to label.
patterns = [
    {"label": "Chemical", "pattern": [{"LOWER": "selegiline"}]},
    {"label": "Disease", "pattern": [{"LOWER": "hypotension"}]},
    {
        "label": "Disease",
        "pattern": [{"LOWER": "parkinson"}, {"LOWER": "'s"}, {"LOWER": "disease"}],
    },
]

# Add an entity ruler to a blank English pipeline.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Extract entities from the text as (start_char, end_char, label) tuples,
# e.g. (0, 10, "Chemical") for "Selegiline".
doc = nlp(text)
entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

# Re-tokenize the text at the character level and pack it into a DocBin,
# assigning the character-offset entities as spans.
nlp = spacy.blank("en")
nlp.tokenizer = CharacterTokenizer(nlp.vocab)
doc_bin = DocBin()
doc = nlp.make_doc(text)
doc.ents = [
    doc.char_span(start, end, label=label) for start, end, label in entities
]
doc_bin.add(doc)
doc_bin.to_disk("/path/to/data.spacy")
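
# Optional sanity check (a sketch; assumes the path above was actually written):
# reload the serialized docs and print the recovered entity spans.
reloaded = DocBin().from_disk("/path/to/data.spacy")
for d in reloaded.get_docs(nlp.vocab):
    print([(ent.text, ent.label_) for ent in d.ents])
# The resulting data.spacy file can then serve as training data, e.g. for
# `spacy train` with paths.train pointing at it; the exact config depends on
# your spacy-partial-tagger setup.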