@Hironsan
Created November 9, 2022 12:34
import spacy
from spacy.tokens import DocBin
from spacy_partial_tagger.tokenizer import CharacterTokenizer

text = "Selegiline - induced postural hypotension in Parkinson's disease: a longitudinal study on the effects of drug withdrawal."

# Dictionary-style patterns for the entities we want to label.
patterns = [
    {"label": "Chemical", "pattern": [{"LOWER": "selegiline"}]},
    {"label": "Disease", "pattern": [{"LOWER": "hypotension"}]},
    {
        "label": "Disease",
        "pattern": [{"LOWER": "parkinson"}, {"LOWER": "'s"}, {"LOWER": "disease"}],
    },
]

# Add an entity ruler to a blank English pipeline.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Extract entities from the text as (start_char, end_char, label) tuples,
# e.g. (0, 10, "Chemical") for "Selegiline".
doc = nlp(text)
entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

# Re-tokenize the text at the character level and pack it into a DocBin,
# assigning the character-offset entities as spans.
nlp = spacy.blank("en")
nlp.tokenizer = CharacterTokenizer(nlp.vocab)
doc_bin = DocBin()
doc = nlp.make_doc(text)
doc.ents = [
    doc.char_span(start, end, label=label) for start, end, label in entities
]
doc_bin.add(doc)
doc_bin.to_disk("/path/to/data.spacy")
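
# Optional sanity check (a sketch; assumes the path above was actually written):
# reload the serialized docs and print the recovered entity spans.
reloaded = DocBin().from_disk("/path/to/data.spacy")
for d in reloaded.get_docs(nlp.vocab):
    print([(ent.text, ent.label_) for ent in d.ents])
# The resulting data.spacy file can then serve as training data, e.g. for
# `spacy train` with paths.train pointing at it; the exact config depends on
# your spacy-partial-tagger setup.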