Skip to content

Instantly share code, notes, and snippets.

@jaidevd
Created April 20, 2022 05:29
Show Gist options
  • Save jaidevd/41e77ea0a5147a138005cd78f94e22b9 to your computer and use it in GitHub Desktop.
Save jaidevd/41e77ea0a5147a138005cd78f94e22b9 to your computer and use it in GitHub Desktop.
Spacy pipelines
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2a151d00-da41-46f1-9724-7438b8cb93da",
"metadata": {},
"outputs": [],
"source": [
"from spacy import load\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "f3684a78-ee84-4b5e-a6ab-661b96cf4b08",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('patient_notes.csv')"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "77910dd7-f122-4a7c-9082-8eea35400d0e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\n",
"-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\n",
"-associated with dispnea on exersion and rest,stressed out about school\n",
"-reports fe feels like his heart is jumping out of his chest\n",
"-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\n",
"-pmh:non,meds :aderol (from a friend),nkda\n",
"-fh:father had MI recently,mother has thyroid dz\n",
"-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\n",
"-sh:no std\n"
]
}
],
"source": [
"print(docs[0])"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "3f97af4d-a749-49f6-b0bb-52dc6da7bf12",
"metadata": {},
"outputs": [],
"source": [
"from spacy.language import Language\n",
"\n",
"@Language.component('sentence_boundary')\n",
"def sentence_boundary(doc):\n",
" # If there's a bullet list, treat it as a sentence.\n",
" for token in doc:\n",
" # If the 0th char is a hyphen preceded by a newline\n",
" prev = token.idx - 1\n",
" if prev >=0 and prev < len(doc.text):\n",
" if token.text[0] == '-' and doc.text[prev] == \"\\n\":\n",
" token.is_sent_start = True\n",
" else:\n",
" token.is_sent_start = False\n",
" return doc"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "d1d26838-e108-429b-ae1a-581dcba26c23",
"metadata": {},
"outputs": [],
"source": [
"# Add entity recognition\n",
"from spacy.matcher import Matcher\n",
"from spacy.tokens import Span\n",
"from spacy.util import filter_spans\n",
"# Zero or more adjectives followed by one or more noun, if it is not already an entity\n",
"pattern = [\n",
" {'POS': 'ADJ', 'OP': '*'},\n",
" {'POS': 'NOUN', 'OP': '+'}\n",
"]\n",
"matcher = Matcher(nlp.vocab)\n",
"matcher.add(\"MedicalCondition\", [pattern])"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "d7ae3883-fdce-4e47-bdd6-e394cac04f6c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\n",
"\n",
"================================================================================\n",
"-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\n",
"\n",
"================================================================================\n",
"-associated with dispnea on exersion and rest,stressed out about school\n",
"\n",
"================================================================================\n",
"-reports fe feels like his heart is jumping out of his chest\n",
"\n",
"================================================================================\n",
"-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\n",
"\n",
"================================================================================\n",
"-pmh:non,meds :aderol (from a friend),nkda\n",
"\n",
"================================================================================\n",
"-fh:father had MI recently,mother has thyroid dz\n",
"\n",
"================================================================================\n",
"-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\n",
"\n",
"================================================================================\n",
"-sh:no std\n",
"================================================================================\n"
]
}
],
"source": [
"nlp = load('en_core_web_sm')\n",
"nlp.add_pipe(\"sentence_boundary\", before=\"parser\")\n",
"\n",
"\n",
"doc = nlp(docs[0])\n",
"for sent in doc.sents:\n",
" print(sent)\n",
" print('=' * 80)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"id": "60be2ff2-6d47-42b1-a40a-bb520fcb13f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"year ENTITY\n",
"old male ENTITY\n",
"student health clinic complaining ENTITY\n",
"heart ENTITY\n",
"mother ENTITY\n",
"verbal consent ENTITY\n",
"history ENTITY\n",
"physical examination ENTITY\n",
"treatment ENTITY\n",
"-began ENTITY\n",
"months ENTITY\n",
"min),worsening ENTITY\n",
"dispnea ENTITY\n",
"exersion ENTITY\n",
"rest ENTITY\n",
"school ENTITY\n",
"-reports ENTITY\n",
"heart ENTITY\n",
"chest ENTITY\n",
"chest pain ENTITY\n",
"dyaphoresis ENTITY\n",
"wt loss ENTITY\n",
"chills ENTITY\n",
"fever ENTITY\n",
"nausea ENTITY\n",
"vomiting ENTITY\n",
"pedal edeam ENTITY\n",
"non ENTITY\n",
"meds ENTITY\n",
"aderol ENTITY\n",
"-fh ENTITY\n",
"father ENTITY\n",
"mother ENTITY\n",
"thyroid ENTITY\n",
"non- ENTITY\n",
"months ENTITY\n",
"ago,3 beers ENTITY\n",
"weekend ENTITY\n",
"basketball ENTITY\n",
"school ENTITY\n",
"std ENTITY\n"
]
}
],
"source": [
"spans = [Span(doc, start, end, label='ENTITY') for _, start, end in matcher(doc)]\n",
"spans = filter_spans(spans)\n",
"doc.set_ents(spans)\n",
"\n",
"for ent in doc.ents:\n",
" print(ent.text, ent.label_)"
]
},
{
"cell_type": "markdown",
"id": "e652d860-4be1-4324-bd8f-f9d320764d63",
"metadata": {},
"source": [
"### Gramex.yaml\n",
"\n",
"```yaml\n",
"handler: MLHandler\n",
"kwargs:\n",
" data:\n",
" # Must be parseable into a dataframe that has a text column.\n",
" # Other columns can hold classification labels, entity spans, etc.\n",
" url: $YAMLPATH/data.csv\n",
" class:\n",
" model: en_core_web_sm\n",
" # inserting something into the pipeline\n",
" pipeline:\n",
" - function: mymodule.sentence_detector\n",
" before: parser\n",
" - function: foo.bar\n",
" last: true\n",
" # postprocessing the doc\n",
" postporcess:\n",
" - function: foo.bar\n",
" - function: string.lower\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c403c438-c6af-434f-bb05-3f2c9e338b0a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment