Text Summarization Using NLP
### Process of text summarization
- Text cleaning
- Sentence tokenization
- Word tokenization
- Word frequency table
- Clustering
- Summarization
# Sample passage to summarize (an excerpt describing vaccination trial results).
# The triple-quoted literal starts and ends with a newline character.
text = '''
After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group). After the second vaccination, the titers increased (day 57 GMT, 299,751, 782,719, and 1,192,154, respectively). After the second vaccination, serum-neutralizing activity was detected by two methods in all participants evaluated, with values generally similar to those in the upper half of the distribution of a panel of control convalescent serum specimens. Solicited adverse events that occurred in more than half the participants included fatigue, chills, headache, myalgia, and pain at the injection site. Systemic adverse events were more common after the second vaccination, particularly with the highest dose, and three participants (21%) in the 250-μg dose group reported one or more severe adverse events.
'''
# Notebook display of the raw string.
text
'\\nAfter the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group). After the second vaccination, the titers increased (day 57 GMT, 299,751, 782,719, and 1,192,154, respectively). After the second vaccination, serum-neutralizing activity was detected by two methods in all participants evaluated, with values generally similar to those in the upper half of the distribution of a panel of control convalescent serum specimens. Solicited adverse events that occurred in more than half the participants included fatigue, chills, headache, myalgia, and pain at the injection site. Systemic adverse events were more common after the second vaccination, particularly with the highest dose, and three participants (21%) in the 250-μg dose group reported one or more severe adverse events.\\n'
# Break the passage into its individual lines. The separator has to be the
# newline character itself: the two-character sequence backslash + "n" never
# occurs in the passage, so splitting on it would hand back the text unsplit,
# with its leading and trailing newlines still attached.
text = text.split('\n')
text
['',
 'After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group). After the second vaccination, the titers increased (day 57 GMT, 299,751, 782,719, and 1,192,154, respectively). After the second vaccination, serum-neutralizing activity was detected by two methods in all participants evaluated, with values generally similar to those in the upper half of the distribution of a panel of control convalescent serum specimens. Solicited adverse events that occurred in more than half the participants included fatigue, chills, headache, myalgia, and pain at the injection site. Systemic adverse events were more common after the second vaccination, particularly with the highest dose, and three participants (21%) in the 250-μg dose group reported one or more severe adverse events.',
 '']
# Keep only the non-empty entries of `text` (at this point a list of lines),
# then join them with single spaces so the passage is one continuous string
# again. An empty string is falsy, so the bare truthiness test is equivalent
# to checking that its length is greater than zero.
lines = [line for line in text if line]

text = ' '.join(lines)
text
'After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group). After the second vaccination, the titers increased (day 57 GMT, 299,751, 782,719, and 1,192,154, respectively). After the second vaccination, serum-neutralizing activity was detected by two methods in all participants evaluated, with values generally similar to those in the upper half of the distribution of a panel of control convalescent serum specimens. Solicited adverse events that occurred in more than half the participants included fatigue, chills, headache, myalgia, and pain at the injection site. Systemic adverse events were more common after the second vaccination, particularly with the highest dose, and three participants (21%) in the 250-μg dose group reported one or more severe adverse events.'
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Bind spaCy's English STOP_WORDS collection to a local name.
stop_words = STOP_WORDS
# Notebook display of the stop words.
stop_words
"text/plain": "{\"'d\",\n \"'ll\",\n \"'m\",\n \"'re\",\n \"'s\",\n \"'ve\",\n 'a',\n 'about',\n 'above',\n 'across',\n 'after',\n 'afterwards',\n 'again',\n 'against',\n 'all',\n 'almost',\n 'alone',\n 'along',\n 'already',\n 'also',\n 'although',\n 'always',\n 'am',\n 'among',\n 'amongst',\n 'amount',\n 'an',\n 'and',\n 'another',\n 'any',\n 'anyhow',\n 'anyone',\n 'anything',\n 'anyway',\n 'anywhere',\n 'are',\n 'around',\n 'as',\n 'at',\n 'back',\n 'be',\n 'became',\n 'because',\n 'become',\n 'becomes',\n 'becoming',\n 'been',\n 'before',\n 'beforehand',\n 'behind',\n 'being',\n 'below',\n 'beside',\n 'besides',\n 'between',\n 'beyond',\n 'both',\n 'bottom',\n 'but',\n 'by',\n 'ca',\n 'call',\n 'can',\n 'cannot',\n 'could',\n 'did',\n 'do',\n 'does',\n 'doing',\n 'done',\n 'down',\n 'due',\n 'during',\n 'each',\n 'eight',\n 'either',\n 'eleven',\n 'else',\n 'elsewhere',\n 'empty',\n 'enough',\n 'even',\n 'ever',\n 'every',\n 'everyone',\n 'everything',\n 'everywhere',\n 'except',\n 'few',\n 'fifteen',\n 'fifty',\n 'first',\n 'five',\n 'for',\n 'former',\n 'formerly',\n 'forty',\n 'four',\n 'from',\n 'front',\n 'full',\n 'further',\n 'get',\n 'give',\n 'go',\n 'had',\n 'has',\n 'have',\n 'he',\n 'hence',\n 'her',\n 'here',\n 'hereafter',\n 'hereby',\n 'herein',\n 'hereupon',\n 'hers',\n 'herself',\n 'him',\n 'himself',\n 'his',\n 'how',\n 'however',\n 'hundred',\n 'i',\n 'if',\n 'in',\n 'indeed',\n 'into',\n 'is',\n 'it',\n 'its',\n 'itself',\n 'just',\n 'keep',\n 'last',\n 'latter',\n 'latterly',\n 'least',\n 'less',\n 'made',\n 'make',\n 'many',\n 'may',\n 'me',\n 'meanwhile',\n 'might',\n 'mine',\n 'more',\n 'moreover',\n 'most',\n 'mostly',\n 'move',\n 'much',\n 'must',\n 'my',\n 'myself',\n \"n't\",\n 'name',\n 'namely',\n 'neither',\n 'never',\n 'nevertheless',\n 'next',\n 'nine',\n 'no',\n 'nobody',\n 'none',\n 'noone',\n 'nor',\n 'not',\n 'nothing',\n 'now',\n 'nowhere',\n 'n‘t',\n 'n’t',\n 'of',\n 'off',\n 'often',\n 'on',\n 'once',\n 'one',\n 'only',\n 
'onto',\n 'or',\n 'other',\n 'others',\n 'otherwise',\n 'our',\n 'ours',\n 'ourselves',\n 'out',\n 'over',\n 'own',\n 'part',\n 'per',\n 'perhaps',\n 'please',\n 'put',\n 'quite',\n 'rather',\n 're',\n 'really',\n 'regarding',\n 'same',\n 'say',\n 'see',\n 'seem',\n 'seemed',\n 'seeming',\n 'seems',\n 'serious',\n 'several',\n 'she',\n 'should',\n 'show',\n 'side',\n 'since',\n 'six',\n 'sixty',\n 'so',\n 'some',\n 'somehow',\n 'someone',\n 'something',\n 'sometime',\n 'sometimes',\n 'somewhere',\n 'still',\n 'such',\n 'take',\n 'ten',\n 'than',\n 'that',\n 'the',\n 'their',\n 'them',\n 'themselves',\n 'then',\n 'thence',\n 'there',\n 'thereafter',\n 'thereby',\n 'therefore',\n 'therein',\n 'thereupon',\n 'these',\n 'they',\n 'third',\n 'this',\n 'those',\n 'though',\n 'three',\n 'through',\n 'throughout',\n 'thru',\n 'thus',\n 'to',\n 'together',\n 'too',\n 'top',\n 'toward',\n 'towards',\n 'twelve',\n 'twenty',\n 'two',\n 'under',\n 'unless',\n 'until',\n 'up',\n 'upon',\n 'us',\n 'used',\n 'using',\n 'various',\n 'very',\n 'via',\n 'was',\n 'we',\n 'well',\n 'were',\n 'what',\n 'whatever',\n 'when',\n 'whence',\n 'whenever',\n 'where',\n 'whereafter',\n 'whereas',\n 'whereby',\n 'wherein',\n 'whereupon',\n 'wherever',\n 'whether',\n 'which',\n 'while',\n 'whither',\n 'who',\n 'whoever',\n 'whole',\n 'whom',\n 'whose',\n 'why',\n 'will',\n 'with',\n 'within',\n 'without',\n 'would',\n 'yet',\n 'you',\n 'your',\n 'yours',\n 'yourself',\n 'yourselves',\n '‘d',\n '‘ll',\n '‘m',\n '‘re',\n '‘s',\n '‘ve',\n '’d',\n '’ll',\n '’m',\n '’re',\n '’s',\n '’ve'}"
# Materialise the stop words as a list (unpacking keeps the same iteration
# order as list()). Note that `in` tests against a list scan it element by
# element.
stop_words = [*stop_words]
stop_words
"text/plain": "['make',\n 'always',\n 'using',\n 'already',\n 'my',\n 'yourselves',\n 'whole',\n 'also',\n 'anything',\n 'noone',\n 'anywhere',\n 'nowhere',\n '‘re',\n 'otherwise',\n 'you',\n 'their',\n 'sixty',\n 'seeming',\n 'yourself',\n 'five',\n 'once',\n 'ourselves',\n 'have',\n 'whereafter',\n 'himself',\n 'before',\n 'alone',\n 'everything',\n 'beyond',\n 'name',\n 'during',\n '’m',\n 'made',\n 'call',\n 'eleven',\n 'along',\n 'these',\n 'hereafter',\n 'onto',\n 'since',\n 'between',\n 'i',\n 'first',\n 'keep',\n 'over',\n 'ten',\n 'which',\n 'might',\n 'above',\n 'becomes',\n 'below',\n 'whom',\n 'if',\n 'from',\n 'amount',\n 'somewhere',\n 'may',\n 'something',\n 'almost',\n 'does',\n \"n't\",\n 'such',\n 'seems',\n 'more',\n 'whenever',\n 'except',\n 'very',\n 'down',\n 'toward',\n 'across',\n 'eight',\n 'because',\n 'whose',\n 'can',\n 'nobody',\n 'some',\n 'too',\n 'or',\n 'me',\n 'wherever',\n 'as',\n 'did',\n 'latterly',\n 'him',\n 'again',\n 'done',\n 'nine',\n 'at',\n 'that',\n 'else',\n 'a',\n 'really',\n 'by',\n 'full',\n '’d',\n 'being',\n 'next',\n 'themselves',\n 'further',\n 'get',\n 'due',\n 'see',\n 'many',\n 'hereupon',\n 'would',\n 'ever',\n 'while',\n 'seemed',\n 'regarding',\n 'own',\n '‘d',\n 'although',\n 'how',\n 'were',\n 'another',\n 'could',\n 'had',\n 'twenty',\n 'sometime',\n 'still',\n 'who',\n 'hers',\n 'per',\n 'hundred',\n 'rather',\n '’s',\n 'each',\n 'anyhow',\n 'either',\n 'it',\n 'whereas',\n 'side',\n 'same',\n 'on',\n 'enough',\n 'became',\n 'wherein',\n 'any',\n 'nothing',\n 'last',\n 'myself',\n 'what',\n \"'ll\",\n 'however',\n 'will',\n 'elsewhere',\n 'all',\n 'perhaps',\n 'only',\n 'and',\n 'until',\n 'is',\n \"'d\",\n 'empty',\n 'somehow',\n 'none',\n 'anyway',\n 'so',\n 'thereby',\n 'whereupon',\n 'n‘t',\n 'whether',\n 'cannot',\n 'thereafter',\n 'than',\n 'three',\n 'around',\n 'unless',\n 'fifty',\n 'beforehand',\n 'with',\n 'twelve',\n 'anyone',\n 'hence',\n 'now',\n \"'re\",\n 'has',\n 'was',\n 'no',\n 
'whither',\n 'give',\n 'behind',\n 'say',\n 'thence',\n 'through',\n 'used',\n 'neither',\n 'just',\n 'nor',\n 'most',\n 'hereby',\n 'her',\n 'less',\n \"'ve\",\n 'been',\n 'this',\n 'former',\n 'yours',\n '‘s',\n 'though',\n 'then',\n 'itself',\n 'someone',\n 'among',\n '’ll',\n 'fifteen',\n 'please',\n 'back',\n 'other',\n 'us',\n 'amongst',\n 'move',\n 'am',\n 'forty',\n 'those',\n 'thus',\n 'herself',\n 'least',\n \"'s\",\n 'out',\n 'well',\n 'mine',\n 'its',\n 'seem',\n 'several',\n 'yet',\n 'becoming',\n 'third',\n 'against',\n 'throughout',\n 'but',\n 'under',\n 'your',\n 'afterwards',\n 'for',\n 'moreover',\n 'why',\n 'the',\n 'go',\n 'he',\n 'do',\n '‘m',\n 'four',\n 'upon',\n 'sometimes',\n 'become',\n 'formerly',\n 'besides',\n 'front',\n 'after',\n 'our',\n 'ca',\n 'where',\n 'be',\n 'thru',\n 'never',\n '‘ve',\n 'in',\n 'quite',\n 'six',\n 'are',\n 'herein',\n 'via',\n 'beside',\n 'therein',\n 'thereupon',\n \"'m\",\n '‘ll',\n 'whatever',\n 'she',\n 'put',\n 'within',\n 'without',\n 'others',\n 'here',\n 're',\n 'top',\n 'indeed',\n 'bottom',\n 'to',\n 'whereby',\n 'various',\n 'we',\n 'few',\n 'ours',\n 'both',\n 'mostly',\n 'show',\n 'one',\n 'whoever',\n 'meanwhile',\n '’ve',\n 'off',\n 'his',\n 'must',\n 'two',\n 'them',\n 'every',\n 'much',\n 'therefore',\n 'there',\n 'not',\n 'serious',\n 'everywhere',\n 'doing',\n 'whence',\n 'about',\n 'latter',\n 'should',\n 'often',\n 'of',\n 'take',\n 'part',\n '’re',\n 'everyone',\n 'an',\n 'nevertheless',\n 'namely',\n 'even',\n 'up',\n 'when',\n 'into',\n 'towards',\n 'n’t',\n 'together',\n 'they']"
# Load the `en_core_web_lg` spaCy pipeline and run it over the cleaned passage.
nlp = spacy.load('en_core_web_lg')
doc = nlp(text)

# Text of every token, in document order.
tokens = [tok.text for tok in doc]
tokens
"text/plain": "['After',\n 'the',\n 'first',\n 'vaccination',\n ',',\n 'antibody',\n 'responses',\n 'were',\n 'higher',\n 'with',\n 'higher',\n 'dose',\n '(',\n 'day',\n '29',\n 'enzyme',\n '-',\n 'linked',\n 'immunosorbent',\n 'assay',\n 'anti',\n '–',\n 'S-2P',\n 'antibody',\n 'geometric',\n 'mean',\n 'titer',\n '[',\n 'GMT',\n ']',\n ',',\n '40,227',\n 'in',\n 'the',\n '25-μg',\n 'group',\n ',',\n '109,209',\n 'in',\n 'the',\n '100-μg',\n 'group',\n ',',\n 'and',\n '213,526',\n 'in',\n 'the',\n '250-μg',\n 'group',\n ')',\n '.',\n 'After',\n 'the',\n 'second',\n 'vaccination',\n ',',\n 'the',\n 'titers',\n 'increased',\n '(',\n 'day',\n '57',\n 'GMT',\n ',',\n '299,751',\n ',',\n '782,719',\n ',',\n 'and',\n '1,192,154',\n ',',\n 'respectively',\n ')',\n '.',\n 'After',\n 'the',\n 'second',\n 'vaccination',\n ',',\n 'serum',\n '-',\n 'neutralizing',\n 'activity',\n 'was',\n 'detected',\n 'by',\n 'two',\n 'methods',\n 'in',\n 'all',\n 'participants',\n 'evaluated',\n ',',\n 'with',\n 'values',\n 'generally',\n 'similar',\n 'to',\n 'those',\n 'in',\n 'the',\n 'upper',\n 'half',\n 'of',\n 'the',\n 'distribution',\n 'of',\n 'a',\n 'panel',\n 'of',\n 'control',\n 'convalescent',\n 'serum',\n 'specimens',\n '.',\n 'Solicited',\n 'adverse',\n 'events',\n 'that',\n 'occurred',\n 'in',\n 'more',\n 'than',\n 'half',\n 'the',\n 'participants',\n 'included',\n 'fatigue',\n ',',\n 'chills',\n ',',\n 'headache',\n ',',\n 'myalgia',\n ',',\n 'and',\n 'pain',\n 'at',\n 'the',\n 'injection',\n 'site',\n '.',\n 'Systemic',\n 'adverse',\n 'events',\n 'were',\n 'more',\n 'common',\n 'after',\n 'the',\n 'second',\n 'vaccination',\n ',',\n 'particularly',\n 'with',\n 'the',\n 'highest',\n 'dose',\n ',',\n 'and',\n 'three',\n 'participants',\n '(',\n '21',\n '%',\n ')',\n 'in',\n 'the',\n '250-μg',\n 'dose',\n 'group',\n 'reported',\n 'one',\n 'or',\n 'more',\n 'severe',\n 'adverse',\n 'events',\n '.']"
# Calculating word frequencies
#
# Count how often each meaningful token occurs in the passage. Keys are stored
# lower-cased because the sentence-scoring step looks tokens up by their
# lower-cased text; keeping the original casing would leave capitalised tokens
# (sentence-initial words, acronyms) permanently unmatched there.
stop_word_set = set(stop_words)  # converted once so each membership test is O(1)
word_frequency = {}

for word in doc:
    key = word.text.lower()
    # Skip stop words.
    if key in stop_word_set:
        continue
    # Skip punctuation and whitespace tokens. `is_punct` also recognises
    # non-ASCII marks such as the en dash, which `string.punctuation` lacks;
    # the `punctuation` test is kept so ASCII symbols stay excluded as before.
    if word.is_punct or word.is_space or key in punctuation:
        continue
    word_frequency[key] = word_frequency.get(key, 0) + 1
word_frequency
"text/plain": "{'vaccination': 4,\n 'antibody': 2,\n 'responses': 1,\n 'higher': 2,\n 'dose': 3,\n 'day': 2,\n '29': 1,\n 'enzyme': 1,\n 'linked': 1,\n 'immunosorbent': 1,\n 'assay': 1,\n 'anti': 1,\n '–': 1,\n 'S-2P': 1,\n 'geometric': 1,\n 'mean': 1,\n 'titer': 1,\n 'GMT': 2,\n '40,227': 1,\n '25-μg': 1,\n 'group': 4,\n '109,209': 1,\n '100-μg': 1,\n '213,526': 1,\n '250-μg': 2,\n 'second': 3,\n 'titers': 1,\n 'increased': 1,\n '57': 1,\n '299,751': 1,\n '782,719': 1,\n '1,192,154': 1,\n 'respectively': 1,\n 'serum': 2,\n 'neutralizing': 1,\n 'activity': 1,\n 'detected': 1,\n 'methods': 1,\n 'participants': 3,\n 'evaluated': 1,\n 'values': 1,\n 'generally': 1,\n 'similar': 1,\n 'upper': 1,\n 'half': 2,\n 'distribution': 1,\n 'panel': 1,\n 'control': 1,\n 'convalescent': 1,\n 'specimens': 1,\n 'Solicited': 1,\n 'adverse': 3,\n 'events': 3,\n 'occurred': 1,\n 'included': 1,\n 'fatigue': 1,\n 'chills': 1,\n 'headache': 1,\n 'myalgia': 1,\n 'pain': 1,\n 'injection': 1,\n 'site': 1,\n 'Systemic': 1,\n 'common': 1,\n 'particularly': 1,\n 'highest': 1,\n '21': 1,\n 'reported': 1,\n 'severe': 1}"
# Highest raw count among all words; used as the divisor when normalising.
# `default=1` avoids the ValueError that max() raises on an empty sequence
# (i.e. when no word survived filtering), and 1 is a harmless divisor.
max_frequency = max(word_frequency.values(), default=1)
max_frequency
4
# Normalise in place: replace every count with count / max_frequency.
# Only existing keys are reassigned, so the dictionary's size is unchanged
# while it is being iterated.
for term, count in word_frequency.items():
    word_frequency[term] = count / max_frequency
word_frequency
"text/plain": "{'vaccination': 1.0,\n 'antibody': 0.5,\n 'responses': 0.25,\n 'higher': 0.5,\n 'dose': 0.75,\n 'day': 0.5,\n '29': 0.25,\n 'enzyme': 0.25,\n 'linked': 0.25,\n 'immunosorbent': 0.25,\n 'assay': 0.25,\n 'anti': 0.25,\n '–': 0.25,\n 'S-2P': 0.25,\n 'geometric': 0.25,\n 'mean': 0.25,\n 'titer': 0.25,\n 'GMT': 0.5,\n '40,227': 0.25,\n '25-μg': 0.25,\n 'group': 1.0,\n '109,209': 0.25,\n '100-μg': 0.25,\n '213,526': 0.25,\n '250-μg': 0.5,\n 'second': 0.75,\n 'titers': 0.25,\n 'increased': 0.25,\n '57': 0.25,\n '299,751': 0.25,\n '782,719': 0.25,\n '1,192,154': 0.25,\n 'respectively': 0.25,\n 'serum': 0.5,\n 'neutralizing': 0.25,\n 'activity': 0.25,\n 'detected': 0.25,\n 'methods': 0.25,\n 'participants': 0.75,\n 'evaluated': 0.25,\n 'values': 0.25,\n 'generally': 0.25,\n 'similar': 0.25,\n 'upper': 0.25,\n 'half': 0.5,\n 'distribution': 0.25,\n 'panel': 0.25,\n 'control': 0.25,\n 'convalescent': 0.25,\n 'specimens': 0.25,\n 'Solicited': 0.25,\n 'adverse': 0.75,\n 'events': 0.75,\n 'occurred': 0.25,\n 'included': 0.25,\n 'fatigue': 0.25,\n 'chills': 0.25,\n 'headache': 0.25,\n 'myalgia': 0.25,\n 'pain': 0.25,\n 'injection': 0.25,\n 'site': 0.25,\n 'Systemic': 0.25,\n 'common': 0.25,\n 'particularly': 0.25,\n 'highest': 0.25,\n '21': 0.25,\n 'reported': 0.25,\n 'severe': 0.25}"
# Sentence tokenization
# Collect the sentence spans exposed by the parsed document.
sentence_tokens = list(doc.sents)
sentence_tokens
"text/plain": "[After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group).,\n After the second vaccination, the titers increased (day 57 GMT, 299,751, 782,719, and 1,192,154, respectively).,\n After the second vaccination, serum-neutralizing activity was detected by two methods in all participants evaluated, with values generally similar to those in the upper half of the distribution of a panel of control convalescent serum specimens.,\n Solicited adverse events that occurred in more than half the participants included fatigue, chills, headache, myalgia, and pain at the injection site.,\n Systemic adverse events were more common after the second vaccination, particularly with the highest dose, and three participants (21%) in the 250-μg dose group reported one or more severe adverse events.]"
# calculate sentence score
#
# A sentence's score is the sum of the word_frequency values of its tokens,
# looked up by lower-cased token text. Tokens without an entry contribute
# nothing, and a sentence with no matching token gets no entry at all.
sentence_score = {}

for sent in sentence_tokens:
    for token in sent:
        lowered = token.text.lower()
        if lowered not in word_frequency:
            continue
        sentence_score[sent] = sentence_score.get(sent, 0) + word_frequency[lowered]
sentence_score
"text/plain": "{After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group).: 11.75,\n After the second vaccination, the titers increased (day 57 GMT, 299,751, 782,719, and 1,192,154, respectively).: 4.0,\n After the second vaccination, serum-neutralizing activity was detected by two methods in all participants evaluated, with values generally similar to those in the upper half of the distribution of a panel of control convalescent serum specimens.: 7.5,\n Solicited adverse events that occurred in more than half the participants included fatigue, chills, headache, myalgia, and pain at the injection site.: 5.0,\n Systemic adverse events were more common after the second vaccination, particularly with the highest dose, and three participants (21%) in the 250-μg dose group reported one or more severe adverse events.: 10.0}"
# now get 30% of sentence with max score
from heapq import nlargest

# Number of sentences to keep: 30% of the total, rounded down. int() would
# truncate to 0 for passages of three sentences or fewer and produce an empty
# summary, so the count is clamped to at least one sentence.
select_length = max(1, int(len(sentence_tokens) * 0.3))
select_length
1
# Notebook display: total number of sentences found in the passage.
len(sentence_tokens)
5
# Pick the `select_length` sentences with the highest scores; the result is
# ordered from highest score to lowest.
summary = nlargest(
    select_length,
    sentence_score.keys(),
    key=lambda sent: sentence_score.get(sent),
)
summary
[After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group).]
# Turn each selected sentence into its plain text, then join the pieces with
# single spaces to form the final summary string.
final_summary = [sentence.text for sentence in summary]
summary = ' '.join(final_summary)
summary
'After the first vaccination, antibody responses were higher with higher dose (day 29 enzyme-linked immunosorbent assay anti–S-2P antibody geometric mean titer [GMT], 40,227 in the 25-μg group, 109,209 in the 100-μg group, and 213,526 in the 250-μg group).'
# Notebook display: character count of the cleaned passage.
len(text)
969
# Notebook display: character count of the summary, for comparison with the passage.
len(summary)
255
