Created
October 3, 2022 00:30
-
-
Save khtdr/76993218a4036d12f5a7af0b083e3271 to your computer and use it in GitHub Desktop.
Text summarizer using NLP (compromise)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nlp from "compromise";
/**
 * Produces an extractive summary of `text`.
 *
 * The work is a three-step pipeline: `rankTextTerms` scores the terms of the
 * text, `reorderSentences` orders the sentences by those scores, and
 * `extractSummary` keeps sentences from the front of that ordering until the
 * word budget is used up.
 *
 * @param text - The text to summarize.
 * @param approx_word_count - Approximate size of the summary in words
 *   (defaults to 75).
 * @returns The selected sentences as a single string.
 */
export function summarize(text: string, approx_word_count = 75): string {
  const terms = rankTextTerms(text);
  const ranked = reorderSentences(text, terms);
  return extractSummary(ranked, approx_word_count);
}
/**
 * Scores every term of `text` by how often it takes part in a noun or a verb.
 *
 * Each term of the document is registered with a score of 0. It then gains one
 * point for every occurrence inside a noun match and one point for every
 * occurrence inside a verb match, so terms that are never part of a noun or
 * verb stay at 0.
 *
 * @param text - The text to analyse.
 * @returns A dictionary from a term's `normal` form to its score. The object
 *   has no prototype, so only real terms appear as keys.
 */
function rankTextTerms(text: string): Record<string, number> {
  // Null-prototype dictionary: with a plain `{}` a term such as "constructor"
  // or "toString" would read the inherited Object.prototype member instead of
  // `undefined`, and "__proto__" could not be stored at all.
  const terms: Record<string, number> = Object.create(null);
  const doc = nlp(text);

  // Registers `key` if it is new and adds `amount` to its score.
  const bump = (key: string, amount: number): void => {
    terms[key] = (terms[key] ?? 0) + amount;
  };

  // Make sure every term has an entry, even when it never scores.
  doc.termList().forEach((term) => {
    bump(term.normal, 0);
  });

  // Nouns and verbs are weighted identically: one point per occurrence.
  [doc.nouns(), doc.verbs()].forEach((matches) => {
    matches.forEach((match) => {
      match.termList().forEach((term) => {
        bump(term.normal, 1);
      });
    });
  });

  return terms;
}
/**
 * Reorders the sentences of `text` from highest to lowest score.
 *
 * A sentence's score is the sum of the scores of its terms, looked up in
 * `terms` by each term's `normal` form.
 *
 * @param text - The text whose sentences are reordered.
 * @param terms - Term scores keyed by a term's `normal` form.
 * @returns The trimmed sentences, highest score first, joined by single
 *   spaces.
 */
function reorderSentences(
  text: string,
  terms: ReturnType<typeof rankTextTerms>
): string {
  const sentences = nlp(text).sentences().json() as SentenceJson;

  // A term without a numeric entry counts as 0. Adding `undefined` would turn
  // the whole score into NaN, and an inherited member (e.g. "constructor" on a
  // plain object) would turn it into a string; either breaks the comparator.
  const weightOf = (normal: string): number => {
    const weight: unknown = terms[normal];
    return typeof weight === "number" ? weight : 0;
  };

  return sentences
    .map((sentence) => ({
      text: sentence.text,
      score: sentence.terms.reduce(
        (total, { normal }) => total + weightOf(normal),
        0
      ),
    }))
    .sort((a, b) => b.score - a.score)
    .map((scored) => scored.text.trim())
    .join(" ");
}
/**
 * Takes sentences from the start of `text` until the word budget is used up.
 *
 * Sentences are taken in order. A sentence is still added while the running
 * word count is at or below `approx_words`, so the result may overshoot the
 * budget by up to one sentence. Words are counted as the number of terms in
 * each sentence.
 *
 * @param text - The text to take sentences from, most important first.
 * @param approx_words - Approximate word budget for the result.
 * @returns The kept sentences joined by single spaces, with runs of whitespace
 *   inside each sentence collapsed and no leading or trailing space.
 */
function extractSummary(text: string, approx_words: number): string {
  const sentences = nlp(text).sentences().json() as SentenceJson;
  const kept: string[] = [];
  let wordCount = 0;

  for (const sentence of sentences) {
    // Budget already exceeded: nothing further can be added, so stop early.
    if (wordCount > approx_words) break;
    wordCount += sentence.terms.length;

    const cleaned = sentence.text.replace(/\s+/gm, " ").trim();
    // Skip blank sentences so the join never produces doubled spaces.
    if (cleaned.length > 0) kept.push(cleaned);
  }

  // Joining avoids the stray leading space of string-prefix accumulation.
  return kept.join(" ");
}
/**
 * The part of one sentence entry that this file reads from the result of
 * `nlp(text).sentences().json()`: the sentence text and the `normal` form of
 * each of its terms.
 */
type Sentence = { text: string; terms: Array<{ normal: string }> };

/** The list of sentence entries that the `.json()` result is cast to. */
type SentenceJson = Sentence[];
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment