Last active
January 27, 2024 13:50
-
-
Save B-R-P/5a6d6a98dc95440154b52600f579121c to your computer and use it in GitHub Desktop.
Score each sentence in a text using gzip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import zlib from 'zlib'; | |
import sbd from 'sbd'; | |
/**
 * Standardizes a list of numbers into z-scores using the population
 * standard deviation (variance is divided by `arr.length`, not `length - 1`).
 *
 * @param {number[]} arr - Values to standardize.
 * @returns {number[]} A new array where each entry is `(value - mean) / stdDeviation`.
 *   An empty input yields an empty array. When every value is identical
 *   (including a single-element input) there is no spread to normalize by,
 *   so all z-scores are reported as 0 instead of NaN.
 */
function calculateZScore(arr) {
  if (arr.length === 0) {
    return [];
  }

  // A constant series has zero deviation; dividing by it would give 0/0 = NaN.
  // Checking equality directly also avoids a tiny non-zero deviation caused by
  // floating-point rounding of the mean (e.g. [0.1, 0.1, 0.1]).
  const isConstant = arr.every((value) => value === arr[0]);
  if (isConstant) {
    return arr.map(() => 0);
  }

  const mean = arr.reduce((sum, value) => sum + value, 0) / arr.length;
  const variance = arr.reduce((sum, value) => sum + Math.pow(value - mean, 2), 0) / arr.length;
  const stdDeviation = Math.sqrt(variance);

  // Defensive: guards against the deviation underflowing to exactly zero.
  if (stdDeviation === 0) {
    return arr.map(() => 0);
  }

  return arr.map((value) => (value - mean) / stdDeviation);
}
/**
 * Scores each sentence of `text` by its compression-based similarity to the
 * sentences of the same text.
 *
 * The text is split with `sbd.sentences`. For every pair of sentences the
 * similarity is `1 - NCD`, where the normalized compression distance is
 * `(C(xy) - min(C(x), C(y))) / max(C(x), C(y))` and `C` is the gzip-compressed
 * byte length of the lower-cased string. A sentence's raw score is the sum of
 * its similarities to every sentence (itself included, once); the raw scores
 * are then standardized with `calculateZScore`.
 *
 * @param {string} text - Text to split into sentences and score.
 * @returns {Object<string, string>} Object mapping each z-score (used as a
 *   property key, so it is stringified) to its sentence. Sentences that end
 *   up with exactly the same score share a key, and the later sentence
 *   overwrites the earlier one.
 */
function scoreSentence(text) {
  const sentences = sbd.sentences(text);

  // Compressed size is the stand-in for information content; lower-casing
  // makes the comparison case-insensitive.
  const getCompressedSize = (sentence) => zlib.gzipSync(sentence.toLowerCase()).length;
  const compressed = sentences.map(getCompressedSize);

  const totals = new Array(sentences.length).fill(0);
  for (let p1 = 0; p1 < compressed.length; p1++) {
    for (let p2 = p1; p2 < compressed.length; p2++) {
      const smaller = Math.min(compressed[p1], compressed[p2]);
      const larger = Math.max(compressed[p1], compressed[p2]);
      const jointSize = getCompressedSize(sentences[p1] + sentences[p2]);
      const similarity = 1 - (jointSize - smaller) / larger;

      totals[p1] += similarity;
      // Each unordered pair is visited once, so credit both members; on the
      // diagonal both indices are the same sentence and adding twice would
      // double-count its self-similarity.
      if (p2 !== p1) {
        totals[p2] += similarity;
      }
    }
  }

  const zScores = calculateZScore(totals);

  const result = {};
  zScores.forEach((val, index) => {
    result[val] = sentences[index];
  });
  return result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment