Skip to content

Instantly share code, notes, and snippets.

@davidguttman
Created April 26, 2016 22:58
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save davidguttman/7fdc16f496f6cc26585dcede5e987ba2 to your computer and use it in GitHub Desktop.
Save davidguttman/7fdc16f496f6cc26585dcede5e987ba2 to your computer and use it in GitHub Desktop.
Split text without spaces into list of words
// port of http://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words
var _ = require('lodash')
var fs = require('fs')
var tape = require('tape')
var dictStr = fs.readFileSync(__dirname + '/dump/words-by-freq.txt', 'utf8')
var wordsByFreq = dictStr.split('\n')
var maxWord = 0
var wordCost = {}
wordsByFreq.forEach(function (w, i) {
wordCost[w] = Math.log( (i + 1) * Math.log(wordsByFreq.length) )
if (w.length > maxWord) maxWord = w.length
})
tape('test 1', function (t) {
var test = 'yourbestcreditcardoptionsaustralia'
var expected = 'your best credit card options australia'
t.deepEqual(inferSpaces(test), expected)
t.end()
})
tape('test 2', function (t) {
var test = 'finddentalimplantsoptions'
var expected = 'find dental implants options'
t.deepEqual(inferSpaces(test), expected)
t.end()
})
tape('test 3', function (t) {
var test = 'fantasticstudentgrantsearch'
var expected = 'fantastic student grant search'
t.deepEqual(inferSpaces(test), expected)
t.end()
})
function inferSpaces (blob) {
function bestMatch (nChar) {
var cStart = _.max(0, nChar - maxWord)
var candidates = cost.slice( cStart, nChar).reverse()
return _.min(
candidates.map(function (cost, iCandi) {
var bStart = nChar - iCandi - 1
var substr = blob.slice(bStart, nChar)
var wc = wordCost[substr] || Infinity
return [ cost + wc, iCandi + 1 ]
}),
0
)
}
var cost = [0]
for (var i = 1; i < blob.length + 1; i++) {
var m = bestMatch(i)
cost.push(m[0])
}
var out = []
var iBlob = blob.length
while (iBlob > 0) {
var match = bestMatch(iBlob)
var c = match[0]
var k = match[1]
if (c != cost[iBlob]) throw new Error('wtf?')
var oStart = iBlob - k
var oEnd = iBlob
var substr = blob.slice(oStart, oEnd)
out.push(substr)
iBlob -= k
}
return out.reverse().join(' ')
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment