@popey456963
Created November 10, 2016 21:33
Word Spacer
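A small script that re-inserts spaces into a run-together string: it enumerates every way of splitting the input into words known to the NLTK Brown corpus, then returns the segmentation with the highest frequency-based score.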
from nltk.corpus import brown
from nltk.probability import FreqDist
import copy

def findmostlikelytokens(testsentence, fq):
    # Breadth-first search over every way of splitting the string into
    # substrings that appear in the frequency distribution.
    stack = [([], testsentence)]
    resultlist = []
    while len(stack) > 0:
        currentlist, sentence = stack.pop(0)
        if len(sentence) == 0:
            resultlist.append(currentlist)
            continue
        for i in range(1, len(sentence) + 1):
            if sentence[0:i] in fq:
                newlist = copy.deepcopy(currentlist)
                newlist.append(sentence[0:i])
                stack.append((newlist, sentence[i:]))
    if not resultlist:
        return []  # no segmentation into known words exists
    # Rank the candidate segmentations and return the highest-scoring one.
    finallist = sorted(resultlist, key=lambda x: scorelist(x, fq), reverse=True)
    return finallist[0]

def scorelist(liste, fq):
    # Weight each word by its corpus frequency raised to its length, so a few
    # long, common words outscore many short fragments; average over the list.
    summe = 0
    for value in liste:
        summe += fq[value] ** len(value)
    return summe / len(liste)

if __name__ == "__main__":
    # Frequency distribution over the first million lower-cased Brown corpus words.
    fq = FreqDist(word.lower() for word in brown.words()[:1000000])
    print(fq["m"])  # quick sanity check that the distribution loaded
    while True:
        print(findmostlikelytokens(input("Next sentence to tokenize: "), fq))
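Usage note (a sketch, assuming the Brown corpus has already been downloaded with nltk.download('brown')): running the script and typing a string with the spaces removed should print the most likely word list, e.g. an interactive session might look something like

Next sentence to tokenize: thisisatest
['this', 'is', 'a', 'test']

The length-weighted scoring in scorelist is what keeps the search from favouring segmentations made of many one- and two-letter words, which are individually very frequent in the corpus.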