Skip to content

Instantly share code, notes, and snippets.

@Priyansh2
Created September 10, 2018 10:59
Show Gist options
  • Save Priyansh2/4e17d096e1692c8a8746d9a8f336f67c to your computer and use it in GitHub Desktop.
Save Priyansh2/4e17d096e1692c8a8746d9a8f336f67c to your computer and use it in GitHub Desktop.
ngrams and their counts with 'bisect' python2
import bisect
class WordList( object ):
"""Leaf-level is list of words and counts."""
def __init__( self ):
self.words= [ ('\xff-None-',0) ]
def count( self, wordTuple ):
assert len(wordTuple)==1
word= wordTuple[0]
loc= bisect.bisect_left( self.words, word )
if self.words[loc][0] != word:
self.words.insert( loc, (word,0) )
self.words[loc]= ( word, self.words[loc][1]+1 )
def getWords( self ):
return self.words[:-1]
class WordTree( object ):
"""Above non-leaf nodes are words and either trees or lists."""
def __init__( self ):
self.words= [ ('\xff-None-',None) ]
def count( self, wordTuple ):
head, tail = wordTuple[0], wordTuple[1:]
loc= bisect.bisect_left( self.words, head )
if self.words[loc][0] != head:
if len(tail) == 1:
newList= WordList()
else:
newList= WordTree()
self.words.insert( loc, (head,newList) )
self.words[loc][1].count( tail )
def getWords( self ):
return self.words[:-1]
t = WordTree()
for a in ( ('the','quick','brown'), ('the','quick','fox'),('hilary','clinton','obama') ):
t.count(a)
for w1,wt1 in t.getWords():
print w1
for w2,wt2 in wt1.getWords():
print w2
for w3 in wt2.getWords():
print w3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment