Skip to content

Instantly share code, notes, and snippets.

@PtrMan
Created July 5, 2015 14:56
Show Gist options
  • Save PtrMan/cb27555f68ea6f7711ea to your computer and use it in GitHub Desktop.
Save PtrMan/cb27555f68ea6f7711ea to your computer and use it in GitHub Desktop.
import re
# http://stackoverflow.com/questions/691148/pythonic-way-to-implement-a-tokenizer
class Scanner(object):
    """Tokenizer for pre-processed, lowercase text, built on ``re.Scanner``.

    Produces ``(kind, text)`` tuples where ``kind`` is one of ``"INTEGER"``,
    ``"IDENTIFIER"`` or ``"PUNCTUATION"``. Runs of whitespace are skipped.
    Identifiers consist of ``#``, ``a``-``z`` and ``_`` only, so uppercase
    letters (and any other character not listed in a rule) are unscannable.
    """

    def __init__(self):
        self._scanner = re.Scanner([
            (r"[0-9]+", lambda scanner, token: ("INTEGER", token)),
            (r"[#a-z_]+", lambda scanner, token: ("IDENTIFIER", token)),
            (r"[,.]+", lambda scanner, token: ("PUNCTUATION", token)),
            (r"\s+", None),  # None == skip token.
        ])

    def scan(self, string, strict=False):
        """Split *string* into a list of ``(kind, text)`` tokens.

        Scanning stops at the first character no rule matches. By default
        the tokens found up to that point are returned and the rest of the
        input is dropped (the historical behaviour). Pass ``strict=True``
        to get a ``ValueError`` instead of a silently truncated result.
        """
        results, remainder = self._scanner.scan(string)
        if strict and remainder:
            raise ValueError("unscannable input at: %r" % (remainder,))
        return results
class WordLookupTable(object):
    """Maps each distinct word to a consecutive integer index, starting at 0.

    Indices are handed out in order of first insertion and never change.
    """

    def __init__(self):
        self._dictionary = {}

    def addIfNew(self, string):
        """Register *string* under the next free index; no-op if already known."""
        # The next free index always equals the number of words stored so
        # far, so no separate counter is needed.
        self._dictionary.setdefault(string, len(self._dictionary))

    def lookup(self, string):
        """Return the index assigned to *string*.

        Raises ``KeyError`` if *string* was never added.
        """
        return self._dictionary[string]
class X(object):
    """Demo driver that replaces words by their index in a word lookup table.

    Words are scanned into tokens; every identifier token is registered in
    the lookup table and rewritten as a ``("HASH", index)`` token.
    """

    def __init__(self):
        self._knownWordLookupTable = WordLookupTable()
        self._scanner = Scanner()

    def anonymous0(self):
        """Register the verbs and articles and print each encoded pair.

        Prints one line per (verb, article) combination in the form
        ``"#<verb index> #<article index>"``.
        """
        isWasWere = ["is", "was", "were"]
        anAThe = ["a", "an", "the"]
        template = "{a} {b}"

        for verb in isWasWere:
            # Registering the verb before its articles keeps the index
            # assignment order: is, a, an, the, was, were.
            self._knownWordLookupTable.addIfNew(verb)
            encodedVerb = self.toEncodedString(verb)
            for article in anAThe:
                self._knownWordLookupTable.addIfNew(article)
                encodedArticle = self.toEncodedString(article)
                print(template.format(a=encodedVerb, b=encodedArticle))

    def processInput(self, string):
        """Scan *string*, translate identifiers to hash tokens, print the result."""
        tokensFromScanner = self._scanner.scan(string)
        tokensAfterHashTranslation = self._translateIdentifierTokensToWordHashTokens(tokensFromScanner)
        print(tokensAfterHashTranslation)

    def toEncodedString(self, string):
        """Return ``"#<index>"`` for an already registered word.

        Raises whatever the lookup table raises for an unknown word.
        """
        return "#" + str(self._knownWordLookupTable.lookup(string))

    # NOTE: cannot be made static as-is; it reads and updates the instance's
    # word lookup table.
    def _translateIdentifierTokensToWordHashTokens(self, tokens):
        """Return a new token list with identifiers replaced by hash tokens.

        Each ``("IDENTIFIER", word)`` becomes ``("HASH", index)``, registering
        the word first if it is new. All other tokens are passed through
        unchanged. Every element of the result is a ``(kind, value)`` pair.
        """
        resultTokens = []
        for kind, value in tokens:
            if kind == "IDENTIFIER":
                self._knownWordLookupTable.addIfNew(value)
                hashIndex = self._knownWordLookupTable.lookup(value)
                # append, not +=: extending a list with a tuple would splice
                # in its two elements and lose the pair structure.
                resultTokens.append(("HASH", hashIndex))
            else:
                resultTokens.append((kind, value))
        return resultTokens
# Demo run: create the driver and print the encoded verb/article pairs.
x = X()
x.anonymous0()

# Source sentence:
#   "God Is an Astronaut is a band from the Glen of the Downs, County Wicklow, Ireland."
# The input below is that sentence after some classical formatting and after
# markup/link etc. preprocessing (the names appear to have been replaced by
# "##" placeholders).
preprocessedSentence = "## is a band from the ##, ##, ireland."
x.processInput(preprocessedSentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment