Skip to content

Instantly share code, notes, and snippets.

@jasonrdsouza
Created September 24, 2011 02:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jasonrdsouza/1238868 to your computer and use it in GitHub Desktop.
Save jasonrdsouza/1238868 to your computer and use it in GitHub Desktop.
Function to tokenize string input
#!/usr/bin/python
class tokenizerState:
    """Integer state identifiers for the state machine in ``tokenize``."""

    # Consecutive integers, assigned in declaration order (START is 0).
    (
        START,      # between tokens; also the initial state
        BRACE,      # inside a [...] token
        BRACE_END,  # just saw ']' inside a [...] token; next char decides
        QUOTE,      # inside a "..." token
        QUOTE_END,  # just saw '"' inside a "..." token; next char decides
        CHAR,       # inside a bare (undelimited) token
    ) = range(6)
def tokenize(commandline):
    """Split a command line into a list of tokens.

    Tokens are separated by one or more space characters (only ``' '``;
    tabs and newlines are treated as ordinary characters). A token can be:

    * bare: every character up to the next space is taken literally;
    * bracketed: ``[...]``, where ``]]`` stands for a literal ``]``;
    * quoted: ``"..."``, where ``""`` stands for a literal ``"``.

    A ``[`` or ``"`` only opens a delimited token when it is the first
    character of that token; anywhere else it is an ordinary character.

    Problems are reported by printing a message, not by raising:

    * if a closing ``]`` or ``"`` is followed by anything other than a
      space or the same closing character, ``Error: invalid input`` is
      printed and scanning stops; the tokens collected so far, including
      the one that was just closed, are returned;
    * if the input ends inside an unclosed bracket or quote,
      ``Warning: early token termination`` is printed and the unfinished
      token is dropped.

    Args:
        commandline: The string to tokenize.

    Returns:
        The tokens in order of appearance. Empty or all-space input gives
        an empty list; an explicit empty token (``""`` or ``[]``) gives an
        empty string.
    """
    state = tokenizerState.START
    tokens = []
    pending = []  # characters of the token currently being built

    for c in commandline:
        if state == tokenizerState.START:
            if c == ' ':
                continue
            elif c == '[':
                state = tokenizerState.BRACE
            elif c == '"':
                state = tokenizerState.QUOTE
            else:
                pending.append(c)
                state = tokenizerState.CHAR
        elif state == tokenizerState.BRACE:
            if c == ']':
                # Either the end of the token or the first half of an
                # escaped ']' -- the next character decides.
                state = tokenizerState.BRACE_END
            else:
                pending.append(c)
        elif state == tokenizerState.BRACE_END:
            if c == ']':  # escaped char
                pending.append(c)
                state = tokenizerState.BRACE
            elif c == ' ':
                tokens.append(''.join(pending))
                pending = []
                state = tokenizerState.START
            else:
                print('Error: invalid input')
                break
        elif state == tokenizerState.QUOTE:
            if c == '"':
                # Either the end of the token or the first half of an
                # escaped '"' -- the next character decides.
                state = tokenizerState.QUOTE_END
            else:
                pending.append(c)
        elif state == tokenizerState.QUOTE_END:
            if c == '"':  # escaped char
                pending.append(c)
                state = tokenizerState.QUOTE
            elif c == ' ':
                tokens.append(''.join(pending))
                pending = []
                state = tokenizerState.START
            else:
                print('Error: invalid input')
                break
        elif state == tokenizerState.CHAR:
            if c == ' ':  # token done
                tokens.append(''.join(pending))
                pending = []
                state = tokenizerState.START
            else:
                pending.append(c)
        else:  # invalid state
            print('Error: Invalid state')
            break

    if state in (tokenizerState.QUOTE, tokenizerState.BRACE):
        print('Warning: early token termination')
    elif state != tokenizerState.START:
        # Flush the last token. Skipped in START, where nothing is pending
        # (empty input or trailing spaces), so no spurious '' is emitted.
        tokens.append(''.join(pending))
    return tokens
if __name__ == "__main__":
    # Self-check covering bare, bracketed and quoted tokens, the ']]' and
    # '""' escapes, and delimiters appearing in the middle of a bare token.
    sample = 'test1 -test2 [test3] "test4" [[test"5] [test]]6] "test""7" t"est[8]]'
    expected = [
        'test1',
        '-test2',
        'test3',
        'test4',
        '[test"5',
        'test]6',
        'test"7',
        't"est[8]]',
    ]
    assert tokenize(sample) == expected
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment