Created
September 24, 2011 02:18
-
-
Save jasonrdsouza/1238868 to your computer and use it in GitHub Desktop.
Function to tokenize string input
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
class tokenizerState:
    """Enumeration of states for the tokenize() state machine."""
    START = 0      # outside of any token (also the initial state)
    BRACE = 1      # inside a [...]-delimited token
    BRACE_END = 2  # saw ']' inside a brace token; ']]' escapes a literal ']'
    QUOTE = 3      # inside a "..."-delimited token
    QUOTE_END = 4  # saw '"' inside a quoted token; '""' escapes a literal '"'
    CHAR = 5       # inside a bare, unquoted token


def tokenize(commandline):
    """Split a command-line string into a list of tokens.

    Tokens are separated by runs of spaces.  A token may be wrapped in
    square brackets or double quotes, in which case spaces inside it are
    preserved and the closing delimiter can be escaped by doubling it
    (']]' yields a literal ']', '""' a literal '"').

    On malformed input (a character other than a space immediately after
    a closing delimiter) an error is printed and scanning stops early.
    If the input ends inside an unterminated bracket/quote token, a
    warning is printed and that partial token is discarded.

    Returns the list of token strings.
    """
    currState = tokenizerState.START
    tokens = []
    temp_token = ''
    for c in commandline:
        if currState == tokenizerState.START:
            if c == ' ':
                continue  # skip separator runs between tokens
            elif c == '[':
                currState = tokenizerState.BRACE
            elif c == '"':
                currState = tokenizerState.QUOTE
            else:
                temp_token += c
                currState = tokenizerState.CHAR
        elif currState == tokenizerState.BRACE:
            if c == ']':
                currState = tokenizerState.BRACE_END
            else:
                temp_token += c
        elif currState == tokenizerState.BRACE_END:
            if c == ']':  # ']]' -> escaped literal ']'
                temp_token += c
                currState = tokenizerState.BRACE
            elif c == ' ':  # brace token finished
                tokens.append(temp_token)
                temp_token = ''
                currState = tokenizerState.START
            else:
                print('Error: invalid input')
                break
        elif currState == tokenizerState.QUOTE:
            if c == '"':
                currState = tokenizerState.QUOTE_END
            else:
                temp_token += c
        elif currState == tokenizerState.QUOTE_END:
            if c == '"':  # '""' -> escaped literal '"'
                temp_token += c
                currState = tokenizerState.QUOTE
            elif c == ' ':  # quoted token finished
                tokens.append(temp_token)
                temp_token = ''
                currState = tokenizerState.START
            else:
                print('Error: invalid input')
                break
        elif currState == tokenizerState.CHAR:
            if c == ' ':  # bare token finished
                tokens.append(temp_token)
                temp_token = ''
                currState = tokenizerState.START
            else:
                temp_token += c
        else:  # unreachable unless new states are added without handling
            print('Error: Invalid state')
            break
    if currState in (tokenizerState.QUOTE, tokenizerState.BRACE):
        print('Warning: early token termination')
    elif currState != tokenizerState.START:
        # Flush the final in-progress token.  (The original appended
        # unconditionally, producing a spurious '' token for empty input
        # or input ending in a space.)
        tokens.append(temp_token)
    return tokens
if __name__ == "__main__": | |
test_str = 'test1 -test2 [test3] "test4" [[test"5] [test]]6] "test""7" t"est[8]]' | |
test_result = ['test1', '-test2', 'test3', 'test4', '[test"5', 'test]6', 'test"7', 't"est[8]]'] | |
assert tokenize(test_str) == test_result | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment