Created
September 24, 2011 02:18
-
-
Save jasonrdsouza/1238868 to your computer and use it in GitHub Desktop.
Function to tokenize string input
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
class tokenizerState:
    """Enumeration of states for the tokenize() state machine."""
    START = 0      # outside of any token (also the initial state)
    BRACE = 1      # inside a [...]-delimited token
    BRACE_END = 2  # saw ']' inside a brace token; ']]' escapes a literal ']'
    QUOTE = 3      # inside a "..."-delimited token
    QUOTE_END = 4  # saw '"' inside a quoted token; '""' escapes a literal '"'
    CHAR = 5       # inside a bare, unquoted token


def tokenize(commandline):
    """Split a command-line string into a list of tokens.

    Tokens are separated by runs of spaces.  A token may be wrapped in
    square brackets or double quotes, in which case spaces inside it are
    preserved and the closing delimiter can be escaped by doubling it
    (']]' yields a literal ']', '""' a literal '"').

    On malformed input (a character other than a space immediately after
    a closing delimiter) an error is printed and scanning stops early.
    If the input ends inside an unterminated bracket/quote token, a
    warning is printed and that partial token is discarded.

    Returns the list of token strings.
    """
    currState = tokenizerState.START
    tokens = []
    temp_token = ''
    for c in commandline:
        if currState == tokenizerState.START:
            if c == ' ':
                continue  # skip separator runs between tokens
            elif c == '[':
                currState = tokenizerState.BRACE
            elif c == '"':
                currState = tokenizerState.QUOTE
            else:
                temp_token += c
                currState = tokenizerState.CHAR
        elif currState == tokenizerState.BRACE:
            if c == ']':
                currState = tokenizerState.BRACE_END
            else:
                temp_token += c
        elif currState == tokenizerState.BRACE_END:
            if c == ']':  # ']]' -> escaped literal ']'
                temp_token += c
                currState = tokenizerState.BRACE
            elif c == ' ':  # brace token finished
                tokens.append(temp_token)
                temp_token = ''
                currState = tokenizerState.START
            else:
                print('Error: invalid input')
                break
        elif currState == tokenizerState.QUOTE:
            if c == '"':
                currState = tokenizerState.QUOTE_END
            else:
                temp_token += c
        elif currState == tokenizerState.QUOTE_END:
            if c == '"':  # '""' -> escaped literal '"'
                temp_token += c
                currState = tokenizerState.QUOTE
            elif c == ' ':  # quoted token finished
                tokens.append(temp_token)
                temp_token = ''
                currState = tokenizerState.START
            else:
                print('Error: invalid input')
                break
        elif currState == tokenizerState.CHAR:
            if c == ' ':  # bare token finished
                tokens.append(temp_token)
                temp_token = ''
                currState = tokenizerState.START
            else:
                temp_token += c
        else:  # unreachable unless new states are added without handling
            print('Error: Invalid state')
            break
    if currState in (tokenizerState.QUOTE, tokenizerState.BRACE):
        print('Warning: early token termination')
    elif currState != tokenizerState.START:
        # Flush the final in-progress token.  (The original appended
        # unconditionally, producing a spurious '' token for empty input
        # or input ending in a space.)
        tokens.append(temp_token)
    return tokens
if __name__ == "__main__": | |
test_str = 'test1 -test2 [test3] "test4" [[test"5] [test]]6] "test""7" t"est[8]]' | |
test_result = ['test1', '-test2', 'test3', 'test4', '[test"5', 'test]6', 'test"7', 't"est[8]]'] | |
assert tokenize(test_str) == test_result | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment