Skip to content

Instantly share code, notes, and snippets.

@mohamed-ali
Created January 4, 2014 13:34
Show Gist options
  • Save mohamed-ali/8255392 to your computer and use it in GitHub Desktop.
Save mohamed-ali/8255392 to your computer and use it in GitHub Desktop.
A simple Python lexer for HTML
import ply.lex as lex
# Token types this lexer can emit.
tokens = (
    'LANGLE',       # <
    'LANGLESLASH',  # </
    'RANGLE',       # >
    'EQUAL',        # =
    'STRING',       # "hello"
    'WORD',         # Welcome!
)

# Extra lexer states. PLY only binds the t_htmlcomment_* rules to the
# 'htmlcomment' state when that state is declared here; without the
# declaration they are read as ordinary INITIAL-state rules and the lexer
# cannot be built. The state is exclusive so that none of the normal token
# rules apply while inside an HTML comment.
states = (
    ('htmlcomment', 'exclusive'),
)

# Characters skipped between tokens in the default state (spaces only).
t_ignore = ' '
# Start of an HTML comment: switch the lexer into the 'htmlcomment' state.
# Nothing is returned, so no token is produced for the '<!--' marker itself.
def t_htmlcomment(token):
    r'<!--'
    lexer = token.lexer
    lexer.begin('htmlcomment')
# End of an HTML comment: add any newlines found in the matched text to the
# line counter, then hand control back to the default 'INITIAL' state.
# Nothing is returned, so no token is produced for the '-->' marker.
def t_htmlcomment_end(token):
    r'-->'
    lexer = token.lexer
    newlines = token.value.count('\n')
    lexer.lineno = lexer.lineno + newlines
    lexer.begin('INITIAL')
# Error rule intended for the 'htmlcomment' state: any input that is not the
# closing '-->' ends up here and is dropped one character at a time.
def t_htmlcomment_error(token):
    # For an error rule PLY sets token.value to the remaining input, so its
    # first character is the one about to be skipped. Count it when it is a
    # newline; otherwise line numbers after a multi-line comment are too low.
    if token.value[:1] == '\n':
        token.lexer.lineno += 1
    token.lexer.skip(1)
# Newline counter: a line break produces no token (nothing is returned); it
# only advances the lexer's line number by one.
def t_newline(token):
    r'\n'
    lexer = token.lexer
    lexer.lineno = lexer.lineno + 1
# '</' -- opens a closing tag.
# Keep this rule above t_LANGLE: PLY tries function rules in definition
# order, so the two-character '</' has to be offered before the bare '<'.
def t_LANGLESLASH(token):
    r'</'
    return token
# '<' -- opens a tag. The token is returned unchanged.
def t_LANGLE(token):
    r'<'
    return token
# '>' -- closes a tag. The token is returned unchanged.
def t_RANGLE(token):
    r'>'
    return token
# '=' -- the sign between an attribute name and its value. The token is
# returned unchanged.
def t_EQUAL(token):
    r'='
    return token
# Double-quoted string such as "hello". The pattern has no escape handling,
# so the string always ends at the next '"'. The surrounding quotes are
# stripped, leaving only the text between them as the token value.
def t_STRING(token):
    r'"[^"]*"'
    quoted = token.value
    token.value = quoted[1:-1]
    return token
# Word: a run of characters up to the next space, angle bracket, newline or
# '='. The '=' is excluded so that an unspaced attribute such as href="x"
# splits into WORD, EQUAL, STRING instead of being swallowed as one WORD.
def t_WORD(token):
    r'[^ <>\n=]+'
    return token
# How to test: tokenize a small sample page and print every token.
webpage = "this is <b>my</b> webpage!"
htmllexer = lex.lex()
htmllexer.input(webpage)
# token() returns None once the input is exhausted, which ends the loop.
for tok in iter(htmllexer.token, None):
    # print() with a single argument runs on both Python 2 and Python 3;
    # the bare `print tok` statement is a SyntaxError on Python 3.
    print(tok)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment