Skip to content

Instantly share code, notes, and snippets.

@dcolish
Forked from ramen/phplex.py
Created July 21, 2010 16:33
Show Gist options
  • Save dcolish/484731 to your computer and use it in GitHub Desktop.
Save dcolish/484731 to your computer and use it in GitHub Desktop.
# ----------------------------------------------------------------------
# phplex.py
#
# A lexer for PHP.
# ----------------------------------------------------------------------
import ply.lex as lex
# todo: literal html
# todo: double-quoted strings
# todo: number literals (LNUMBER, DNUMBER)
# todo: heredocs
# todo: backticks
# todo: namespaces
# todo: casts
# todo: "die" as alias for "exit"
# todo: BAD_CHARACTER
# todo: CURLY_OPEN, DOLLAR_OPEN_CURLY_BRACES, STRING_VARNAME
# todo: <script> syntax (does anyone use this?)
# todo: HALT_COMPILER (??)
# Reserved words
# PHP keywords, spelled in upper case. Each name doubles as its own token
# type: the tuple is prepended to `tokens` and copied into `reserved_map`,
# where t_STRING looks identifiers up after upper-casing them.
reserved = (
    'ARRAY', 'AS',
    'BREAK',
    'CASE', 'CLASS', 'CONST', 'CONTINUE',
    'DECLARE', 'DEFAULT', 'DO',
    'ECHO', 'ELSE', 'ELSEIF', 'EMPTY',
    'ENDDECLARE', 'ENDFOR', 'ENDFOREACH', 'ENDIF', 'ENDSWITCH', 'ENDWHILE',
    'EVAL', 'EXIT', 'EXTENDS',
    'FOR', 'FOREACH', 'FUNCTION',
    'GLOBAL',
    'IF', 'INCLUDE', 'INCLUDE_ONCE', 'INSTANCEOF', 'ISSET',
    'LIST',
    'NEW',
    'PRINT',
    'REQUIRE', 'REQUIRE_ONCE', 'RETURN',
    'STATIC', 'SWITCH',
    'UNSET', 'USE',
    'VAR',
    'WHILE',
    # The remaining entries are not in alphabetical order; the original
    # ordering is kept as-is.
    'FINAL', 'INTERFACE', 'IMPLEMENTS',
    'PUBLIC', 'PRIVATE', 'PROTECTED',
    'ABSTRACT', 'CLONE',
    'TRY', 'CATCH', 'THROW',
    'CFUNCTION', 'OLD_FUNCTION',
)
# Every token type this lexer can emit: the reserved words followed by the
# generic, operator, comment, tag, identifier and literal token names.
tokens = reserved + (
    # Generic
    'WHITESPACE', 'OP',

    # Operators
    'SL', 'SR', 'BOOLEAN_OR', 'BOOLEAN_AND', 'IS_SMALLER_OR_EQUAL',
    'IS_GREATER_OR_EQUAL', 'IS_EQUAL', 'IS_NOT_EQUAL', 'IS_IDENTICAL',
    'IS_NOT_IDENTICAL',

    # Assignment operators
    'MUL_EQUAL', 'DIV_EQUAL', 'MOD_EQUAL', 'PLUS_EQUAL', 'MINUS_EQUAL',
    'SL_EQUAL', 'SR_EQUAL', 'AND_EQUAL', 'OR_EQUAL', 'XOR_EQUAL',
    'CONCAT_EQUAL',

    # Increment/decrement
    'INC', 'DEC',

    # Arrows
    'OBJECT_OPERATOR', 'DOUBLE_ARROW', 'DOUBLE_COLON',

    # Comments
    'COMMENT', 'DOC_COMMENT',

    # Escaping from HTML
    # NOTE: the comma after 'CLOSE_TAG' is required. Without it Python joins
    # the adjacent literals into the single bogus name 'CLOSE_TAGDIR', and
    # neither 'CLOSE_TAG' nor 'DIR' ends up declared.
    'OPEN_TAG', 'OPEN_TAG_WITH_ECHO', 'CLOSE_TAG',

    # Identifiers and reserved words
    'DIR', 'FILE', 'LINE', 'FUNC_C', 'CLASS_C', 'METHOD_C', 'NS_C',
    'LOGICAL_AND', 'LOGICAL_OR', 'LOGICAL_XOR',
    'STRING', 'VARIABLE',
    'LNUMBER', 'DNUMBER',
    'CONSTANT_ENCAPSED_STRING',
)
# Whitespace (spaces, tabs and newlines; newlines advance the line counter)
def t_WHITESPACE(t):
    r'[ \t\r\n]+'
    # A whitespace run may span several lines; advance the lexer's line
    # counter by the number of newlines it contains. The token itself is
    # returned rather than discarded.
    newline_count = t.value.count("\n")
    t.lexer.lineno += newline_count
    return t
# Assignment operators
# Compound assignment rules. Each function's docstring is its PLY pattern,
# so it must stay a bare regex. These rules are defined ahead of the shorter
# operator rules that follow them in this file (e.g. '<<=' before '<<').
def t_SL_EQUAL(t):
    r'<<='
    return t


def t_SR_EQUAL(t):
    r'>>='
    return t


def t_AND_EQUAL(t):
    r'&='
    return t


def t_OR_EQUAL(t):
    r'\|='
    return t


def t_XOR_EQUAL(t):
    r'\^='
    return t


def t_MUL_EQUAL(t):
    r'\*='
    return t


def t_DIV_EQUAL(t):
    r'/='
    return t


def t_MOD_EQUAL(t):
    r'%='
    return t


def t_PLUS_EQUAL(t):
    r'\+='
    return t


def t_MINUS_EQUAL(t):
    r'-='
    return t


def t_CONCAT_EQUAL(t):
    r'\.='
    return t
# Operators
# Shift, boolean and comparison operator rules. The docstring of each
# function is its pattern. Within this group the three-character rules
# ('===', '!==') are defined before their two-character prefixes ('==', '!=').
def t_SL(t):
    r'<<'
    return t


def t_SR(t):
    r'>>'
    return t


def t_BOOLEAN_AND(t):
    r'&&'
    return t


def t_BOOLEAN_OR(t):
    r'\|\|'
    return t


def t_IS_SMALLER_OR_EQUAL(t):
    r'<='
    return t


def t_IS_GREATER_OR_EQUAL(t):
    r'>='
    return t


def t_IS_IDENTICAL(t):
    r'==='
    return t


def t_IS_NOT_IDENTICAL(t):
    r'!=='
    return t


def t_IS_EQUAL(t):
    r'=='
    return t


def t_IS_NOT_EQUAL(t):
    r'(!=)|(<>)'
    return t
# Increment/decrement
# '++' and '--'. The docstring is the rule's pattern.
def t_INC(t):
    r'\+\+'
    return t


def t_DEC(t):
    r'--'
    return t
# Arrows
# '->', '=>' and '::'. The docstring is the rule's pattern.
def t_OBJECT_OPERATOR(t):
    r'->'
    return t


def t_DOUBLE_ARROW(t):
    r'=>'
    return t


def t_DOUBLE_COLON(t):
    r'::'
    return t
# Comments
# Doc comment: "/**" followed by whitespace, up to the first "*/".
#
# The whitespace after the opener is required. Without it the empty comment
# "/**/" was treated as an opened "/**" whose closing "*/" had not been seen
# yet, so the match ran on to the next "*/" further down the input and
# swallowed everything in between. With the requirement in place, "/**/" and
# "/**x*/" are left for the plain t_COMMENT rule defined right after this
# one, which matches them as ordinary block comments.
def t_DOC_COMMENT(t):
    r'/\*\*\s(.|\n)*?\*/'
    # Keep the line counter in step with multi-line comments.
    t.lexer.lineno += t.value.count("\n")
    return t
# Plain comments: "/* ... */" blocks, and "//" or "#" line comments.
#
# A line comment runs to the end of the line and includes the terminating
# newline when there is one. The newline is optional so that a comment on the
# last line of the input, with no trailing newline, is still recognised
# instead of falling through to the error handler. '#' is written as '\#'
# because the pattern contains it outside a character class.
def t_COMMENT(t):
    r'(/\*(.|\n)*?\*/)|(//[^\n]*\n?)|(\#[^\n]*\n?)'
    # Block comments may span lines and line comments may carry their newline.
    t.lexer.lineno += t.value.count("\n")
    return t
# Escaping from HTML
# Opening tag: "<?", "<%", "<?php", "<%php", "<?=" or "<%=", plus at most one
# newline directly after it. The "=" forms are re-typed as
# OPEN_TAG_WITH_ECHO.
def t_OPEN_TAG(t):
    r'<[?%]((php)|=)?\n?'
    # The pattern may have consumed a newline after the tag, so strip it
    # before testing for '='; otherwise "<?=\n" would be typed as OPEN_TAG.
    if t.value.rstrip('\n').endswith('='):
        t.type = 'OPEN_TAG_WITH_ECHO'
    t.lexer.lineno += t.value.count("\n")
    return t
def t_CLOSE_TAG(t):
    r'[?%]>\n?'
    # The pattern takes at most one newline right after the tag; count it so
    # the lexer's line number stays accurate.
    swallowed_newlines = t.value.count("\n")
    t.lexer.lineno += swallowed_newlines
    return t
# Identifiers and reserved words
# Maps an upper-cased identifier to the token type t_STRING should report.
# The explicit entries cover names whose token type differs from their
# spelling: the magic constants and the word forms of the logical operators.
reserved_map = {
    '__DIR__': 'DIR',
    '__FILE__': 'FILE',
    '__LINE__': 'LINE',
    '__FUNCTION__': 'FUNC_C',
    '__CLASS__': 'CLASS_C',
    '__METHOD__': 'METHOD_C',
    '__NAMESPACE__': 'NS_C',
    'AND': 'LOGICAL_AND',
    'OR': 'LOGICAL_OR',
    'XOR': 'LOGICAL_XOR',
}

# Every reserved word is its own token type.
reserved_map.update((keyword, keyword) for keyword in reserved)
# Identifier
def t_STRING(t):
    r'[A-Za-z_][\w_]*'
    # The lookup key is upper-cased, so keywords match regardless of case.
    # Anything not found in reserved_map is a plain STRING identifier.
    lookup_key = t.value.upper()
    t.type = reserved_map.get(lookup_key, 'STRING')
    return t
# Variable
# A '$' followed by an identifier. The token value keeps the leading '$'.
def t_VARIABLE(t):
    r'\$[A-Za-z_][\w_]*'
    return t
# Number literals (todo: still incomplete).
#
# t_DNUMBER is deliberately defined BEFORE t_LNUMBER. PLY tries function
# rules in the order they are defined, and both patterns start with "\d+".
# With the integer rule first, "1.5" was split into LNUMBER "1", OP "." and
# LNUMBER "5", and the floating-point rule could never match. The patterns
# themselves are unchanged.

# Floating literal (todo)
def t_DNUMBER(t):
    r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
    return t


# Integer literal (todo)
def t_LNUMBER(t):
    r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
    return t
# String literal
# A double- or single-quoted string on one line. Inside the quotes a
# backslash must be followed by another character, so an escaped quote does
# not end the string; a raw newline is not accepted.
def t_CONSTANT_ENCAPSED_STRING(t):
    r'(\"([^\\\n]|(\\.))*?\")|(\'([^\\\n]|(\\.))*?\')'
    return t
# Simple operator
# Catch-all for single-character operators and punctuation that no earlier
# rule claimed.
#
# The hyphen is escaped. It used to sit unescaped between '+' and '/', which
# made "+-/" a character range; that range happened to cover exactly
# '+', ',', '-', '.', '/', so the set of matched characters is unchanged.
def t_OP(t):
    r'[\(\)\{\}\[\]+\-/*%^&|~=<>.!,?:;@]'
    t.type = 'OP'
    return t
def t_error(t):
    # Report the first character of the unmatched input, then step over that
    # one character so lexing can continue.
    offending = t.value[0]
    print("Illegal character %s" % repr(offending))
    t.lexer.skip(1)
# Build the lexer at import time, with PLY's optimize flag switched on.
lexer = lex.lex(optimize=1)


if __name__ == "__main__":
    # Run as a script: hand the lexer to PLY's runmain helper.
    lex.runmain(lexer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment