Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Last active April 3, 2016 05:02
Show Gist options
  • Save bicycle1885/c33a0c4265e61094f42e70e47ae9dd90 to your computer and use it in GitHub Desktop.
Save bicycle1885/c33a0c4265e61094f42e70e47ae9dd90 to your computer and use it in GitHub Desktop.
Toy Regular Expression Engine Written in Julia
# Toy Regular Expression Engine Written in Julia
# ==============================================
# Example
# -------
#
# julia> include("re.jl")
# run (generic function with 1 method)
#
# julia> ast = parse("foo*|ba*r")
# AST(:concat,Any[AST(:alt,[AST(:concat,Any[AST(:char,['f']),AST(:char,['o']),AST(:star,[AST(:char,['o'])])
# ]),AST(:concat,Any[AST(:char,['b']),AST(:star,[AST(:char,['a'])]),AST(:char,['r'])])])])
#
# julia> code = compile(ast)
# 13-element Array{Inst,1}:
# push 8
# char f
# char o
# push 7
# char o
# jump 4
# jump 13
# char b
# push 12
# char a
# jump 9
# char r
# match
#
# julia> run(code, "fooooooo")
# true
#
# julia> run(code, "baaaaar")
# true
#
# julia> run(code, "br")
# true
#
# julia> run(code, "baz")
# false
# Parser
# ------
# abstract syntax tree for regular expressions
type AST
head::Symbol
args::Vector
end
# parser of PCRE-like regular expression strings
function parse(pattern::AbstractString)
return parserec(pattern, start(pattern))[1]
end
function parserec(pattern, s)
args = []
while !done(pattern, s)
c, s = next(pattern, s)
if c == '*'
arg = pop!(args)
push!(args, AST(:star, [arg]))
elseif c == '|'
arg1 = AST(:concat, args)
arg2, s = parserec(pattern, s)
args = []
push!(args, AST(:alt, [arg1, arg2]))
else
push!(args, AST(:char, [c]))
end
end
return AST(:concat, args), s
end
# Virtual Machine
# ---------------
# instruction type
bitstype 32 Inst
const CharTag = UInt32(0b00) << 30
const PushTag = UInt32(0b01) << 30
const JumpTag = UInt32(0b10) << 30
const MatchTag = UInt32(0b11) << 30
# instruction constructors
char(c::Char) = reinterpret(Inst, CharTag | UInt32(c))
push(l::Int) = reinterpret(Inst, PushTag | UInt32(l))
jump(l::Int) = reinterpret(Inst, JumpTag | UInt32(l))
match() = reinterpret(Inst, MatchTag)
# accessors
tag(inst::Inst) = reinterpret(UInt32, inst) & (UInt32(0b11) << 30)
operand(inst::Inst) = reinterpret(UInt32, inst) & ((UInt32(1) << 30) - one(UInt32))
function Base.show(io::IO, inst::Inst)
t = tag(inst)
op = operand(inst)
if t == CharTag
print(io, "char ", Char(op))
elseif t == PushTag
print(io, "push ", Int(op))
elseif t == JumpTag
print(io, "jump ", Int(op))
elseif t == MatchTag
print(io, "match")
end
end
# compile an AST object to a vector of instructions
function compile(ast::AST)
code = Inst[]
compilerec!(code, ast)
push!(code, match())
return code
end
function compilerec!(code, ast)
head = ast.head
args = ast.args
if head == :char
push!(code, char(args[1]))
elseif head == :star
push!(code, push(0))
l = length(code)
compilerec!(code, args[1])
push!(code, jump(l))
code[l] = push(length(code) + 1)
elseif head == :alt
push!(code, push(0))
l = length(code)
compilerec!(code, args[1])
push!(code, jump(0))
code[l] = push(length(code) + 1)
l = length(code)
compilerec!(code, args[2])
code[l] = jump(length(code) + 1)
elseif head == :concat
for arg in args
compilerec!(code, arg)
end
end
return code
end
# run a VM encoed in a vector of instructions
# This function returns `true` if `str` is accepted; otherwise returns `false`.
function run(code::Vector{Inst}, str::AbstractString)
threads = [(1, start(str))]
while !isempty(threads)
pc::Int, s = pop!(threads)
while true
inst = code[pc]
t = tag(inst)
op = operand(inst)
if t == CharTag
if done(str, s)
break
end
c, s = next(str, s)
if c == op
pc += 1
else
break
end
elseif t == PushTag
push!(threads, (op, s))
pc += 1
elseif t == JumpTag
pc = op
elseif t == MatchTag
return true
end
end
end
return false
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment