Skip to content

Commit

Permalink
use bitmap to check membership (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
bicycle1885 authored Dec 4, 2017
1 parent f82e2ff commit ceb332b
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 13 deletions.
24 changes: 20 additions & 4 deletions benchmark/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ end
# ------

println("Case 1 ", raw"([A-z]*\r?\n)*")
println("PCRE: ", @benchmark ismatch($(r"^(:?[A-z]*\r?\n)*$"), data))
match(data) = ismatch(r"^(:?[A-z]*\r?\n)*$", data)
@assert match(data)
println("PCRE: ", @benchmark match(data))

machine = Automa.compile(re"([A-z]*\r?\n)*")
VISUALIZE && writesvg("case1", machine)
Expand All @@ -31,6 +33,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false)
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl: ", @benchmark match(data))

context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=10)
Expand All @@ -40,6 +43,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=1
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl (unrolled): ", @benchmark match(data))


Expand All @@ -48,7 +52,9 @@ println("Automa.jl (unrolled): ", @benchmark match(data))

println()
println("Case 2 ", raw"([A-Za-z]*\r?\n)*")
println("PCRE: ", @benchmark ismatch($(r"^(:?[A-Za-z]*\r?\n)*$"), data))
match(data) = ismatch(r"^(:?[A-Za-z]*\r?\n)*$", data)
@assert match(data)
println("PCRE: ", @benchmark match(data))

machine = Automa.compile(re"([A-Za-z]*\r?\n)*")
VISUALIZE && writesvg("case2", machine)
Expand All @@ -59,6 +65,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false)
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl: ", @benchmark match(data))

context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=10)
Expand All @@ -68,6 +75,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=1
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl (unrolled): ", @benchmark match(data))


Expand All @@ -76,7 +84,9 @@ println("Automa.jl (unrolled): ", @benchmark match(data))

println()
println("Case 3 ", raw"([ACGTacgt]*\r?\n)*")
println("PCRE: ", @benchmark ismatch($(r"^(:?[ACGTacgt]*\r?\n)*$"), data))
match(data) = ismatch(r"^(:?[ACGTacgt]*\r?\n)*$", data)
@assert match(data)
println("PCRE: ", @benchmark match(data))

machine = Automa.compile(re"([ACGTacgt]*\r?\n)*")
VISUALIZE && writesvg("case3", machine)
Expand All @@ -87,6 +97,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false)
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl: ", @benchmark match(data))

context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=10)
Expand All @@ -96,6 +107,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=1
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl (unrolled): ", @benchmark match(data))


Expand All @@ -104,7 +116,9 @@ println("Automa.jl (unrolled): ", @benchmark match(data))

println()
println("Case 4 ", raw"([A-Za-z\*-]*\r?\n)*")
println("PCRE: ", @benchmark ismatch($(r"^(:?[A-Za-z\*-]*\r?\n)*$"), data))
match(data) = ismatch(r"^(:?[A-Za-z\*-]*\r?\n)*$", data)
@assert match(data)
println("PCRE: ", @benchmark match(data))

machine = Automa.compile(re"([A-Za-z\*-]*\r?\n)*")
VISUALIZE && writesvg("case4", machine)
Expand All @@ -115,6 +129,7 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false)
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl: ", @benchmark match(data))

context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=10)
Expand All @@ -124,4 +139,5 @@ context = Automa.CodeGenContext(generator=:goto, checkbounds=false, loopunroll=1
$(Automa.generate_exec_code(context, machine))
return cs == 0
end
@assert match(data)
println("Automa.jl (unrolled): ", @benchmark match(data))
28 changes: 28 additions & 0 deletions src/byteset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,34 @@ function Base.setdiff(s1::ByteSet, s2::ByteSet)
return ByteSet(s1.a & ~s2.a, s1.b & ~s2.b, s1.c & ~s2.c, s1.d & ~s2.d)
end

"""
    minimum(set::ByteSet)

Return the smallest byte stored in `set` as a `UInt8`.

The words `a`, `b`, `c`, `d` are examined in that order; the index of the lowest
set bit of the first nonzero word is added to that word's base offset (`0x00`,
`0x40`, `0x80`, `0xc0` respectively).

# Throws
- `ArgumentError`: when all four words are zero, i.e. the set is empty.
"""
function Base.minimum(set::ByteSet)
    for (word, base) in ((set.a, 0x00), (set.b, 0x40), (set.c, 0x80), (set.d, 0xc0))
        # The first nonzero word (lowest offset first) holds the minimum.
        word != 0x00 && return UInt8(trailing_zeros(word)) + base
    end
    throw(ArgumentError("empty set"))
end

"""
    maximum(set::ByteSet)

Return the largest byte stored in `set` as a `UInt8`.

The words `d`, `c`, `b`, `a` are examined in that order; the index of the highest
set bit of the first nonzero word (`63 - leading_zeros(word)`) is added to that
word's base offset (`0xc0`, `0x80`, `0x40`, `0x00` respectively).

# Throws
- `ArgumentError`: when all four words are zero, i.e. the set is empty.
"""
function Base.maximum(set::ByteSet)
    for (word, base) in ((set.d, 0xc0), (set.c, 0x80), (set.b, 0x40), (set.a, 0x00))
        # The first nonzero word (highest offset first) holds the maximum.
        word != 0x00 && return UInt8(63 - leading_zeros(word)) + base
    end
    throw(ArgumentError("empty set"))
end

"""
    isdisjoint(s1::ByteSet, s2::ByteSet)

Return `true` when the intersection of `s1` and `s2` is empty.
"""
function isdisjoint(s1::ByteSet, s2::ByteSet)
    shared = intersect(s1, s2)
    return isempty(shared)
end
Expand Down
37 changes: 28 additions & 9 deletions src/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ function generate_goto_code(ctx::CodeGenContext, machine::Machine, actions::Dict
dispatch_code = foldr(default, optimize_edge_order(s.edges)) do edge, els
e, t = edge
if isempty(e.actions)
if ctx.loopunroll > 0 && s.state == t.state
if ctx.loopunroll > 0 && s.state == t.state && length(e.labels) 4
then = generate_unrolled_loop(ctx, e, t)
else
then = :(@goto $(Symbol("state_", t.state)))
Expand Down Expand Up @@ -344,9 +344,9 @@ function generate_unrolled_loop(ctx::CodeGenContext, edge::Edge, t::Node)
# Generated code looks like this (when unroll=2):
# while p + 2 ≤ p_end
# l1 = $(getbyte)(data, p + 1)
# !$(generate_simple_condition_code(e, :l1)) && break
# !$(generate_membership_code(:l1, e.labels)) && break
# l2 = $(getbyte)(data, p + 2)
# !$(generate_simple_condition_code(e, :l2)) && break
# !$(generate_membership_code(:l2, e.labels)) && break
# p += 2
# end
# @goto ...
Expand All @@ -358,7 +358,7 @@ function generate_unrolled_loop(ctx::CodeGenContext, edge::Edge, t::Node)
body.args,
quote
$(generate_geybyte_code(ctx, l, k))
$(generate_simple_condition_code(edge, l)) || begin
$(generate_membership_code(l, edge.labels)) || begin
$(ctx.vars.p) += $(k-1)
break
end
Expand Down Expand Up @@ -406,8 +406,7 @@ function state_condition(ctx::CodeGenContext, s::Int)
end

function generate_condition_code(ctx::CodeGenContext, edge::Edge, actions::Dict{Symbol,Expr})
labelcode = foldr((range, cond) -> Expr(:||, :($(ctx.vars.byte) in $(range)), cond), :(false),
sort(range_encode(edge.labels), by=length, rev=true))
labelcode = generate_membership_code(ctx.vars.byte, edge.labels)
precondcode = foldr(:(true), edge.precond) do p, ex
name, value = p
if value == BOTH
Expand All @@ -424,9 +423,29 @@ function generate_condition_code(ctx::CodeGenContext, edge::Edge, actions::Dict{
return :($(labelcode) && $(precondcode))
end

function generate_simple_condition_code(edge::Edge, byte::Symbol)
return foldr((range, cond) -> Expr(:||, :($(byte) in $(range)), cond), :(false),
sort(range_encode(edge.labels), by=length, rev=true))
"""
    generate_membership_code(var::Symbol, set::ByteSet)

Build an expression that tests whether the byte held in the variable named `var`
is a member of `set`.

One of three shapes is returned, cheapest first:
- contiguous set: `var == x` for a single byte, otherwise `var in lo:hi`;
- set spanning at most 64 byte values: a single shift-and-mask test against a
  `UInt64` bitmap computed here, whose bit `x - lo` is set for every member `x`;
- anything else: a chain of `var in range || ...` tests over
  `range_encode(set)`, longest range first.

`minimum(set)` and `maximum(set)` are called unconditionally, so an empty `set`
is not supported.
"""
function generate_membership_code(var::Symbol, set::ByteSet)
    lo, hi = minimum(set), maximum(set)
    @assert lo isa UInt8 && hi isa UInt8
    # Measure the span in Int: in UInt8 arithmetic `hi - lo + 1` wraps to 0 when
    # lo == 0x00 and hi == 0xff, which would miss the contiguous case and then
    # wrongly pass the "fits in 64 bits" test below.
    span = Int(hi) - Int(lo) + 1
    if span == length(set)
        # contiguous
        if lo == hi
            return :($(var) == $(lo))
        else
            return :($(var) in $(lo:hi))
        end
    elseif span ≤ 64 && all(b - lo ≥ hi for b in 0x00:0xff if b < lo)
        # storable in a 64-bit bitmap
        # The `all(...)` guard covers bytes below `lo`: their UInt8 difference
        # `b - lo` wraps around, and requiring it to be at least `hi` keeps it
        # above every set bit offset (all offsets are ≤ hi - lo).
        # NOTE(review): both comparison operators in this condition were missing
        # from the reviewed text; `≤ 64` and `≥ hi` are reconstructions chosen to
        # be conservative — confirm against upstream.
        bitmap = UInt64(0)
        for x in set
            bitmap |= UInt64(1) << (x - lo)
        end
        return :(($(UInt64(1)) << ($(var) - $(lo))) & $(bitmap) != 0)
    else
        # fallback
        return foldr((range, cond) -> Expr(:||, :($(var) in $(range)), cond),
                     :(false),
                     sort(range_encode(set), by=length, rev=true))
    end
end

# Used by the :table and :inline code generators.
Expand Down

0 comments on commit ceb332b

Please sign in to comment.