Use MultiplicativeInverse to speed up Linear to Cartesian indexing operations #539

Open · wants to merge 2 commits into base: main
111 changes: 102 additions & 9 deletions src/nditeration.jl
@@ -1,5 +1,65 @@
module NDIteration

import Base.MultiplicativeInverses: SignedMultiplicativeInverse

# Note: CartesianIndex uses Int, whereas the inverses below are stored as Int32.

@eval EmptySMI() = $(Expr(:new, SignedMultiplicativeInverse{Int32}, Int32(0), typemax(Int32), 0 % Int8, 0 % UInt8))
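# `EmptySMI` constructs the divisor-0 placeholder directly via `Expr(:new)`, sidestepping
# the constructor's error on a zero divisor; `SMI` falls back to it so zero-length axes
# stay representable.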
SMI(i) = i == 0 ? EmptySMI() : SignedMultiplicativeInverse{Int32}(i)

struct FastCartesianIndices{N} <: AbstractArray{CartesianIndex{N}, N}
    inverses::NTuple{N, SignedMultiplicativeInverse{Int32}}
end

function FastCartesianIndices(indices::NTuple{N}) where {N}
    inverses = map(i -> SMI(Int32(i)), indices)
    FastCartesianIndices(inverses)
end

function Base.size(FCI::FastCartesianIndices{N}) where {N}
    ntuple(Val(N)) do I
        FCI.inverses[I].divisor
    end
end

@inline function Base.getindex(::FastCartesianIndices{0})
    return CartesianIndex()
end

@inline function Base.getindex(iter::FastCartesianIndices{N}, I::Vararg{Int, N}) where {N}
    @boundscheck checkbounds(iter, I...)
    index = map(iter.inverses, I) do inv, i
        @inbounds getindex(Base.OneTo(inv.divisor), i % Int32)
    end
    CartesianIndex(index)
end

_ind2sub_recurse(::Tuple{}, ind) = (ind + 1,)
function _ind2sub_recurse(indslast::NTuple{1}, ind)
    Base.@_inline_meta
    (_lookup(ind, indslast[1]),)
end

function _ind2sub_recurse(inds, ind)
    Base.@_inline_meta
    assume(ind >= 0)
    inv = inds[1]
    indnext, f, l = _div(ind, inv)
    (ind - l * indnext + f, _ind2sub_recurse(Base.tail(inds), indnext)...)
end

_lookup(ind, inv::SignedMultiplicativeInverse) = ind + 1
function _div(ind, inv::SignedMultiplicativeInverse)
    # inv.divisor == 0 && throw(DivideError())
    assume(ind >= 0)
    div(ind % Int32, inv), 1, inv.divisor
end

function Base._ind2sub(inv::FastCartesianIndices, ind)
    Base.@_inline_meta
    _ind2sub_recurse(inv.inverses, ind - 1)
end

export _Size, StaticSize, DynamicSize, get
export NDRange, blocks, workitems, expand
export DynamicCheck, NoDynamicCheck
@@ -50,18 +110,32 @@ struct NDRange{N, StaticBlocks, StaticWorkitems, DynamicBlock, DynamicWorkitems}
    blocks::DynamicBlock
    workitems::DynamicWorkitems

    function NDRange{N, B, W}() where {N, B, W}
        new{N, B, W, Nothing, Nothing}(nothing, nothing)
    end

    function NDRange{N, B, W}(blocks, workitems) where {N, B, W}
    function NDRange{N, B, W}(blocks::Union{Nothing, FastCartesianIndices{N}}, workitems::Union{Nothing, FastCartesianIndices{N}}) where {N, B, W}
        @assert B <: _Size
        @assert W <: _Size
        new{N, B, W, typeof(blocks), typeof(workitems)}(blocks, workitems)
    end
end

@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: DynamicSize} = range.workitems::CartesianIndices{N}
function NDRange{N, B, W}() where {N, B, W}
    NDRange{N, B, W}(nothing, nothing)
end

function NDRange{N, B, W}(blocks::CartesianIndices, workitems::CartesianIndices) where {N, B, W}
    return NDRange{N, B, W}(FastCartesianIndices(size(blocks)), FastCartesianIndices(size(workitems)))
end

function NDRange{N, B, W}(blocks::Nothing, workitems::CartesianIndices) where {N, B, W}
    return NDRange{N, B, W}(blocks, FastCartesianIndices(size(workitems)))
end

function NDRange{N, B, W}(blocks::CartesianIndices, workitems::Nothing) where {N, B, W}
    return NDRange{N, B, W}(FastCartesianIndices(size(blocks)), workitems)
end

@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: DynamicSize} = range.workitems::FastCartesianIndices{N}
@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: StaticSize} = CartesianIndices(get(W))::CartesianIndices{N}
@inline blocks(range::NDRange{N, B}) where {N, B <: DynamicSize} = range.blocks::CartesianIndices{N}
@inline blocks(range::NDRange{N, B}) where {N, B <: DynamicSize} = range.blocks::FastCartesianIndices{N}
@inline blocks(range::NDRange{N, B}) where {N, B <: StaticSize} = CartesianIndices(get(B))::CartesianIndices{N}

import Base.iterate
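A hedged usage sketch of the new convenience constructors (assuming the `KernelAbstractions.NDIteration` submodule is loadable under that path; `NDRange` and `DynamicSize` are exported by it, `FastCartesianIndices` is not): passing plain `CartesianIndices` now rewraps the extents before they are stored.

```julia
using KernelAbstractions.NDIteration
using KernelAbstractions.NDIteration: FastCartesianIndices

blocks    = CartesianIndices((16,))
workitems = CartesianIndices((32,))

# The CartesianIndices methods above forward to the inner constructor,
# so the stored fields are the fast wrappers, not what was passed in.
ndrange = NDRange{1, DynamicSize, DynamicSize}(blocks, workitems)
@assert ndrange.blocks isa FastCartesianIndices{1}
@assert ndrange.workitems isa FastCartesianIndices{1}
@assert size(ndrange.workitems) == (32,)
```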
@@ -80,8 +154,8 @@ Base.length(range::NDRange) = length(blocks(range))
    CartesianIndex(nI)
end

Base.@propagate_inbounds function expand(ndrange::NDRange, groupidx::Integer, idx::Integer)
    expand(ndrange, blocks(ndrange)[groupidx], workitems(ndrange)[idx])
Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::Integer, idx::Integer) where {N}
    return expand(ndrange, blocks(ndrange)[groupidx], workitems(ndrange)[idx])
end

Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::Integer) where {N}
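And a sketch of the linear `expand` path this hunk touches (hypothetical values; the global-index arithmetic is assumed from KernelAbstractions' usual `(groupidx - 1) * groupsize + idx` convention): the linear group and work-item indices are resolved through `FastCartesianIndices` before being combined.

```julia
using KernelAbstractions.NDIteration

# 16 one-dimensional blocks of 32 work items each.
ndrange = NDRange{1, DynamicSize, DynamicSize}(CartesianIndices((16,)), CartesianIndices((32,)))

# blocks(ndrange)[3] and workitems(ndrange)[5] now index FastCartesianIndices,
# so each lookup uses the precomputed inverses rather than integer division.
@assert expand(ndrange, 3, 5) == CartesianIndex((3 - 1) * 32 + 5)  # CartesianIndex(69)
```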
@@ -126,4 +200,23 @@ needs to perform dynamic bounds-checking.
end
end


"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(
    (
        """
        declare void @llvm.assume(i1)
        define void @entry(i8) #0 {
            %cond = icmp eq i8 %0, 1
            call void @llvm.assume(i1 %cond)
            ret void
        }
        attributes #0 = { alwaysinline }""", "entry",
    ),
    Nothing, Tuple{Bool}, cond
)
end #module
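For illustration only (not part of the diff), the kind of helper the `assume` hint enables; `fast_linear_div` is a hypothetical name mirroring `_div` above, and `assume` and `SignedMultiplicativeInverse` refer to the definitions in this module:

```julia
# Promising the optimizer that the linear index is non-negative lets LLVM drop
# the sign-correction work in the signed-inverse division path.
@inline function fast_linear_div(ind::Int32, inv::SignedMultiplicativeInverse{Int32})
    assume(ind >= 0)  # a promise, not a check: behaviour is undefined if it is violated
    return div(ind, inv)
end
```

Because `llvm.assume` is an optimizer contract rather than a runtime check, calling `assume` with a condition that is actually false is undefined behaviour.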
2 changes: 1 addition & 1 deletion test/compiler.jl
@@ -39,7 +39,7 @@ end

function compiler_testsuite(backend, ArrayT)
kernel = index(CPU(), DynamicSize(), DynamicSize())
iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}();
iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}()
ctx = KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(KernelAbstractions.NoDynamicCheck()))
@test KernelAbstractions.__index_Global_NTuple(ctx, CartesianIndex(1)) == (1,)

4 changes: 2 additions & 2 deletions test/localmem.jl
@@ -8,7 +8,7 @@ using Test
end
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem = @localmem Int (N,) # Ok iff groupsize is static
@inbounds begin
lmem[i] = i
@synchronize
@@ -23,7 +23,7 @@
end
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem = @localmem Int (N,) # Ok iff groupsize is static
@inbounds begin
lmem[i] = i + 3
for j in 1:2
2 changes: 1 addition & 1 deletion test/private.jl
@@ -73,7 +73,7 @@ function private_testsuite(backend, ArrayT)

A = ArrayT{Int}(undef, 64, 64)
A .= 1
forloop(backend())(A, Val(size(A, 2)), ndrange = size(A, 1), workgroupsize = size(A, 1))
forloop(backend(), size(A, 1))(A, Val(size(A, 2)), ndrange = size(A, 1), workgroupsize = size(A, 1))
synchronize(backend())
@test all(Array(A)[:, 1] .== 64)
@test all(Array(A)[:, 2:end] .== 1)
2 changes: 1 addition & 1 deletion test/test.jl
@@ -154,7 +154,7 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
@conditional_testset "Const" skip_tests begin
let kernel = constarg(Backend(), 8, (1024,))
# this is poking at internals
iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}();
iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}()
ctx = if Backend == CPU
KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(NoDynamicCheck()))
else