Getcoeffchunk #2393

---
Here is a benchmark function that does a decent bit of work on a variety of sizes and positions, along with the setup I'm using:

```julia
vars = sort!(rand(Int, 512));
coeffs = collect(eachindex(vars));
cs = similar(vars, length(vars)+1);

function manygetcoeffs!(f, cs, vars, coeffs)
    nvars = length(vars)
    for len = 1:nvars
        vj = @view vars[1:len]
        cj = @view coeffs[1:len]
        for i = 0:len
            # i == 0 probes a value that is almost surely absent
            v = i == 0 ? 0 : vars[i]
            cs[i+1] = f(vj, cj, v)
        end
    end
end
```

This is doubly biased toward larger sizes: we test them more often, and they take longer, so the largest sizes are going to take up the vast majority of the benchmark time. Also, this should really shuffle the indices to be less friendly to a branch predictor, but branched versions are still much slower even with this, so it isn't too bad.
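A sketch of what that shuffling could look like (my addition, not part of the PR; `manygetcoeffs_shuffled!` is a hypothetical name, and note that `shuffle` allocates a fresh permutation per length):

```julia
using Random

# Probe positions in random order so the branch predictor
# can't memorize the query sequence.
function manygetcoeffs_shuffled!(f, cs, vars, coeffs)
    nvars = length(vars)
    for len = 1:nvars
        vj = @view vars[1:len]
        cj = @view coeffs[1:len]
        for i in shuffle(0:len)
            v = i == 0 ? 0 : vars[i]
            cs[i+1] = f(vj, cj, v)
        end
    end
end
```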
The original function:

```julia
function getcoeff(vars, coeffs, var)
    for (vj, v) in enumerate(vars)
        v == var && return coeffs[vj]
    end
    return 0
end
```

This makes no use of SIMD. Here is a manually vectorized linear search using `llvmcall`:

```julia
function _getcoeff(pv::Ptr{Int64}, pc::Ptr{Int64}, var::Int64, len::Int64)
    Base.llvmcall(("""
declare i8 @llvm.cttz.i8(i8, i1);
define i64 @entry(i64 %0, i64 %1, i64 %2, i64 %3) #0 {
top:
%pv = inttoptr i64 %0 to i64*
%btmp = insertelement <8 x i64> undef, i64 %2, i64 0
%var = shufflevector <8 x i64> %btmp, <8 x i64> undef, <8 x i32> zeroinitializer
%lenm7 = add nsw i64 %3, -7
%dosimditer = icmp ugt i64 %3, 7
br i1 %dosimditer, label %L9.lr.ph, label %L32
L9.lr.ph:
%len8 = and i64 %3, 9223372036854775800
br label %L9
L9:
%i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ]
%pvi = getelementptr inbounds i64, i64* %pv, i64 %i
%vpvi = bitcast i64* %pvi to <8 x i64>*
%v = load <8 x i64>, <8 x i64>* %vpvi, align 8
%m = icmp eq <8 x i64> %v, %var
%mu = bitcast <8 x i1> %m to i8
%matchnotfound = icmp eq i8 %mu, 0
br i1 %matchnotfound, label %L30, label %L17
L17:
%tz8 = call i8 @llvm.cttz.i8(i8 %mu, i1 true)
%tz64 = zext i8 %tz8 to i64
%vis = add nuw i64 %i, %tz64
br label %common.coef.load
common.coef.load:
%common.index = phi i64 [ %vis, %L17 ], [ %si, %L51 ]
%pc = inttoptr i64 %1 to i64*
%pco = getelementptr inbounds i64, i64* %pc, i64 %common.index
%common.ret.op = load i64, i64* %pco, align 8
br label %common.ret
common.ret:
%retval = phi i64 [ 0, %L32 ], [ 0, %L67 ], [ %common.ret.op, %common.coef.load ]
ret i64 %retval
L30:
%vinc = add nuw nsw i64 %i, 8
%continue = icmp slt i64 %vinc, %lenm7
br i1 %continue, label %L9, label %L32
L32:
%cumi = phi i64 [ 0, %top ], [ %len8, %L30 ]
%done = icmp eq i64 %cumi, %3
br i1 %done, label %common.ret, label %L51
L51:
%si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ]
%spi = getelementptr inbounds i64, i64* %pv, i64 %si
%svi = load i64, i64* %spi, align 8
%match = icmp eq i64 %svi, %2
br i1 %match, label %common.coef.load, label %L67
L67:
%inc = add i64 %si, 1
%dobreak = icmp eq i64 %inc, %3
br i1 %dobreak, label %common.ret, label %L51
}
attributes #0 = { alwaysinline }
""", "entry"), Int, Tuple{Ptr{Int}, Ptr{Int}, Int, Int}, pv, pc, var, len)
end

function getcoeff_simd(vars::StridedVector{Int}, coeffs::StridedVector{Int}, var::Int)
    GC.@preserve vars coeffs begin
        ret = _getcoeff(pointer(vars), pointer(coeffs), var, length(vars))
    end
    return ret
end
```

This uses a SIMD vector width of 8 (`<8 x i64>`).
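As a quick agreement check (my addition, not part of the PR), the `llvmcall` version should match the original scalar search across sizes and probe values:

```julia
using Test

@testset "getcoeff_simd matches getcoeff" begin
    for len in (1, 7, 8, 9, 100, 512)
        vj = @view vars[1:len]
        cj = @view coeffs[1:len]
        # probe the first, middle, and last entries, plus an (almost surely) absent value
        for v in (vj[1], vj[(len >>> 1) + 1], vj[end], 0)
            @test getcoeff_simd(vj, cj, v) == getcoeff(vj, cj, v)
        end
    end
end
```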
Noticing that `vars` is sorted, we could instead just use a binary search:

```julia
function getcoeff_sorted(vars, coeffs, var)
    s = searchsortedfirst(vars, var)
    @inbounds (s <= length(vars) && vars[s] == var) ? coeffs[s] : 0
end
```
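For reference (my examples, not from the PR), `searchsortedfirst` returns the index of the first element that is `>=` the query, or `length(vars) + 1` if there is none, hence the two checks above:

```julia
julia> searchsortedfirst([10, 20, 30], 20)   # present
2

julia> searchsortedfirst([10, 20, 30], 25)   # absent, between elements
3

julia> searchsortedfirst([10, 20, 30], 99)   # absent, past the end
4
```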
We can also implement our own branchless binary search:

```julia
@inline function binary_search(vars, var)
    len = length(vars)
    offset = 0
    @inbounds while len > 0
        half = len >>> 1
        # branchless: select the right half when the probe is < var
        offset = ifelse(vars[offset + half + 1] < var, (len - half) + offset, offset)
        len = half
    end
    return offset
end
```
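To pin down the convention (my examples): `binary_search` returns a 0-based offset such that `vars[offset + 1]` is the first element `>=` `var`, with `offset == length(vars)` when every element is smaller:

```julia
julia> binary_search([10, 20, 30, 40], 30)   # present: vars[2 + 1] == 30
2

julia> binary_search([10, 20, 30, 40], 35)   # absent: vars[3 + 1] == 40 is the first element >= 35
3

julia> binary_search([10, 20, 30, 40], 99)   # larger than everything: offset == length
4
```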
```julia
function getcoeff_sorted2(vars, coeffs, var)
    s = binary_search(vars, var)
    @inbounds (s != length(vars) && vars[s + 1] == var) ? coeffs[s + 1] : 0
end
```

However, LLVM's X86 backend stupidly converts the branchless `cmov` this produces back into branches; the pass responsible appears to be the X86 cmov-to-branch conversion (the `x86-cmov-converter` option). Unfortunately, we probably can't tell everyone to start their Julia with this flag, but maybe we could patch Julia to always just disable this pass. For those curious about why LLVM does this: when branches are predicted successfully, a branch is faster because it isn't part of the dependency chain of operations, while a `cmov` is. Branch predictors are often good enough to memorize benchmarks (they can have quite long histories), but in any real workload where you aren't just doing the same thing over and over again, the resulting mispredictions will make the branchy version slower. Maybe I'll add inline ASM to keep the `cmov`; it's easy enough.
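One way to see what the backend actually emitted (my addition): inspect the native code and look for `cmov` versus conditional jumps.

```julia
using InteractiveUtils

# If the cmov-to-branch conversion ran, the loop body will contain
# conditional jumps instead of a `cmov` instruction.
@code_native debuginfo=:none binary_search(vars, vars[100])
```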
Now, a final consideration: what about using a binary search until we get below some size threshold, and then searching linearly?

```julia
function _getcoeff_scalar(pv::Ptr{Int64}, pc::Ptr{Int64}, var::Int64, len::Int64)
    Base.llvmcall(("""
define i64 @entry(i64 %0, i64 %1, i64 %2, i64 %3) #0 {
top:
%pv = inttoptr i64 %0 to i64*
%done = icmp eq i64 0, %3
br i1 %done, label %common.ret, label %L51
common.coef.load:
%pc = inttoptr i64 %1 to i64*
%pco = getelementptr inbounds i64, i64* %pc, i64 %si
%common.ret.op = load i64, i64* %pco, align 8
br label %common.ret
common.ret:
%retval = phi i64 [ 0, %top ], [ 0, %L67 ], [ %common.ret.op, %common.coef.load ]
ret i64 %retval
L51:
%si = phi i64 [ %inc, %L67 ], [ 0, %top ]
%spi = getelementptr inbounds i64, i64* %pv, i64 %si
%svi = load i64, i64* %spi, align 8
%match = icmp eq i64 %svi, %2
br i1 %match, label %common.coef.load, label %L67
L67:
%inc = add i64 %si, 1
%dobreak = icmp eq i64 %inc, %3
br i1 %dobreak, label %common.ret, label %L51
}
attributes #0 = { alwaysinline }
""", "entry"), Int, Tuple{Ptr{Int}, Ptr{Int}, Int, Int}, pv, pc, var, len)
end
```

```julia
# Generic fallback: plain linear search over vars[offset+1 : offset+len].
function _getcoeff_scalar(vars,
                          coeffs,
                          var, offset, len)
    upper = offset + len
    @inbounds while offset < upper
        vars[offset + 1] == var && return coeffs[offset + 1]
        offset += 1
    end
    return 0
end
# Strided arrays dispatch to the `llvmcall` kernel via raw pointers.
function _getcoeff_scalar(vars::StridedVector{Int},
                          coeffs::StridedVector{Int},
                          var::Int, offset, len)
    GC.@preserve vars coeffs begin
        ret = _getcoeff_scalar(pointer(vars) + offset * 8,  # 8 == sizeof(Int)
                               pointer(coeffs) + offset * 8,
                               var,
                               len)
    end
    return ret
end
function _getcoeff_simd(vars,
                        coeffs,
                        var, offset, len)
    return _getcoeff_scalar(vars, coeffs, var, offset, len)
end

function _getcoeff_simd(vars::StridedVector{Int},
                        coeffs::StridedVector{Int},
                        var::Int, offset, len)
    GC.@preserve vars coeffs begin
        ret = _getcoeff(pointer(vars) + offset * 8,
                        pointer(coeffs) + offset * 8,
                        var,
                        len)
    end
    return ret
end
function getcoeff_sorted_basecase(vars::StridedVector{Int},
                                  coeffs::StridedVector{Int},
                                  var::Int,
                                  ::Val{basecase}) where {basecase}
    len = length(vars)
    offset = 0
    @inbounds while len > basecase
        half = len >>> 1 # half on left, len - half on right
        offset = ifelse(vars[offset + half + 1] <= var, half + offset, offset)
        len = len - half
    end
    # `var` may occur in vars[offset+1:offset+len]
    return _getcoeff_scalar(vars, coeffs, var, offset, len)
end

function getcoeff_sorted_basecase_simd(vars::StridedVector{Int},
                                       coeffs::StridedVector{Int},
                                       var::Int,
                                       ::Val{basecase}) where {basecase}
    len = length(vars)
    offset = 0
    @inbounds while len > basecase
        half = len >>> 1 # half on left, len - half on right
        offset = ifelse(vars[offset + half + 1] <= var, half + offset, offset)
        len = len - half
    end
    # `var` may occur in vars[offset+1:offset+len]
    return _getcoeff_simd(vars, coeffs, var, offset, len)
end
```
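For a quick spot check (my example, not from the PR): with `coeffs = collect(eachindex(vars))` as above, the coefficient of `vars[i]` is just `i` (assuming the random values in `vars` are distinct):

```julia
julia> getcoeff_sorted_basecase_simd(vars, coeffs, vars[137], Val(32))
137
```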
```julia
# Callable wrappers so the base case `N` is a compile-time constant:
struct CoefBaseCase{N} end
function (::CoefBaseCase{N})(vars, coeffs, var) where {N}
    getcoeff_sorted_basecase(vars, coeffs, var, Val(N))
end

struct CoefBaseCaseSIMD{N} end
function (::CoefBaseCaseSIMD{N})(vars, coeffs, var) where {N}
    getcoeff_sorted_basecase_simd(vars, coeffs, var, Val(N))
end
```

Starting Julia normally:

```julia
julia> @btime manygetcoeffs!(getcoeff, $cs, $vars, $coeffs)
  12.251 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_simd, $cs, $vars, $coeffs)
  2.026 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted, $cs, $vars, $coeffs)
  2.795 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted2, $cs, $vars, $coeffs)
  4.733 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{2}(), $cs, $vars, $coeffs)
  3.544 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{4}(), $cs, $vars, $coeffs)
  2.820 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{8}(), $cs, $vars, $coeffs)
  2.227 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{16}(), $cs, $vars, $coeffs)
  1.887 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{32}(), $cs, $vars, $coeffs)
  2.081 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{8}(), $cs, $vars, $coeffs)
  2.724 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{16}(), $cs, $vars, $coeffs)
  1.685 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{32}(), $cs, $vars, $coeffs)
  1.425 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{64}(), $cs, $vars, $coeffs)
  1.374 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{128}(), $cs, $vars, $coeffs)
  1.452 ms (0 allocations: 0 bytes)
```

Disabling the cmov-converter pass:

```julia
julia> @btime manygetcoeffs!(getcoeff, $cs, $vars, $coeffs)
  12.193 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_simd, $cs, $vars, $coeffs)
  2.027 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted, $cs, $vars, $coeffs)
  1.733 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted2, $cs, $vars, $coeffs)
  1.975 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{2}(), $cs, $vars, $coeffs)
  1.980 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{4}(), $cs, $vars, $coeffs)
  1.745 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{8}(), $cs, $vars, $coeffs)
  1.568 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{16}(), $cs, $vars, $coeffs)
  1.559 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{32}(), $cs, $vars, $coeffs)
  1.835 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{8}(), $cs, $vars, $coeffs)
  1.734 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{16}(), $cs, $vars, $coeffs)
  1.633 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{32}(), $cs, $vars, $coeffs)
  1.468 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{64}(), $cs, $vars, $coeffs)
  1.405 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{128}(), $cs, $vars, $coeffs)
  1.547 ms (0 allocations: 0 bytes)
```

Regardless, the clear winner on my CPU seems to be a SIMD base case of 32 or 64. Using a base case of 32, I get

```julia
julia> @time structural_simplify(ir_state);
  20.988069 seconds (29.04 M allocations: 13.992 GiB, 2.03% gc time)
```

versus

```julia
julia> @time structural_simplify(ir_state);
  69.229662 seconds (29.39 M allocations: 13.986 GiB, 1.15% gc time)
```

on an actual motivating example, where the profile identified `getcoeff` as a hotspot. Next, I'll look more at the call sites.

---
```julia
vars = sort(union(ivars, kvars))
for v in vars
    v == vpivot && continue
    ck = getcoeff(kvars, kcoeffs, v)
    ci = getcoeff(ivars, icoeffs, v)
```

These are the only two uses, though, so I'll just optimize this function, which I expect to make a far bigger difference than the 3x reported above; a sketch of the idea follows.
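A sketch of that idea (my addition; `getcoeffs_merged` is a hypothetical helper, not the PR's implementation): since the query list and the coefficient lists are all sorted, each list can be walked once in total, instead of doing one search per `v`:

```julia
# Single-pass merge: for each query in sorted `qvars`, advance a cursor
# through sorted `vars` and record `coeffs[j]` on a match (0 otherwise).
# O(length(qvars) + length(vars)) overall.
function getcoeffs_merged(qvars, vars, coeffs)
    out = zeros(Int, length(qvars))
    j = 1
    for (i, q) in enumerate(qvars)
        while j <= length(vars) && vars[j] < q
            j += 1
        end
        if j <= length(vars) && vars[j] == q
            out[i] = coeffs[j]
        end
    end
    return out
end
```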
---

Not much of an improvement over only optimizing `getcoeff`:

```julia
julia> @time structural_simplify(ir_state);
  17.476935 seconds (28.14 M allocations: 9.693 GiB, 2.60% gc time)
```

---
Co-authored-by: Yingbo Ma <[email protected]>

---

@YingboMa Any suggestions for tests to add, or further review?

---
I am really hesitant to merge this PR, especially because CI is currently down. Debugging LLVM will be a huge headache. Could we merge everything except that?

---
Living life in the fast lane.

---
No more `llvmcall`.

---
Very good. Let's merge.

---
I could optimize this further with `llvmcall`/intrinsics.