
Getcoeffchunk #2393

Merged: 8 commits into SciML:master from getcoeffchunk on Jan 24, 2024

Conversation

@chriselrod (Contributor) commented Dec 21, 2023

Before

julia> sys_loop = @time @eval structural_simplify(ir_state);
 69.648280 seconds (29.10 M allocations: 13.996 GiB, 1.26% gc time, 0.04% compilation time)

After

julia> @time structural_simplify(ir_state);
 51.804978 seconds (29.04 M allocations: 13.992 GiB, 1.15% gc time)

I could optimize this further with llvmcall/intrinsics.

@chriselrod (Contributor, Author)

So, vars is always sorted.
That's an obvious use case for binary search.

I'm using llvmcall here to write loops and some array indexing, because it's just way easier to get good code gen that way.

Here is a benchmark function that does a decent bit of work on a variety of sizes and positions:

vars = sort!(rand(Int, 512));          # sorted keys
coeffs = collect(eachindex(vars));     # coeffs[i] == i, so each lookup's expected result is its index
cs = similar(vars, length(vars) + 1);  # output buffer; +1 slot for the absent-key probe

function manygetcoeffs!(f, cs, vars, coeffs)
  nvars = length(vars)
  for len = 1:nvars                 # every prefix length
    vj = @view vars[1:len]
    cj = @view coeffs[1:len]
    for i = 0:len                   # probe every present key, plus one (almost surely) absent key at i == 0
      v = i == 0 ? 0 : vars[i]
      cs[i+1] = f(vj, cj, v)
    end
  end
end

This is doubly biased toward larger sizes: we test them more often, and each test takes longer, so the largest sizes take up the vast majority of manygetcoeffs! time.
But that's probably mostly okay.

Ideally, this would also shuffle the probe order to be less friendly to the branch predictor (see the sketch below), but the branched versions are still much slower even without shuffling, so it isn't too bad.
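
A shuffled variant could look like this (a sketch; this is not the benchmark actually run below):

using Random
function manygetcoeffs_shuffled!(f, cs, vars, coeffs)
  nvars = length(vars)
  for len = 1:nvars
    vj = @view vars[1:len]
    cj = @view coeffs[1:len]
    for i in shuffle(0:len)  # randomized probe order defeats branch-history memorization
      v = i == 0 ? 0 : vars[i]
      cs[i+1] = f(vj, cj, v)
    end
  end
end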

The original getcoeff is just a linear scan:

function getcoeff(vars, coeffs, var)
    for (vj, v) in enumerate(vars)
        v == var && return coeffs[vj]
    end
    return 0
end

not taking advantage of vars being sorted. Giving up as soon as v > var would be an obvious improvement, but my initial SIMD implementation also didn't take advantage of sorting (as I hadn't yet realized vars are sorted):

function _getcoeff(pv::Ptr{Int64}, pc::Ptr{Int64}, var::Int64, len::Int64)
    Base.llvmcall(("""
                   declare i8 @llvm.cttz.i8(i8, i1);
                   define i64 @entry(i64 %0, i64 %1, i64 %2, i64 %3) #0 {
                   top:
                     %pv = inttoptr i64 %0 to i64*
                     %btmp = insertelement <8 x i64> undef, i64 %2, i64 0
                     %var = shufflevector <8 x i64> %btmp, <8 x i64> undef, <8 x i32> zeroinitializer
                     %lenm7 = add nsw i64 %3, -7
                     %dosimditer = icmp ugt i64 %3, 7
                     br i1 %dosimditer, label %L9.lr.ph, label %L32

                   L9.lr.ph:
                     %len8 = and i64 %3, 9223372036854775800
                     br label %L9

                   L9:
                     %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ]
                     %pvi = getelementptr inbounds i64, i64* %pv, i64 %i
                     %vpvi = bitcast i64* %pvi to <8 x i64>*
                     %v = load <8 x i64>, <8 x i64>* %vpvi, align 8
                     %m = icmp eq <8 x i64> %v, %var
                     %mu = bitcast <8 x i1> %m to i8
                     %matchnotfound = icmp eq i8 %mu, 0
                     br i1 %matchnotfound, label %L30, label %L17

                   L17:
                     %tz8 = call i8 @llvm.cttz.i8(i8 %mu, i1 true)
                     %tz64 = zext i8 %tz8 to i64
                     %vis = add nuw i64 %i, %tz64
                     br label %common.coef.load

                   common.coef.load:
                     %common.index = phi i64 [ %vis, %L17 ], [ %si, %L51 ]
                     %pc = inttoptr i64 %1 to i64*
                     %pco = getelementptr inbounds i64, i64* %pc, i64 %common.index
                     %common.ret.op = load i64, i64* %pco, align 8
                     br label %common.ret

                   common.ret:
                     %retval = phi i64 [ 0, %L32 ], [ 0, %L67 ], [ %common.ret.op, %common.coef.load ]
                     ret i64 %retval

                   L30:
                     %vinc = add nuw nsw i64 %i, 8
                     %continue = icmp slt i64 %vinc, %lenm7
                     br i1 %continue, label %L9, label %L32

                   L32:
                     %cumi = phi i64 [ 0, %top ], [ %len8, %L30 ]
                     %done = icmp eq i64 %cumi, %3
                     br i1 %done, label %common.ret, label %L51

                   L51:
                     %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ]
                     %spi = getelementptr inbounds i64, i64* %pv, i64 %si
                     %svi = load i64, i64* %spi, align 8
                     %match = icmp eq i64 %svi, %2
                     br i1 %match, label %common.coef.load, label %L67

                   L67:
                     %inc = add i64 %si, 1
                     %dobreak = icmp eq i64 %inc, %3
                     br i1 %dobreak, label %common.ret, label %L51

                   }
                   attributes #0 = { alwaysinline }
                   """, "entry"), Int, Tuple{Ptr{Int}, Ptr{Int}, Int, Int}, pv, pc, var, len)
end

function getcoeff_simd(vars::StridedVector{Int}, coeffs::StridedVector{Int}, var::Int)
    GC.@preserve vars coeffs begin
        ret = _getcoeff(pointer(vars), pointer(coeffs), var, length(vars))
    end
    return ret
end

This uses a SIMD vector width of 8, but LLVM's legalization should still make this work for CPUs with smaller vector widths.
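
For readers who would rather not parse LLVM IR, the kernel above computes the same thing as this reference function (my reading of the IR, as a sketch), just vectorized eight lanes at a time:

function getcoeff_simd_reference(vars, coeffs, var)
    # find the first index with vars[i] == var; return coeffs[i], or 0 if absent
    i = findfirst(==(var), vars)
    i === nothing ? 0 : coeffs[i]
end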

Noticing that vars is sorted, we could instead just use a binary search:

function getcoeff_sorted(vars, coeffs, var)
    s = searchsortedfirst(vars, var)
    @inbounds (s <= length(vars) && vars[s] == var) ? coeffs[s] : 0
end
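
For example (illustrative):

julia> getcoeff_sorted([1, 3, 7], [10, 30, 70], 3)
30

julia> getcoeff_sorted([1, 3, 7], [10, 30, 70], 5)
0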

We can also implement our own branchless binary search:

@inline function binary_search(vars, var)
    len = length(vars)
    offset = 0
    @inbounds while len > 0
        half = len >>> 1
        offset = ifelse(vars[offset + half + 1] < var, (len - half) + offset, offset)
        len = half
    end
    return offset
end
function getcoeff_sorted2(vars, coeffs, var)
    s = binary_search(vars, var)
    @inbounds (s != length(vars) &&
               vars[s + 1] == var) ? coeffs[s + 1] : 0
end
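
As a quick sanity check (illustrative, not from the PR): binary_search(vars, var) should equal searchsortedfirst(vars, var) - 1, i.e. the number of elements strictly less than var:

using Test
let vars = sort!(rand(Int, 100))
    for var in vcat(vars, rand(Int, 100))
        @test binary_search(vars, var) == searchsortedfirst(vars, var) - 1
    end
end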

However, LLVM's X86 backend stupidly converts the select in the LLVM IR (ifelse in Julia) into a totally unpredictable branch instead of a cmov.
I'll show benchmarks both with and without the environment variable JULIA_LLVM_ARGS="-x86-cmov-converter=false".
Starting Julia with this ENV var disables the conversion of cmovs into branches.

Unfortunately, we probably can't tell everyone to start their Julia with this flag, but maybe we could patch Julia to always just disable this pass.

For those curious, cmov is better when branches are hard to predict. There's like a 16+ cycle penalty every time a branch predictor mispredicts.

However, when branches are predicted successfully, the branch is faster because it isn't part of the dependency chain of operations, while cmov is.
Branch predictors tend to be really accurate, so it's reasonable to expect the branch to be faster on average.
But in a binary search (or any bisection algorithm), which side you'll go to next is essentially random.

Branch predictors are often good enough to memorize benchmarks (they can have quite long histories), but in any real workload where you aren't just doing the same thing over and over again, cmov is way better for a binary search.

Maybe I'll add inline ASM; it's easy enough.
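
To check which form LLVM actually emitted on a given CPU, inspect the native code and look for cmovq versus a conditional jump, e.g.:

julia> @code_native debuginfo=:none binary_search(rand(Int, 16), 0)

(Note the environment variable has to be set before Julia starts, e.g. JULIA_LLVM_ARGS="-x86-cmov-converter=false" julia from a shell.)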

Now, a final consideration: what about using a binary search until we get below some size threshold, and then searching linearly?
I could also modify these to use early stopping based on vars[i] > var, roughly as sketched below, but I haven't gotten around to it yet (partly because I want to double-check that we really never need the unsorted case).
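
On the plain Julia version, that early-stopping scan would look something like this (a sketch, not benchmarked here):

function getcoeff_earlystop(vars, coeffs, var)
    for (j, vj) in enumerate(vars)
        vj == var && return coeffs[j]
        vj > var && break  # vars is sorted, so var cannot appear later
    end
    return 0
end

The hybrid implementations below switch to a scalar or SIMD linear scan once the bisected range is small enough: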

function _getcoeff_scalar(pv::Ptr{Int64}, pc::Ptr{Int64}, var::Int64, len::Int64)
    Base.llvmcall(("""
                   define i64 @entry(i64 %0, i64 %1, i64 %2, i64 %3) #0 {
                   top:
                     %pv = inttoptr i64 %0 to i64*
                     %done = icmp eq i64 0, %3
                     br i1 %done, label %common.ret, label %L51

                   common.coef.load:
                     %pc = inttoptr i64 %1 to i64*
                     %pco = getelementptr inbounds i64, i64* %pc, i64 %si
                     %common.ret.op = load i64, i64* %pco, align 8
                     br label %common.ret

                   common.ret:
                     %retval = phi i64 [ 0, %top ], [ 0, %L67 ], [ %common.ret.op, %common.coef.load ]
                     ret i64 %retval

                   L51:
                     %si = phi i64 [ %inc, %L67 ], [ 0, %top ]
                     %spi = getelementptr inbounds i64, i64* %pv, i64 %si
                     %svi = load i64, i64* %spi, align 8
                     %match = icmp eq i64 %svi, %2
                     br i1 %match, label %common.coef.load, label %L67

                   L67:
                     %inc = add i64 %si, 1
                     %dobreak = icmp eq i64 %inc, %3
                     br i1 %dobreak, label %common.ret, label %L51

                   }
                   attributes #0 = { alwaysinline }
                   """, "entry"), Int, Tuple{Ptr{Int}, Ptr{Int}, Int, Int}, pv, pc, var, len)
end

function _getcoeff_scalar(vars,
        coeffs,
        var, offset, len)
    # generic fallback: linear scan of vars[offset+1 : offset+len]
    stop = offset + len
    @inbounds while offset < stop
        vars[offset + 1] == var && return coeffs[offset + 1]
        offset += 1
    end
    return 0
end
function _getcoeff_scalar(vars::StridedVector{Int},
        coeffs::StridedVector{Int},
        var::Int, offset, len)
    GC.@preserve vars coeffs begin
        # advance the pointers past the first `offset` elements (8 bytes per Int64)
        ret = _getcoeff_scalar(pointer(vars) + offset * 8,
            pointer(coeffs) + offset * 8,
            var,
            len)
    end
    return ret
end
function _getcoeff_simd(vars,
        coeffs,
        var, offset, len)
    # generic fallback: no raw pointers available, so just use the scalar scan
    return _getcoeff_scalar(vars, coeffs, var, offset, len)
end
function _getcoeff_simd(vars::StridedVector{Int},
        coeffs::StridedVector{Int},
        var::Int, offset, len)
    GC.@preserve vars coeffs begin
        ret = _getcoeff(pointer(vars) + offset * 8,
            pointer(coeffs) + offset * 8,
            var,
            len)
    end
    return ret
end

function getcoeff_sorted_basecase(vars::StridedVector{Int},
        coeffs::StridedVector{Int},
        var::Int,
        ::Val{basecase}) where {basecase}
    len = length(vars)
    offset = 0
    @inbounds while len > basecase
        half = len >>> 1 # half on left, len - half on right
        offset = ifelse(vars[offset + half + 1] <= var, half + offset, offset)
        len = len - half
    end
    # maybe occurs in vars[offset+1:offset+len] 
    return _getcoeff_scalar(vars, coeffs, var, offset, len)
end
function getcoeff_sorted_basecase_simd(vars::StridedVector{Int},
        coeffs::StridedVector{Int},
        var::Int,
        ::Val{basecase}) where {basecase}
    len = length(vars)
    offset = 0
    @inbounds while len > basecase
        half = len >>> 1 # half on left, len - half on right
        offset = ifelse(vars[offset + half + 1] <= var, half + offset, offset)
        len = len - half
    end
    # maybe occurs in vars[offset+1:offset+len] 
    return _getcoeff_simd(vars, coeffs, var, offset, len)
end

struct CoefBaseCase{N} end
function (::CoefBaseCase{N})(vars, coeffs, var) where {N}
    getcoeff_sorted_basecase(vars, coeffs, var, Val(N))
end
struct CoefBaseCaseSIMD{N} end
function (::CoefBaseCaseSIMD{N})(vars, coeffs, var) where {N}
    getcoeff_sorted_basecase_simd(vars, coeffs, var, Val(N))
end
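
These functors just bake the base-case size into a type parameter so @btime can interpolate a single callable. Illustratively, with the setup above (where coeffs[i] == i), we'd expect:

julia> CoefBaseCaseSIMD{32}()(vars, coeffs, vars[100])
100

(assuming the 512 random keys happen to contain no duplicates, which is all but certain).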

Starting Julia normally:

julia> @btime manygetcoeffs!(getcoeff, $cs, $vars, $coeffs)
  12.251 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_simd, $cs, $vars, $coeffs)
  2.026 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted, $cs, $vars, $coeffs)
  2.795 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted2, $cs, $vars, $coeffs)
  4.733 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{2}(), $cs, $vars, $coeffs)
  3.544 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{4}(), $cs, $vars, $coeffs)
  2.820 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{8}(), $cs, $vars, $coeffs)
  2.227 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{16}(), $cs, $vars, $coeffs)
  1.887 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{32}(), $cs, $vars, $coeffs)
  2.081 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{8}(), $cs, $vars, $coeffs)
  2.724 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{16}(), $cs, $vars, $coeffs)
  1.685 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{32}(), $cs, $vars, $coeffs)
  1.425 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{64}(), $cs, $vars, $coeffs)
  1.374 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{128}(), $cs, $vars, $coeffs)
  1.452 ms (0 allocations: 0 bytes)

Disabling the cmov converter:

julia> @btime manygetcoeffs!(getcoeff, $cs, $vars, $coeffs)
  12.193 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_simd, $cs, $vars, $coeffs)
  2.027 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted, $cs, $vars, $coeffs)
  1.733 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(getcoeff_sorted2, $cs, $vars, $coeffs)
  1.975 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{2}(), $cs, $vars, $coeffs)
  1.980 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{4}(), $cs, $vars, $coeffs)
  1.745 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{8}(), $cs, $vars, $coeffs)
  1.568 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{16}(), $cs, $vars, $coeffs)
  1.559 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCase{32}(), $cs, $vars, $coeffs)
  1.835 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{8}(), $cs, $vars, $coeffs)
  1.734 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{16}(), $cs, $vars, $coeffs)
  1.633 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{32}(), $cs, $vars, $coeffs)
  1.468 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{64}(), $cs, $vars, $coeffs)
  1.405 ms (0 allocations: 0 bytes)

julia> @btime manygetcoeffs!(CoefBaseCaseSIMD{128}(), $cs, $vars, $coeffs)
  1.547 ms (0 allocations: 0 bytes)

Regardless, the clear winner on my CPU seems to be a SIMD base case of 32 or 64.

Using a base case of 32, I get

julia> @time structural_simplify(ir_state);
 20.988069 seconds (29.04 M allocations: 13.992 GiB, 2.03% gc time)

versus

julia> @time structural_simplify(ir_state);
 69.229662 seconds (29.39 M allocations: 13.986 GiB, 1.15% gc time)

on an actual motivating example, where the profile identified getcoeff as the primary cost.

Next, I'll look more at bareiss_update_virtual_colswap_mtk! to see if I notice any obvious improvements there that could help beyond simple micro-optimizations. But micro-optimizations alone seem to be able to take us a long way.

@chriselrod (Contributor, Author)

        vars = sort(union(ivars, kvars))
        for v in vars
            v == vpivot && continue
            ck = getcoeff(kvars, kcoeffs, v)
            ci = getcoeff(ivars, icoeffs, v)

These are the only two uses of getcoeff, though, so I'll just optimize bareiss_update_virtual_colswap_mtk! itself, which I expect to make a far bigger difference than the ~3x reported above.

@chriselrod (Contributor, Author)

Not much of an improvement over only optimizing getcoeff, unfortunately, but still an improvement:

julia> @time structural_simplify(ir_state);
 17.476935 seconds (28.14 M allocations: 9.693 GiB, 2.60% gc time)

@chriselrod (Contributor, Author)

@YingboMa Any suggestions for tests to add, or further review?

@YingboMa (Member)

I am really hesitant to merge this PR, especially because CI is currently down. Debugging LLVM will be a huge headache. Could we merge everything except that?

@ChrisRackauckas (Member)

Living life in the fast lane.

@chriselrod (Contributor, Author)

> I am really hesitant to merge this PR, especially because CI is currently down. Debugging LLVM will be a huge headache. Could we merge everything except that?

No more llvmcall; it is now hidden within FindFirstFunctions.jl, which is tested fairly thoroughly: https://github.com/SciML/FindFirstFunctions.jl/blob/main/test/runtests.jl
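
For reference, getcoeff can then be a thin wrapper over that package. A sketch, assuming FindFirstFunctions exposes findfirstsortedequal(var, vars), returning the index or nothing (check the package's tests for its actual interface):

using FindFirstFunctions
function getcoeff(vars, coeffs, var)
    # assumed API: findfirstsortedequal(x, v) searches the sorted vector v for x
    i = FindFirstFunctions.findfirstsortedequal(var, vars)
    i === nothing ? 0 : coeffs[i]
end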

@YingboMa (Member)

Very good. Let's merge.

@YingboMa merged commit 7685996 into SciML:master on Jan 24, 2024 (16 of 20 checks passed).
@chriselrod deleted the getcoeffchunk branch on January 24, 2024 at 14:56.