Add intrinsics from oneAPI.jl.

JuliaGPU · Sep 11, 2024 · 599980e · 599980e
1 parent 8a76c69
commit 599980e
Show file tree

Hide file tree

Showing 16 changed files with 1,031 additions and 5 deletions.
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -3,9 +3,19 @@ steps:
     plugins:
       - JuliaCI/julia#v1:
           version: "1.10"
-      - JuliaCI/julia-test#v1: ~
       - JuliaCI/julia-coverage#v1:
           codecov: true
+    # The SPIRVIntrinsics subpackage lives in this repository (lib/intrinsics), so it is
+    # added to the project as a path-based development dependency before instantiating.
+    commands: |
+      julia --project -e '
+        using Pkg
+
+        println("--- :julia: Instantiating project")
+        Pkg.develop(path="lib/intrinsics")
+        Pkg.instantiate()
+
+
+        println("+++ :julia: Running tests")
+        Pkg.test(; coverage=true)'
     agents:
       queue: "juliagpu"
       cuda: "*"

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -28,7 +28,7 @@ jobs:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
       - uses: julia-actions/cache@v2
-      - uses: julia-actions/julia-buildpkg@v1
+      - run: julia --project -e 'using Pkg; Pkg.develop(path="lib/intrinsics")'
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v4

diff --git a/Project.toml b/Project.toml
@@ -9,8 +9,10 @@ LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
 SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361"
 
 [compat]
 OpenCL_jll = "2024.5.8"
 julia = "1.10"
+LLVM = "9.1"
diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml
@@ -0,0 +1,9 @@
+name = "SPIRVIntrinsics"
+uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
+authors = ["Tim Besard <[email protected]>"]
+version = "0.1.0"
+
+[deps]
+ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
diff --git a/lib/intrinsics/src/SPIRVIntrinsics.jl b/lib/intrinsics/src/SPIRVIntrinsics.jl
@@ -0,0 +1,25 @@
+# SPIRVIntrinsics: collects the intrinsic definitions from the source files included
+# below into a single module.
+module SPIRVIntrinsics
+
+using LLVM, LLVM.Interop
+using Core: LLVMPtr
+
+import ExprTools
+
+import SpecialFunctions
+
+# NOTE(review): presumably these two files provide the helpers the intrinsic files rely
+# on (e.g. `AS`, `@device_function`, `@builtin_ccall`), which is why they are included
+# first — confirm against their contents.
+include("pointer.jl")
+include("utils.jl")
+
+# OpenCL intrinsics
+#
+# we currently don't implement SPIR-V intrinsics directly, but rely on
+# the SPIR-V to LLVM translator supporting OpenCL intrinsics
+include("work_item.jl")
+include("synchronization.jl")
+include("memory.jl")
+include("printf.jl")
+include("math.jl")
+include("integer.jl")
+include("atomic.jl")
+
+end
diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
@@ -0,0 +1,264 @@
+# Atomic Functions
+
+# TODO: support for 64-bit atomics via atom_cmpxchg (from cl_khr_int64_base_atomics)
+
+# "atomic operations on 32-bit signed, unsigned integers and single precision
+#  floating-point to locations in __global or __local memory"
+
+# element types for which the generically typed atomic intrinsics below are generated
+const atomic_integer_types = [UInt32, Int32]
+# TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS
+# TODO: additional floating-point atomics with ZE_extension_float_atomics
+# pointer address spaces for which the atomic intrinsics below are generated
+const atomic_memory_types = [AS.Local, AS.Global]
+
+
+# generically typed
+
+# Every intrinsic takes the pointer `p` first, followed by zero or more operands of the
+# pointer's element type. The table lists, in definition order, the OpenCL `atomic_<op>`
+# builtin suffix together with the names of those extra operands; `atomic_<op>!` is then
+# generated for each element type and address space.
+for T in atomic_integer_types, space in atomic_memory_types
+    for (op, operands) in (:add     => (:val,),
+                           :sub     => (:val,),
+                           :inc     => (),
+                           :dec     => (),
+                           :min     => (:val,),
+                           :max     => (:val,),
+                           :and     => (:val,),
+                           :or      => (:val,),
+                           :xor     => (:val,),
+                           :xchg    => (:val,),
+                           :cmpxchg => (:cmp, :val))
+        builtin = string("atomic_", op)
+        fname = Symbol(builtin, "!")
+        # `name::T` for each operand, and the matching list of builtin argument types
+        params = [:($arg::$T) for arg in operands]
+        argtypes = [T for _ in operands]
+
+        @eval @device_function $fname(p::LLVMPtr{$T,$space}, $(params...)) =
+            @builtin_ccall($builtin, $T,
+                           (LLVMPtr{$T,$space}, $(argtypes...)), p, $(operands...))
+    end
+end
+
+
+# specifically typed
+
+# Float32 methods, generated once per address space in `atomic_memory_types`.
+for as in atomic_memory_types
+@eval begin
+
+# exchange maps directly onto the `atomic_xchg` builtin with Float32 operands
+@device_function atomic_xchg!(p::LLVMPtr{Float32,$as}, val::Float32) =
+    @builtin_ccall("atomic_xchg", Float32, (LLVMPtr{Float32,$as}, Float32,), p, val)
+
+# XXX: why is only xchg supported on floats? isn't it safe for cmpxchg too,
+#      which should only perform bitwise comparisons?
+# Implemented on top of the UInt32 method: the pointer and both operands are
+# reinterpreted as UInt32, and the returned value is reinterpreted back to Float32.
+@device_function atomic_cmpxchg!(p::LLVMPtr{Float32,$as}, cmp::Float32, val::Float32) =
+    reinterpret(Float32, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt32,$as}, p),
+                                         reinterpret(UInt32, cmp),
+                                         reinterpret(UInt32, val)))
+
+end
+end
+
+
+
+# documentation
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old + val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_add!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old - val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_sub!
+
+"""
+Swaps the old value stored at location `p` with new value given by `val`.
+Returns old value.
+"""
+atomic_xchg!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute (`old` + 1) and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_inc!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute (`old` - 1) and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_dec!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `(old == cmp) ? val : old` and store result at location pointed by `p`.
+The function returns `old`.
+"""
+atomic_cmpxchg!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `min(old, val)` and store minimum value at location pointed by `p`. The
+function returns `old`.
+"""
+atomic_min!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `max(old, val)` and store maximum value at location pointed by `p`. The
+function returns `old`.
+"""
+atomic_max!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old & val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_and!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old | val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_or!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old ⊻ val` (bitwise exclusive or) and store result at location pointed by `p`.
+The function returns `old`.
+"""
+atomic_xor!
+
+
+
+#
+# High-level interface
+#
+
+# prototype of a high-level interface for performing atomic operations on arrays
+#
+# this design could be generalized by having atomic {field,array}{set,ref} accessors, as
+# well as acquire/release operations to implement the fallback functionality where any
+# operation can be applied atomically.
+
+# Lookup table from the head of an updating assignment (e.g. `+=`) to the binary operator
+# it applies (e.g. `+`); each key is the operator's name with `=` appended.
+const inplace_ops = Dict(
+    Symbol(op, "=") => op
+    for op in (:(+), :(-), :(*), :(/), :(÷), :(&), :(|), :(⊻))
+)
+
+# Exception thrown by `@atomic` when it is handed an expression it cannot decode;
+# `msg` describes what was wrong with the expression.
+struct AtomicError <: Exception
+    msg::AbstractString
+end
+
+# Render as `AtomicError: <msg>`.
+function Base.showerror(io::IO, err::AtomicError)
+    print(io, "AtomicError: ", err.msg)
+end
+
+"""
+    @atomic a[I] = op(a[I], val)
+    @atomic a[I] ...= val
+
+Atomically perform a sequence of operations that loads an array element `a[I]`, performs the
+operation `op` on that value and a second value `val`, and writes the result back to the
+array. This sequence can be written out as a regular assignment, in which case the same
+array element should be used in the left and right hand side of the assignment, or as an
+in-place application of a known operator. In both cases, the array reference should be pure
+and not induce any side-effects.
+
+The expression is rewritten into a call to `atomic_arrayset(a, (I...,), op, val)`. An
+`AtomicError` is thrown during macro expansion when the expression is not one of the two
+supported forms, when the right-hand side is not a call with exactly two arguments whose
+first argument is the left-hand side, or when the target is not an array reference.
+
+!!! warning
+    This interface is experimental, and might change without warning.  Use the lower-level
+    `atomic_...!` functions for a stable API.
+"""
+macro atomic(ex)
+    # literals, bare symbols, etc. have no head/args to decode
+    ex isa Expr || throw(AtomicError("unknown @atomic expression"))
+
+    # decode assignment and call
+    if ex.head == :(=)
+        ref = ex.args[1]
+        rhs = ex.args[2]
+        Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call"))
+        # require exactly `op(ref, val)`: fewer arguments leave nothing to apply, and extra
+        # ones (e.g. `a[i] + x + y`, which parses as `+(a[i], x, y)`) would otherwise be
+        # dropped silently
+        length(rhs.args) == 3 || throw(AtomicError("right-hand side of an @atomic assignment should be a call with two arguments"))
+        op = rhs.args[1]
+        if rhs.args[2] != ref
+            throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side"))
+        end
+        val = rhs.args[3]
+    elseif haskey(inplace_ops, ex.head)
+        op = inplace_ops[ex.head]
+        ref = ex.args[1]
+        val = ex.args[2]
+    else
+        throw(AtomicError("unknown @atomic expression"))
+    end
+
+    # decode array expression
+    Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression"))
+    array = ref.args[1]
+    indices = Expr(:tuple, ref.args[2:end]...)
+
+    # `atomic_arrayset` is interpolated as the function itself, so the escaped expansion
+    # does not depend on what that name refers to at the call site
+    esc(quote
+        $atomic_arrayset($array, $indices, $op, $val)
+    end)
+end
+
+# FIXME: make this respect the indexing style
+# Tuple-indexed entry point (the form `@atomic` expands to): collapse the index tuple to a
+# linear index, convert `val` to the array's element type, and forward to the
+# `Integer`-indexed methods.
+@inline function atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function,
+                                 val) where {T}
+    linear = Base._to_linear_index(A, Is...)
+    return atomic_arrayset(A, linear, op, convert(T, val))
+end
+
+# native atomics: on Int32/UInt32 arrays each of these operators is forwarded to the
+# corresponding `atomic_...!` intrinsic, applied to a pointer to element `I`
+for (op, impl) in ((+)      => atomic_add!,
+                   (-)      => atomic_sub!,
+                   (&)      => atomic_and!,
+                   (|)      => atomic_or!,
+                   (⊻)      => atomic_xor!,
+                   Base.max => atomic_max!,
+                   Base.min => atomic_min!)
+    @eval @inline function atomic_arrayset(A::AbstractArray{T}, I::Integer,
+                                           ::typeof($op),
+                                           val::T) where {T <: Union{Int32,UInt32}}
+        return $impl(pointer(A, I), val)
+    end
+end
+
+# fallback using compare-and-swap
+#
+# Loads element `I`, computes `op(old, val)` converted to the element type, and tries to
+# install it with `atomic_cmpxchg!`; when another update got in between, the value
+# returned by the exchange becomes the new `old` and the computation is retried.
+#
+# Returns the value that was stored (`new`). Note that the native-atomic methods instead
+# return the result of the intrinsic they call, which is documented as the old value.
+function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T}
+    ptr = pointer(A, I)
+    old = Base.unsafe_load(ptr, 1)
+    while true
+        cmp = old
+        new = convert(T, op(old, val))
+        old = atomic_cmpxchg!(ptr, cmp, new)
+        # The Float32 `atomic_cmpxchg!` compares bit patterns (it is implemented on the
+        # UInt32 reinterpretation), so success must be detected with `===` rather than
+        # `==`: `NaN == NaN` is false, which would retry forever after a successful swap,
+        # and `0.0 == -0.0` is true, which would report success for a swap that failed.
+        (old === cmp) && return new
+    end
+end
diff --git a/lib/intrinsics/src/integer.jl b/lib/intrinsics/src/integer.jl
@@ -0,0 +1,53 @@
+# Integer Functions
+
+# TODO: vector types
+# element types for which the generically typed integer intrinsics below are generated
+const generic_integer_types = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64]
+
+
+# generically typed
+
+# Each definition forwards to the OpenCL builtin of the same name, with all arguments and
+# the return value of type `gentype`. `abs` and `clamp` are registered as device overrides
+# of the corresponding `Base` functions; the others are new device functions.
+for gentype in generic_integer_types
+@eval begin
+
+@device_override Base.abs(x::$gentype) = @builtin_ccall("abs", $gentype, ($gentype,), x)
+@device_function abs_diff(x::$gentype, y::$gentype) = @builtin_ccall("abs_diff", $gentype, ($gentype, $gentype), x, y)
+
+@device_function add_sat(x::$gentype, y::$gentype) = @builtin_ccall("add_sat", $gentype, ($gentype, $gentype), x, y)
+@device_function hadd(x::$gentype, y::$gentype) = @builtin_ccall("hadd", $gentype, ($gentype, $gentype), x, y)
+@device_function rhadd(x::$gentype, y::$gentype) = @builtin_ccall("rhadd", $gentype, ($gentype, $gentype), x, y)
+
+@device_override Base.clamp(x::$gentype, minval::$gentype, maxval::$gentype) = @builtin_ccall("clamp", $gentype, ($gentype, $gentype, $gentype), x, minval, maxval)
+
+@device_function clz(x::$gentype) = @builtin_ccall("clz", $gentype, ($gentype,), x)
+@device_function ctz(x::$gentype) = @builtin_ccall("ctz", $gentype, ($gentype,), x)
+
+@device_function mad_hi(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad_hi", $gentype, ($gentype, $gentype, $gentype), a, b, c)
+@device_function mad_sat(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad_sat", $gentype, ($gentype, $gentype, $gentype), a, b, c)
+
+# XXX: these definitions introduce ambiguities
+#@device_override Base.max(x::$gentype, y::$gentype) = @builtin_ccall("max", $gentype, ($gentype, $gentype), x, y)
+#@device_override Base.min(x::$gentype, y::$gentype) = @builtin_ccall("min", $gentype, ($gentype, $gentype), x, y)
+
+@device_function mul_hi(x::$gentype, y::$gentype) = @builtin_ccall("mul_hi", $gentype, ($gentype, $gentype), x, y)
+
+@device_function rotate(v::$gentype, i::$gentype) = @builtin_ccall("rotate", $gentype, ($gentype, $gentype), v, i)
+
+@device_function sub_sat(x::$gentype, y::$gentype) = @builtin_ccall("sub_sat", $gentype, ($gentype, $gentype), x, y)
+
+@device_function popcount(x::$gentype) = @builtin_ccall("popcount", $gentype, ($gentype,), x)
+
+# NOTE(review): OpenCL C specifies `mad24`/`mul24` only for 32-bit `int`/`uint` operands;
+# confirm whether the methods generated here for the other widths resolve to a builtin.
+@device_function mad24(x::$gentype, y::$gentype, z::$gentype) = @builtin_ccall("mad24", $gentype, ($gentype, $gentype, $gentype), x, y, z)
+@device_function mul24(x::$gentype, y::$gentype) = @builtin_ccall("mul24", $gentype, ($gentype, $gentype), x, y)
+
+end
+end
+
+
+# specifically typed
+
+# `upsample(hi, lo)` forwards to the OpenCL `upsample` builtin, which takes a `hi` value
+# and an unsigned `lo` value of the same width and returns an integer of twice that width
+# (signed when `hi` is signed). Every method is declared through `@device_function`, like
+# the other intrinsics in this file.
+@device_function upsample(hi::Cchar, lo::Cuchar) = @builtin_ccall("upsample", Cshort, (Cchar, Cuchar), hi, lo)
+@device_function upsample(hi::Cuchar, lo::Cuchar) = @builtin_ccall("upsample", Cushort, (Cuchar, Cuchar), hi, lo)
+@device_function upsample(hi::Cshort, lo::Cushort) = @builtin_ccall("upsample", Cint, (Cshort, Cushort), hi, lo)
+@device_function upsample(hi::Cushort, lo::Cushort) = @builtin_ccall("upsample", Cuint, (Cushort, Cushort), hi, lo)
+@device_function upsample(hi::Cint, lo::Cuint) = @builtin_ccall("upsample", Clong, (Cint, Cuint), hi, lo)
+@device_function upsample(hi::Cuint, lo::Cuint) = @builtin_ccall("upsample", Culong, (Cuint, Cuint), hi, lo)