Skip to content

Commit

Permalink
Add intrinsics from oneAPI.jl.
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt committed Sep 11, 2024
1 parent 8a76c69 commit 599980e
Show file tree
Hide file tree
Showing 16 changed files with 1,031 additions and 5 deletions.
12 changes: 11 additions & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,19 @@ steps:
plugins:
- JuliaCI/julia#v1:
version: "1.10"
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
# Registers the in-repo SPIRVIntrinsics package via `Pkg.develop`, the same API call
# the GitHub Actions workflow uses, then instantiates the project and runs the tests.
commands: |
  julia --project -e '
    using Pkg
    println("--- :julia: Instantiating project")
    Pkg.develop(path="lib/intrinsics")
    Pkg.instantiate()
    println("+++ :julia: Running tests")
    Pkg.test(; coverage=true)'
agents:
queue: "juliagpu"
cuda: "*"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: julia-actions/cache@v2
- uses: julia-actions/julia-buildpkg@v1
- run: julia --project -e 'using Pkg; Pkg.develop(path="lib/intrinsics")'
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v4
Expand Down
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361"

[compat]
OpenCL_jll = "2024.5.8"
julia = "1.10"
LLVM = "9.1"
9 changes: 9 additions & 0 deletions lib/intrinsics/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Package metadata for the SPIRVIntrinsics subpackage (lib/intrinsics).
name = "SPIRVIntrinsics"
uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
authors = ["Tim Besard <[email protected]>"]
version = "0.1.0"

# Direct dependencies of the package.
# NOTE(review): no [compat] bounds are declared for these — confirm that is intended.
[deps]
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
25 changes: 25 additions & 0 deletions lib/intrinsics/src/SPIRVIntrinsics.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
    SPIRVIntrinsics

Device intrinsics, currently expressed in terms of OpenCL built-ins. Judging by the
names of the included files, they cover work items, synchronization, memory, printf,
math, integer and atomic functionality.
"""
module SPIRVIntrinsics

using LLVM, LLVM.Interop
using Core: LLVMPtr

import ExprTools

import SpecialFunctions

# shared definitions, included before the intrinsics themselves
# NOTE(review): presumably these provide the helpers used by the files below
# (e.g. the macros and address spaces) — confirm; if so the include order matters
include("pointer.jl")
include("utils.jl")

# OpenCL intrinsics
#
# we currently don't implement SPIR-V intrinsics directly, but rely on
# the SPIR-V to LLVM translator supporting OpenCL intrinsics
include("work_item.jl")
include("synchronization.jl")
include("memory.jl")
include("printf.jl")
include("math.jl")
include("integer.jl")
include("atomic.jl")

end
264 changes: 264 additions & 0 deletions lib/intrinsics/src/atomic.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
# Atomic Functions

# TODO: support for 64-bit atomics via atom_cmpxchg (from cl_khr_int64_base_atomics)

# "atomic operations on 32-bit signed, unsigned integers and single precision
# floating-point to locations in __global or __local memory"

# element types for which the generically typed atomics below are generated
const atomic_integer_types = [UInt32, Int32]
# TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS
# TODO: additional floating-point atomics with ZE_extension_float_atomics
# pointer address spaces for which the atomics below are generated
const atomic_memory_types = [AS.Local, AS.Global]


# generically typed

# Generate one method per (element type, address space) pair for every integer atomic.
# The name handed to `@builtin_ccall` is the Julia function name without the trailing `!`.
for gentype in atomic_integer_types, as in atomic_memory_types
    # operations that take a pointer and a single operand
    for op in (:add, :sub, :min, :max, :and, :or, :xor, :xchg)
        fname = Symbol(:atomic_, op, :!)
        cname = string("atomic_", op)
        @eval @device_function $fname(p::LLVMPtr{$gentype,$as}, val::$gentype) =
            @builtin_ccall($cname, $gentype,
                           (LLVMPtr{$gentype,$as}, $gentype), p, val)
    end

    # increment / decrement only take the pointer
    for op in (:inc, :dec)
        fname = Symbol(:atomic_, op, :!)
        cname = string("atomic_", op)
        @eval @device_function $fname(p::LLVMPtr{$gentype,$as}) =
            @builtin_ccall($cname, $gentype, (LLVMPtr{$gentype,$as},), p)
    end

    # compare-and-swap takes the expected value and the replacement
    @eval @device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) =
        @builtin_ccall("atomic_cmpxchg", $gentype,
                       (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val)
end


# specifically typed

# Float32 atomics, generated once per address space in `atomic_memory_types`.
for as in atomic_memory_types
    @eval begin

        # exchange passes the Float32 operands straight to the `atomic_xchg` built-in
        @device_function atomic_xchg!(p::LLVMPtr{Float32,$as}, val::Float32) =
            @builtin_ccall("atomic_xchg", Float32, (LLVMPtr{Float32,$as}, Float32,), p, val)

        # XXX: why is only xchg supported on floats? isn't it safe for cmpxchg too,
        # which should only perform bitwise comparisons?
        #
        # compare-and-swap reuses the UInt32 method: the pointer and both operands are
        # reinterpreted as UInt32 and the returned value is reinterpreted back to Float32
        @device_function atomic_cmpxchg!(p::LLVMPtr{Float32,$as}, cmp::Float32, val::Float32) =
            reinterpret(Float32, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt32,$as}, p),
                                                 reinterpret(UInt32, cmp),
                                                 reinterpret(UInt32, val)))

    end
end



# documentation

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `old + val` and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_add!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `old - val` and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_sub!

"""
Swaps the old value stored at location `p` with new value given by `val`.
Returns old value.
"""
atomic_xchg!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute (`old` + 1) and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_inc!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute (`old` - 1) and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_dec!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `(old == cmp) ? val : old` and store result at location pointed by `p`.
The function returns `old`.
"""
atomic_cmpxchg!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `min(old, val)` and store minimum value at location pointed by `p`. The
function returns `old`.
"""
atomic_min!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `max(old, val)` and store maximum value at location pointed by `p`. The
function returns `old`.
"""
atomic_max!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `old & val` and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_and!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `old | val` and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_or!

"""
Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
Compute `old ^ val` and store result at location pointed by `p`. The function
returns `old`.
"""
atomic_xor!



#
# High-level interface
#

# prototype of a high-level interface for performing atomic operations on arrays
#
# this design could be generalized by having atomic {field,array}{set,ref} accessors, as
# well as acquire/release operations to implement the fallback functionality where any
# operation can be applied atomically.

# Maps the head of a parsed in-place assignment (e.g. `+=`) to the binary operator that
# `@atomic` applies. Every value must be an operator symbol: it is spliced into the
# generated `atomic_arrayset` call as the `op` argument.
const inplace_ops = Dict(
    :(+=) => :(+),
    :(-=) => :(-),
    :(*=) => :(*),
    :(/=) => :(/),
    :(÷=) => :(÷),
    :(&=) => :(&),
    :(|=) => :(|),
    :(⊻=) => :(⊻),
)

# Exception thrown by `@atomic` when the expression it is given cannot be decoded.
struct AtomicError <: Exception
    msg::AbstractString
end

# Render the error as `AtomicError: <msg>`.
function Base.showerror(io::IO, err::AtomicError)
    print(io, "AtomicError: ", err.msg)
end

"""
    @atomic a[I] = op(a[I], val)
    @atomic a[I] ...= val

Atomically perform a sequence of operations that loads an array element `a[I]`, performs the
operation `op` on that value and a second value `val`, and writes the result back to the
array. This sequence can be written out as a regular assignment, in which case the same
array element should be used in the left and right hand side of the assignment, or as an
in-place application of a known operator. In both cases, the array reference should be pure
and not induce any side-effects.

!!! warning
    This interface is experimental, and might change without warning. Use the lower-level
    `atomic_...!` functions for a stable API.
"""
macro atomic(ex)
    # a bare symbol or literal has no `head`/`args` and cannot be decoded
    ex isa Expr || throw(AtomicError("unknown @atomic expression"))

    # decode assignment and call
    if ex.head == :(=)
        ref = ex.args[1]
        rhs = ex.args[2]
        Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call"))
        op = rhs.args[1]
        # the first operand of the call has to be the very reference being assigned to
        if !(length(rhs.args) >= 2 && rhs.args[2] == ref)
            throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side"))
        end
        # only `op(lhs, val)` can be decoded: a missing operand cannot be applied, and
        # additional ones (`a[i] + x + y` parses as `+(a[i], x, y)`) would be dropped
        length(rhs.args) == 3 || throw(AtomicError("right-hand side of an @atomic assignment should be a call with exactly two arguments"))
        val = rhs.args[3]
    elseif haskey(inplace_ops, ex.head)
        # `a[I] op= val`: look up the binary operator behind the in-place form
        op = inplace_ops[ex.head]
        ref = ex.args[1]
        val = ex.args[2]
    else
        throw(AtomicError("unknown @atomic expression"))
    end

    # decode array expression
    Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression"))
    array = ref.args[1]
    indices = Expr(:tuple, ref.args[2:end]...)

    # the function object itself is spliced in, so the generated call does not depend
    # on `atomic_arrayset` being visible in the caller's scope
    esc(quote
        $atomic_arrayset($array, $indices, $op, $val)
    end)
end

# Entry point used by `@atomic`: turn the index tuple into a linear index and convert
# `val` to the array's element type before dispatching on the operator.
# FIXME: make this respect the indexing style
@inline function atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function,
                                 val) where {T}
    linear = Base._to_linear_index(A, Is...)
    return atomic_arrayset(A, linear, op, convert(T, val))
end

# native atomics
#
# operators that have a matching `atomic_*!` intrinsic call it directly on the element's
# pointer; this only applies to the 32-bit integer element types
for (op, impl) in [(+) => atomic_add!,
                   (-) => atomic_sub!,
                   (&) => atomic_and!,
                   (|) => atomic_or!,
                   (⊻) => atomic_xor!,
                   Base.max => atomic_max!,
                   Base.min => atomic_min!]
    @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op),
                                  val::T) where {T <: Union{Int32,UInt32}} =
        $impl(pointer(A, I), val)
end

# fallback using compare-and-swap
#
# Repeatedly computes `op(old, val)` and tries to install it with `atomic_cmpxchg!`
# until no other update slipped in between the load and the exchange. Returns the
# value that was stored.
function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T}
    ptr = pointer(A, I)
    old = Base.unsafe_load(ptr, 1)
    while true
        cmp = old
        new = convert(T, op(old, val))
        old = atomic_cmpxchg!(ptr, cmp, new)
        # The exchange happened iff the value read back is bit-identical to the expected
        # one. `==` is wrong for floats: NaN never compares equal (endless loop) and
        # `0.0 == -0.0` would report success although nothing was stored.
        (old === cmp) && return new
    end
end
53 changes: 53 additions & 0 deletions lib/intrinsics/src/integer.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Integer Functions

# TODO: vector types
const generic_integer_types = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64]


# generically typed

# For every integer type, generate the intrinsics below. All arguments and the return
# value share that type, and the name handed to `@builtin_ccall` equals the function name.
for gentype in generic_integer_types
    # methods added to existing Base functions
    @eval @device_override Base.abs(x::$gentype) =
        @builtin_ccall("abs", $gentype, ($gentype,), x)
    @eval @device_override Base.clamp(x::$gentype, minval::$gentype, maxval::$gentype) =
        @builtin_ccall("clamp", $gentype, ($gentype, $gentype, $gentype), x, minval, maxval)

    # XXX: these definitions introduce ambiguities
    #@device_override Base.max(x::$gentype, y::$gentype) = @builtin_ccall("max", $gentype, ($gentype, $gentype), x, y)
    #@device_override Base.min(x::$gentype, y::$gentype) = @builtin_ccall("min", $gentype, ($gentype, $gentype), x, y)

    # new functions: name => positional argument names
    for (fname, args) in (:abs_diff => (:x, :y),
                          :add_sat  => (:x, :y),
                          :hadd     => (:x, :y),
                          :rhadd    => (:x, :y),
                          :clz      => (:x,),
                          :ctz      => (:x,),
                          :mad_hi   => (:a, :b, :c),
                          :mad_sat  => (:a, :b, :c),
                          :mul_hi   => (:x, :y),
                          :rotate   => (:v, :i),
                          :sub_sat  => (:x, :y),
                          :popcount => (:x,),
                          :mad24    => (:x, :y, :z),
                          :mul24    => (:x, :y))
        cname = String(fname)
        # `x::T, y::T, ...` for the signature and `(T, T, ...)` for the call
        params = [:($arg::$gentype) for arg in args]
        argtypes = Expr(:tuple, (gentype for _ in args)...)
        @eval @device_function $fname($(params...)) =
            @builtin_ccall($cname, $gentype, $argtypes, $(args...))
    end
end


# specifically typed

# `upsample` takes a high and a low half and returns the next wider C integer type
# (see the signatures). Every method carries `@device_function`, matching the first
# method and the other intrinsics in this file.
@device_function upsample(hi::Cchar, lo::Cuchar) = @builtin_ccall("upsample", Cshort, (Cchar, Cuchar), hi, lo)
@device_function upsample(hi::Cuchar, lo::Cuchar) = @builtin_ccall("upsample", Cushort, (Cuchar, Cuchar), hi, lo)
@device_function upsample(hi::Cshort, lo::Cushort) = @builtin_ccall("upsample", Cint, (Cshort, Cushort), hi, lo)
@device_function upsample(hi::Cushort, lo::Cushort) = @builtin_ccall("upsample", Cuint, (Cushort, Cushort), hi, lo)
# NOTE(review): `Clong`/`Culong` are 32-bit on Windows hosts — confirm the intended width
@device_function upsample(hi::Cint, lo::Cuint) = @builtin_ccall("upsample", Clong, (Cint, Cuint), hi, lo)
@device_function upsample(hi::Cuint, lo::Cuint) = @builtin_ccall("upsample", Culong, (Cuint, Cuint), hi, lo)
Loading

0 comments on commit 599980e

Please sign in to comment.