
RFC: Support Adapt.AbstractGPUDevice #297

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -56,7 +56,7 @@ kernel on it instead. For example, launching on a CUDA GPU:
```julia
using CUDAKernels # Required to access CUDADevice
A = CUDA.ones(1024, 1024)
-kernel = mul2(CUDADevice(), 16)
+kernel = mul2(get_computing_device(A), 16)
# ... the rest is the same!
```

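For reference, a minimal end-to-end sketch of the launch pattern the updated docs rely on. It assumes the proposed `Adapt.get_computing_device` API from this RFC together with the existing event-based launch interface; the `mul2` kernel body is an illustrative stand-in rather than the exact definition from the docs:

```julia
using KernelAbstractions, CUDA, CUDAKernels
using Adapt: get_computing_device  # proposed API (this RFC)

# Simple element-wise kernel, analogous to the docs example.
@kernel function mul2(A)
    I = @index(Global)
    A[I] = 2 * A[I]
end

A = CUDA.ones(1024, 1024)
dev = get_computing_device(A)      # device derived from the array, no hard-coded CUDADevice()
kernel = mul2(dev, 16)             # workgroup size 16, as in the docs snippet
event = kernel(A, ndrange=size(A))
wait(event)
@assert all(Array(A) .== 2.0f0)
```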
4 changes: 2 additions & 2 deletions examples/matmul.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions, Test
+using KernelAbstractions, Adapt, Test
include(joinpath(@__DIR__, "utils.jl")) # Load backend

if has_cuda && has_cuda_gpu()
@@ -24,7 +24,7 @@ function matmul!(a, b, c)
println("Matrix size mismatch!")
return nothing
end
-device = KernelAbstractions.get_device(a)
+device = get_computing_device(a)
n = device isa GPU ? 256 : 4
kernel! = matmul_kernel!(device, n)
kernel!(a, b, c, ndrange=size(c))
2 changes: 1 addition & 1 deletion examples/memcopy.jl
@@ -22,7 +22,7 @@ wait(event)
if has_cuda && has_cuda_gpu()
function mycopy!(A::CuArray, B::CuArray)
@assert size(A) == size(B)
-copy_kernel!(CUDADevice(), 256)(A, B, ndrange=length(A))
+copy_kernel!(get_computing_device(A), 256)(A, B, ndrange=length(A))
end

A = CuArray{Float32}(undef, 1024)
2 changes: 1 addition & 1 deletion examples/memcopy_static.jl
@@ -22,7 +22,7 @@ if has_cuda && has_cuda_gpu()

function mycopy_static!(A::CuArray, B::CuArray)
@assert size(A) == size(B)
-kernel = copy_kernel!(CUDADevice(), 32, size(A)) # if size(A) varies this will cause recompilation
+kernel = copy_kernel!(get_computing_device(A), 32, size(A)) # if size(A) varies this will cause recompilation
kernel(A, B, ndrange=size(A))
end

2 changes: 1 addition & 1 deletion examples/mpi.jl
@@ -11,7 +11,7 @@ end

using MPI

-device(A) = typeof(A) <: Array ? CPU() : CUDADevice()
+device(A) = typeof(A) <: Array ? CPU() : CUDA.device()

function mpiyield()
MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
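A possible follow-up, not part of this diff: with the proposed `Adapt.get_computing_device` the manual `Array`-versus-GPU branch above could collapse into a single call, assuming the API resolves both host and device arrays:

```julia
using Adapt: get_computing_device  # proposed API (this RFC)

# Hypothetical simplification of the helper above.
device(A) = get_computing_device(A)
```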
4 changes: 2 additions & 2 deletions examples/naive_transpose.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions, Test
+using KernelAbstractions, Adapt, Test
include(joinpath(@__DIR__, "utils.jl")) # Load backend

if has_cuda && has_cuda_gpu()
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
println("Matrix size mismatch!")
return nothing
end
-device = KernelAbstractions.get_device(a)
+device = get_computing_device(a)
n = device isa GPU ? 256 : 4
kernel! = naive_transpose_kernel!(device, n)
kernel!(a, b, ndrange=size(a))
12 changes: 6 additions & 6 deletions examples/performance.jl
@@ -133,8 +133,8 @@ end

for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TILE_DIM))
for (name, kernel) in (
("copy", simple_copy_kernel!(CUDADevice(), block_dims)),
("transpose", simple_transpose_kernel!(CUDADevice(), block_dims)),
("copy", simple_copy_kernel!(CUDA.device(), block_dims)),
("transpose", simple_transpose_kernel!(CUDA.device(), block_dims)),
)
NVTX.@range "Simple $name $block_dims" let
input = CUDA.rand(T, (N, N))
@@ -154,8 +154,8 @@ end

# Benchmark localmem
for (name, kernel) in (
("copy", lmem_copy_kernel!(CUDADevice(), (TILE_DIM, TILE_DIM))),
("transpose", lmem_transpose_kernel!(CUDADevice(), (TILE_DIM, TILE_DIM))),
("copy", lmem_copy_kernel!(CUDA.device(), (TILE_DIM, TILE_DIM))),
("transpose", lmem_transpose_kernel!(CUDA.device(), (TILE_DIM, TILE_DIM))),
)
for bank in (true, false)
NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
@@ -176,8 +176,8 @@ end

# Benchmark localmem + multiple elements per lane
for (name, kernel) in (
("copy", coalesced_copy_kernel!(CUDADevice(), (TILE_DIM, BLOCK_ROWS))),
("transpose", coalesced_transpose_kernel!(CUDADevice(), (TILE_DIM, BLOCK_ROWS))),
("copy", coalesced_copy_kernel!(CUDA.device(), (TILE_DIM, BLOCK_ROWS))),
("transpose", coalesced_transpose_kernel!(CUDA.device(), (TILE_DIM, BLOCK_ROWS))),
)
for bank in (true, false)
NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
25 changes: 13 additions & 12 deletions lib/CUDAKernels/src/CUDAKernels.jl
@@ -6,10 +6,9 @@ import StaticArrays: MArray
import Adapt
import KernelAbstractions

-export CUDADevice
+using Adapt: get_computing_device

-KernelAbstractions.get_device(::CUDA.CuArray) = CUDADevice()
-KernelAbstractions.get_device(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDADevice()
+export CUDADevice

const FREE_STREAMS = CUDA.CuStream[]
const STREAMS = CUDA.CuStream[]
@@ -94,7 +93,7 @@ end

import KernelAbstractions: Event, CPUEvent, NoneEvent, MultiEvent, CPU, GPU, isdone, failed

-struct CUDADevice <: GPU end
+const CUDADevice = CUDA.CuDevice

struct CudaEvent <: Event
event::CUDA.CuEvent
@@ -103,6 +102,8 @@ end
failed(::CudaEvent) = false
isdone(ev::CudaEvent) = CUDA.query(ev.event)

+Adapt.get_computing_device(ev::CudaEvent) = get_computing_device(ev.event)

function Event(::CUDADevice)
stream = CUDA.stream()
event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
@@ -134,11 +135,11 @@ function wait(::CPU, ev::CudaEvent, progress=nothing)

event = Base.Event()
stream = next_stream()
-wait(CUDADevice(), ev, nothing, stream)
+dev = get_computing_device(ev)
+wait(dev, ev, nothing, stream)
CUDA.launch(;stream) do
notify(event)
end
-dev = CUDA.device()
# if an error occurs, the callback may never fire, so use a timer to detect such cases
timer = Timer(0; interval=1)
Base.@sync begin
@@ -169,15 +170,15 @@ end
wait(::CUDADevice, ev::CudaEvent, progress=nothing, stream=CUDA.stream()) = CUDA.wait(ev.event, stream)
wait(::CUDADevice, ev::NoneEvent, progress=nothing, stream=nothing) = nothing

-function wait(::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream())
+function wait(dev::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream())
dependencies = collect(ev.events)
cudadeps = filter(d->d isa CudaEvent, dependencies)
otherdeps = filter(d->!(d isa CudaEvent), dependencies)
for event in cudadeps
CUDA.wait(event.event, stream)
end
for event in otherdeps
-wait(CUDADevice(), event, progress, stream)
+wait(dev, event, progress, stream)
end
end

@@ -208,12 +209,12 @@ function __pin!(a)
return nothing
end

-function KernelAbstractions.async_copy!(::CUDADevice, A, B; dependencies=nothing, progress=yield)
+function KernelAbstractions.async_copy!(dev::CUDADevice, A, B; dependencies=nothing, progress=yield)
A isa Array && __pin!(A)
B isa Array && __pin!(B)

stream = next_stream()
-wait(CUDADevice(), MultiEvent(dependencies), progress, stream)
+wait(dev, MultiEvent(dependencies), progress, stream)
event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
GC.@preserve A B begin
destptr = pointer(A)
@@ -264,7 +265,7 @@ function threads_to_workgroupsize(threads, ndrange)
end
end

-function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(CUDADevice()), workgroupsize=nothing, progress=yield)
+function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(get_computing_device(first(args))), workgroupsize=nothing, progress=yield)

ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
# this might not be the final context, since we may tune the workgroupsize
@@ -294,7 +295,7 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(
end

stream = next_stream()
-wait(CUDADevice(), MultiEvent(dependencies), progress, stream)
+wait(get_computing_device(first(args)), MultiEvent(dependencies), progress, stream)

# Launch kernel
event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
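Note that this file no longer maps `CuArray` to a device; the RFC assumes that resolution is provided by `Adapt.get_computing_device` itself (in CUDA.jl or Adapt glue code). A sketch of what such an assumed method could look like, built on `CUDA.device(A)`, which returns the `CuDevice` owning the array's memory:

```julia
import Adapt, CUDA

# Assumed to live in CUDA.jl or Adapt.jl rather than in CUDAKernels under this RFC.
Adapt.get_computing_device(A::CUDA.CuArray) = CUDA.device(A)
```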
4 changes: 1 addition & 3 deletions lib/ROCKernels/src/ROCKernels.jl
@@ -9,7 +9,7 @@ import KernelAbstractions

export ROCDevice

-KernelAbstractions.get_device(::AMDGPU.ROCArray) = ROCDevice()
+get_computing_device(::AMDGPU.ROCArray) = ROCDevice()


const FREE_QUEUES = HSAQueue[]
@@ -60,8 +60,6 @@ end

import KernelAbstractions: Event, CPUEvent, NoneEvent, MultiEvent, CPU, GPU, isdone, failed

-struct ROCDevice <: GPU end

struct ROCEvent{T<:Union{AMDGPU.HSA.Signal,HSAStatusSignal}} <: Event
event::T
end
23 changes: 4 additions & 19 deletions src/KernelAbstractions.jl
@@ -333,28 +333,13 @@ constify(arg) = adapt(ConstAdaptor(), arg)
# Backend hierarchy
###

-abstract type Device end
-abstract type GPU <: Device end
+const Device = AbstractComputingDevice
+const GPU = AbstractGPUDevice
+const CPU = CPUDevice

-struct CPU <: Device end
+Base.@deprecate get_device(A::AbstractArray) get_computing_device(A)


"""
KernelAbstractions.get_device(A::AbstractArray)::KernelAbstractions.Device

Get a `KernelAbstractions.Device` instance suitable for array `A`.
"""
function get_device end

# Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
get_device(A::AbstractArray) = get_device(parent(A))

get_device(A::AbstractSparseArray) = get_device(rowvals(A))
get_device(A::Diagonal) = get_device(A.diag)
get_device(A::Tridiagonal) = get_device(A.d)

get_device(::Array) = CPU()

include("nditeration.jl")
using .NDIteration
import .NDIteration: get
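To illustrate the intended user-facing effect of the aliases and the deprecation above, a sketch assuming the proposed `AbstractComputingDevice`/`CPUDevice` hierarchy is available from Adapt:

```julia
using KernelAbstractions
using Adapt: get_computing_device  # proposed API (this RFC)

A = rand(Float32, 16, 16)

dev = get_computing_device(A)              # expected to be a CPU device for a plain Array
@assert dev isa KernelAbstractions.CPU     # CPU is now an alias for Adapt's CPUDevice
@assert dev isa KernelAbstractions.Device  # Device is an alias for AbstractComputingDevice

# The old entry point keeps working but emits a deprecation warning
# and forwards to get_computing_device(A):
KernelAbstractions.get_device(A)
```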
14 changes: 0 additions & 14 deletions test/test.jl
@@ -66,20 +66,6 @@ end
A[I] = i
end

@testset "get_device" begin
x = ArrayT(rand(Float32, 5))
A = ArrayT(rand(Float32, 5,5))
device = backend()
@test @inferred(KernelAbstractions.get_device(A)) == device
@test @inferred(KernelAbstractions.get_device(view(A, 2:4, 1:3))) == device
if !(isdefined(Main, :ROCKernels) && (device isa Main.ROCKernels.ROCDevice))
# Sparse arrays are not supported by the ROCm backend yet:
@test @inferred(KernelAbstractions.get_device(sparse(A))) == device
end
@test @inferred(KernelAbstractions.get_device(Diagonal(x))) == device
@test @inferred(KernelAbstractions.get_device(Tridiagonal(A))) == device
end

@testset "indextest" begin
# TODO: add test for _group and _local_cartesian
A = ArrayT{Int}(undef, 16, 16)