new GPUArrays interface for KA transition #2315

Draft · wants to merge 8 commits into base: master
.buildkite/pipeline.yml (36 changes: 35 additions & 1 deletion)
@@ -43,7 +43,6 @@ steps:
   # then, test supported CUDA toolkits (installed through the artifact system)
   - group: "CUDA"
     key: "cuda"
-    depends_on: "julia"
     steps:
       - label: "CUDA {{matrix.cuda}}"
         plugins:
@@ -84,6 +83,33 @@ steps:
           echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml
           echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml
 
+  - group: "Memory"
+    key: "memory"
+    steps:
+      - label: "CuArray with {{matrix.memory}} memory"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1.10"
+          - JuliaCI/julia-test#v1:
+              test_args: "--quickfail core base libraries"
+          - JuliaCI/julia-coverage#v1:
+              dirs:
+                - src
+                - lib
+                - examples
+        agents:
+          queue: "juliagpu"
+          cuda: "*"
+        if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip memory\]/ && !build.pull_request.draft
+        timeout_in_minutes: 30
+        matrix:
+          setup:
+            memory:
+              - "unified"
+              - "host"
+        commands: |
+          echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml
+
   - group: ":nesting_dolls: Subpackages"
     depends_on: "cuda"
     steps:
@@ -121,6 +147,7 @@ steps:
             using Pkg
 
             println("--- :julia: Instantiating project")
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
                 Pkg.activate(joinpath(pwd(), "lib", lowercase("{{matrix.package}}")))
                 try
@@ -157,6 +184,7 @@ steps:
           julia --project -e '
             using Pkg
 
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             println("--- :julia: Instantiating project")
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
                 Pkg.instantiate()
@@ -248,11 +276,15 @@ steps:
               run_tests: false
         command: |
           julia --project -e '
+            using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             using CUDA
             @assert !CUDA.functional()
             @assert !isdefined(CUDA, :libcudart)
             CUDA.set_runtime_version!(v"11.6")'
           julia --project -e '
+            using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             using CUDA
             @assert !CUDA.functional()
             @assert isdefined(CUDA, :libcudart)'
@@ -407,6 +439,7 @@ steps:
           julia --project -e '
             using Pkg
 
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             println("--- :julia: Instantiating project")
             Pkg.resolve()
             Pkg.instantiate()
@@ -441,6 +474,7 @@ steps:
         command: |
           julia --project -e '
             using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
 
             println("--- :julia: Instantiating project")
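The new "Memory" group runs the core test suite twice, with CuArray's default memory set to "unified" and "host" through a "default_memory" preference in LocalPreferences.toml; the repeated Pkg.add calls pin CI to the leios/GPUArrays.jl fork's "yoyoyo_rebase_time" branch for the duration of the transition. As a minimal sketch, the same preference can be set locally with Preferences.jl (assuming CUDA.jl reads "default_memory", as the echoed TOML suggests; CUDA must be a dependency of the active project):

    using Preferences, UUIDs

    # CUDA.jl's registered package UUID, so the preference can be
    # written without loading the package itself.
    cuda = UUID("052768ef-5323-5732-b1bb-66c8b64840ba")

    # Equivalent to: echo -e "[CUDA]\ndefault_memory = \"unified\"" >LocalPreferences.toml
    set_preferences!(cuda, "default_memory" => "unified"; force=true)
    # Julia must be restarted before CUDA.jl picks up the new value.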
src/CUDA.jl (15 changes: 10 additions & 5 deletions)
@@ -43,6 +43,8 @@ using Libdl
 
 import NVTX
 
+import KernelAbstractions as KA
+
 using Printf
 
 
@@ -83,6 +85,11 @@ include("compiler/execution.jl")
 include("compiler/exceptions.jl")
 include("compiler/reflection.jl")
 
+# KernelAbstractions
+include("CUDAKernels.jl")
+import .CUDAKernels: CUDABackend, KA.launch_config
+export CUDABackend
+
 # array implementation
 include("gpuarrays.jl")
 include("utilities.jl")
@@ -111,6 +118,9 @@ export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND
 const has_cusolvermg = CUSOLVER.has_cusolvermg
 export has_cusolvermg
 
+# KA Backend Definition
+KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
+
 # random depends on CURAND
 include("random.jl")
 
@@ -119,11 +129,6 @@ include("../lib/nvml/NVML.jl")
 const has_nvml = NVML.has_nvml
 export NVML, has_nvml
 
-# KernelAbstractions
-include("CUDAKernels.jl")
-import .CUDAKernels: CUDABackend
-export CUDABackend
-
 # StaticArrays is still a direct dependency, so directly include the extension
 include("../ext/StaticArraysExt.jl")
 
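CUDAKernels.jl now loads earlier in the bootstrap, before gpuarrays.jl, which starts depending on KA.launch_config. The user-facing entry points are unchanged; a short sketch of standard KernelAbstractions usage against the CUDABackend exported here (the scale! kernel is hypothetical):

    using CUDA, KernelAbstractions

    # Any KA kernel can be instantiated against CUDABackend.
    @kernel function scale!(a, s)
        i = @index(Global)
        @inbounds a[i] *= s
    end

    a = CUDA.ones(1024)
    backend = CUDABackend()
    scale!(backend, 256)(a, 2f0; ndrange=length(a))  # workgroup size 256
    KernelAbstractions.synchronize(backend)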
src/CUDAKernels.jl (1 change: 0 additions & 1 deletion)
@@ -25,7 +25,6 @@ KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims)
 KA.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims)
 
 KA.get_backend(::CuArray) = CUDABackend()
-KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
 KA.synchronize(::CUDABackend) = synchronize()
 
 Adapt.adapt_storage(::CUDABackend, a::Array) = Adapt.adapt(CuArray, a)
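The sparse get_backend method moves out of CUDAKernels.jl, presumably because CUSPARSE is not yet defined at the file's new, earlier include point; the method is re-added in src/CUDA.jl after the library submodules load. A quick check of the resulting dispatch (a sketch; the dense-to-CSR conversion is standard CUSPARSE usage):

    using CUDA, CUDA.CUSPARSE, KernelAbstractions

    A = CUDA.rand(4, 4)
    S = CuSparseMatrixCSR(A)  # sparse copy of the same data
    @assert KernelAbstractions.get_backend(A) isa CUDABackend
    @assert KernelAbstractions.get_backend(S) isa CUDABackend  # method now in src/CUDA.jl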
src/gpuarrays.jl (49 changes: 6 additions & 43 deletions)
@@ -1,20 +1,19 @@
 # GPUArrays.jl interface
 
 
 #
 # Device functionality
 #
 
 
 ## execution
 
-struct CuArrayBackend <: AbstractGPUBackend end
-
-struct CuKernelContext <: AbstractKernelContext end
-
-@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N};
-                                            elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @cuda launch=false f(CuKernelContext(), args...)
+@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N};
+                                            elements::Int, elements_per_thread::Int) where {O,N}
+    ndrange = ceil(Int, elements / elements_per_thread)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing)
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+    kernel = @cuda launch=false obj.f(ctx, args...)
 
     # launching many large blocks) lowers performance, as observed with broadcast, so cap
     # the block size if we don't have a grid-stride kernel (which would keep the grid small)
@@ -24,39 +23,3 @@ struct CuKernelContext <: AbstractKernelContext end
         launch_configuration(kernel.fun; max_threads=256)
     end
 end
-
-@inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int,
-                                    blocks::Int; name::Union{String,Nothing}) where {F,TT}
-    @cuda threads=threads blocks=blocks name=name f(CuKernelContext(), args...)
-end
-
-
-## on-device
-
-# indexing
-
-GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x
-GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x
-GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x
-GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x
-
-# memory
-
-@inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
-                                       ) where {T, dims, id}
-    ptr = CUDA._shmem(Val(id), T, Val(prod(dims)))
-    ptr = reinterpret(LLVMPtr{T, AS.Shared}, ptr)
-    CuDeviceArray{T,length(dims),AS.Shared}(ptr, dims)
-end
-
-# synchronization
-
-@inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads()
-
-
-
-#
-# Host abstractions
-#
-
-GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend()
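The rewritten launch_heuristic derives its launch bounds from a KernelAbstractions kernel object (obj) instead of the removed CuKernelContext machinery: it asks KA for the iteration space and context, compiles with @cuda launch=false, and then consults the occupancy API. That last step is the established CUDA.jl pattern, shown standalone below (a sketch; f, args, and N are placeholders):

    using CUDA

    function launch_with_occupancy(f, args...; N::Int)
        kernel = @cuda launch=false f(args...)
        # Let the driver suggest a block size, capped as in launch_heuristic.
        config = launch_configuration(kernel.fun; max_threads=256)
        threads = min(N, config.threads)
        blocks = cld(N, threads)
        kernel(args...; threads, blocks)
    end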