From 8b95ce012e07ce076ceb0a0e569aa1c45f223243 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 28 Nov 2023 18:04:39 +0200 Subject: [PATCH 1/2] Check context is valid before freeing streams, arrays. --- src/array.jl | 6 +++-- src/hip/HIP.jl | 10 +++----- src/hip/stream.jl | 15 ++++++++--- src/runtime/Runtime.jl | 2 +- src/runtime/memory/hip.jl | 32 ++++++++++++++++-------- src/tls.jl | 18 +++++++------ src/utils.jl | 14 +++++------ test/device/output.jl | 29 +++++++++++---------- test/device_tests.jl | 4 ++- test/{hip_tests.jl => hip_core_tests.jl} | 31 ++++++++--------------- test/hip_extra_tests.jl | 24 ++++++++++++++++++ test/runtests.jl | 25 ++++++++++-------- 12 files changed, 124 insertions(+), 86 deletions(-) rename test/{hip_tests.jl => hip_core_tests.jl} (73%) create mode 100644 test/hip_extra_tests.jl diff --git a/src/array.jl b/src/array.jl index 901aabbb..55976077 100644 --- a/src/array.jl +++ b/src/array.jl @@ -25,8 +25,10 @@ end # Passed to `DataRef` to handle freeing. function _free_buf(buf, stream_ordered::Bool) - s = stream_ordered ? AMDGPU.stream() : AMDGPU.default_stream() - Mem.free(buf; stream=s) + context!(buf.ctx) do + s = stream_ordered ? AMDGPU.stream() : AMDGPU.default_stream() + Mem.free(buf; stream=s) + end end unsafe_free!(x::ROCArray) = GPUArrays.unsafe_free!(x.buf, true) diff --git a/src/hip/HIP.jl b/src/hip/HIP.jl index 86e9bad4..2529bc3e 100644 --- a/src/hip/HIP.jl +++ b/src/hip/HIP.jl @@ -23,6 +23,7 @@ end mutable struct HIPContext context::hipContext_t + valid::Bool end const CONTEXTS = AMDGPU.LockedObject(Dict{HIPDevice,HIPContext}()) @@ -31,10 +32,11 @@ function HIPContext(device::HIPDevice) get!(contexts, device) do context_ref = Ref{hipContext_t}() hipCtxCreate(context_ref, Cuint(0), device.device) |> check - context = HIPContext(context_ref[]) + context = HIPContext(context_ref[], true) device!(device) finalizer(context) do c + c.valid = false hipCtxDestroy(c.context) |> check end return context @@ -44,11 +46,7 @@ end HIPContext(device_id::Integer) = HIPContext(HIPDevice(device_id)) -function HIPContext() - context_ref = Ref{hipContext_t}() - hipCtxGetCurrent(context_ref) |> check - HIPContext(context_ref[]) -end +HIPContext() = HIPContext(device()) Base.unsafe_convert(::Type{Ptr{T}}, context::HIPContext) where T = reinterpret(Ptr{T}, context.context) diff --git a/src/hip/stream.jl b/src/hip/stream.jl index ed23dd45..ed91d92c 100644 --- a/src/hip/stream.jl +++ b/src/hip/stream.jl @@ -5,6 +5,7 @@ mutable struct HIPStream stream::hipStream_t priority::Symbol device::HIPDevice + ctx::HIPContext end """ @@ -22,14 +23,17 @@ function HIPStream(priority::Symbol = :normal) stream_ref = Ref{hipStream_t}() hipStreamCreateWithPriority(stream_ref, Cuint(0), priority_int) |> check - stream = HIPStream(stream_ref[], priority, device()) + d = device() + stream = HIPStream(stream_ref[], priority, d, HIPContext(d)) finalizer(stream) do s - hipStreamDestroy(s.stream) |> check + AMDGPU.context!(s.ctx) do + hipStreamDestroy(s.stream) |> check + end end return stream end -default_stream() = HIPStream(convert(hipStream_t, C_NULL), :normal, device()) +default_stream() = HIPStream(C_NULL, :normal, device(), HIPContext()) """ HIPStream(stream::hipStream_t) @@ -37,7 +41,10 @@ default_stream() = HIPStream(convert(hipStream_t, C_NULL), :normal, device()) Create HIPStream from `hipStream_t` handle. Device is the default device that's currently in use. """ -HIPStream(stream::hipStream_t) = HIPStream(stream, priority(stream), device()) +function HIPStream(stream::hipStream_t) + d = device() + HIPStream(stream, priority(stream), d, HIPContext(d)) +end function isdone(stream::HIPStream) query = hipStreamQuery(stream) diff --git a/src/runtime/Runtime.jl b/src/runtime/Runtime.jl index 0f1ef1bf..4884e6f5 100644 --- a/src/runtime/Runtime.jl +++ b/src/runtime/Runtime.jl @@ -25,7 +25,7 @@ module Mem import AMDGPU import AMDGPU: HIP, HSA, Runtime - import .HIP: HIPDevice + import .HIP: HIPDevice, HIPContext import .Runtime: ROCDim, ROCDim3 abstract type AbstractAMDBuffer end diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl index b1bc0af9..4a9c2920 100644 --- a/src/runtime/memory/hip.jl +++ b/src/runtime/memory/hip.jl @@ -60,7 +60,8 @@ function mark_pool!(dev::HIP.HIPDevice) end struct HIPBuffer <: AbstractAMDBuffer - device::HIPDevice + device::HIPDevice # TODO not used? + ctx::HIPContext ptr::Ptr{Cvoid} bytesize::Int own::Bool @@ -68,9 +69,10 @@ end function HIPBuffer(bytesize; stream::HIP.HIPStream) dev = stream.device - bytesize == 0 && return HIPBuffer(dev, C_NULL, 0, true) + ctx = stream.ctx + bytesize == 0 && return HIPBuffer(dev, ctx, C_NULL, 0, true) - mark_pool!(dev) + # mark_pool!(dev) pool = HIP.memory_pool(dev) has_limit = hard_memory_limit() != typemax(UInt64) @@ -106,18 +108,20 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream) @assert HIP.reserved_memory(pool) ≤ hard_memory_limit() end - HIPBuffer(dev, ptr, bytesize, true) + HIPBuffer(dev, ctx, ptr, bytesize, true) end -HIPBuffer(ptr::Ptr{Cvoid}, bytesize::Int) = - HIPBuffer(AMDGPU.device(), ptr, bytesize, false) +function HIPBuffer(ptr::Ptr{Cvoid}, bytesize::Int) + s = AMDGPU.stream() + HIPBuffer(s.device, s.ctx, ptr, bytesize, false) +end Base.unsafe_convert(::Type{Ptr{T}}, buf::HIPBuffer) where T = convert(Ptr{T}, buf.ptr) function view(buf::HIPBuffer, bytesize::Int) bytesize > buf.bytesize && throw(BoundsError(buf, bytesize)) - HIPBuffer(buf.device, buf.ptr + bytesize, buf.bytesize - bytesize, buf.own) + HIPBuffer(buf.device, buf.ctx, buf.ptr + bytesize, buf.bytesize - bytesize, buf.own) end function free(buf::HIPBuffer; stream::HIP.HIPStream) @@ -152,13 +156,17 @@ end struct HostBuffer <: AbstractAMDBuffer device::HIPDevice + ctx::HIPContext ptr::Ptr{Cvoid} dev_ptr::Ptr{Cvoid} bytesize::Int own::Bool end -HostBuffer() = HostBuffer(AMDGPU.device(), C_NULL, C_NULL, 0, true) +function HostBuffer() + s = AMDGPU.stream() + HostBuffer(s.device, s.ctx, C_NULL, C_NULL, 0, true) +end function HostBuffer(bytesize::Integer, flags = 0) bytesize == 0 && return HostBuffer() @@ -167,19 +175,21 @@ function HostBuffer(bytesize::Integer, flags = 0) HIP.hipHostMalloc(ptr_ref, bytesize, flags) |> HIP.check ptr = ptr_ref[] dev_ptr = get_device_ptr(ptr) - HostBuffer(AMDGPU.device(), ptr, dev_ptr, bytesize, true) + s = AMDGPU.stream() + HostBuffer(s.device, s.ctx, ptr, dev_ptr, bytesize, true) end function HostBuffer(ptr::Ptr{Cvoid}, sz::Integer) HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) |> HIP.check dev_ptr = get_device_ptr(ptr) - HostBuffer(AMDGPU.device(), ptr, dev_ptr, sz, false) + s = AMDGPU.stream() + HostBuffer(s.device, s.ctx, ptr, dev_ptr, sz, false) end function view(buf::HostBuffer, bytesize::Int) bytesize > buf.bytesize && throw(BoundsError(buf, bytesize)) HostBuffer( - buf.device, + buf.device, buf.ctx, buf.ptr + bytesize, buf.dev_ptr + bytesize, buf.bytesize - bytesize, buf.own) end diff --git a/src/tls.jl b/src/tls.jl index d0a22045..b169a37f 100644 --- a/src/tls.jl +++ b/src/tls.jl @@ -114,9 +114,9 @@ function context!(ctx::HIPContext) HIP.context!(ctx) task_local_state!(HIP.device(), ctx) else - old_ctx = state.ctx + old_ctx = state.context if old_ctx != ctx - HIP.context!(state.ctx) + HIP.context!(state.context) state.device = HIP.device() state.context = ctx end @@ -125,11 +125,15 @@ function context!(ctx::HIPContext) end function context!(f::Function, ctx::HIPContext) - old_ctx = context!(ctx) - return try - f() - finally - old_ctx ≢ nothing && old_ctx != ctx && context!(old_ctx) + if ctx.valid + old_ctx = context!(ctx) + return try + f() + finally + old_ctx ≢ nothing && old_ctx != ctx && context!(old_ctx) + end + else + @warn "CTX not valid" end end diff --git a/src/utils.jl b/src/utils.jl index 3394c6eb..1ad3decd 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -21,31 +21,31 @@ function versioninfo(io::IO=stdout) end println(_lib_title("rocBLAS", :rocblas; version_fn=rocBLAS.version)) if functional(:rocblas) - println(" @ $(Libdl.dlpath(librocblas))") + println(" @ $librocblas") end println(_lib_title("rocSOLVER", :rocsolver; version_fn=rocSOLVER.version)) if functional(:rocsolver) - println(" @ $(Libdl.dlpath(librocsolver))") + println(" @ $librocsolver") end println("[$(_status(functional(:rocalution)))] rocALUTION") if functional(:rocalution) - println(" @ $(Libdl.dlpath(librocalution))") + println(" @ $librocalution") end println("[$(_status(functional(:rocsparse)))] rocSPARSE") if functional(:rocsparse) - println(" @ $(Libdl.dlpath(librocsparse))") + println(" @ $librocsparse") end println(_lib_title("rocRAND", :rocrand; version_fn=rocRAND.version)) if functional(:rocrand) - println(" @ $(Libdl.dlpath(librocrand))") + println(" @ $librocrand") end println(_lib_title("rocFFT", :rocfft; version_fn=rocFFT.version)) if functional(:rocfft) - println(" @ $(Libdl.dlpath(librocfft))") + println(" @ $librocfft") end println(_lib_title("MIOpen", :MIOpen; version_fn=MIOpen.version)) if functional(:MIOpen) - println(" @ $(Libdl.dlpath(libMIOpen_path))") + println(" @ $libMIOpen_path") end if functional(:hip) diff --git a/test/device/output.jl b/test/device/output.jl index 49040c9a..8886ad52 100644 --- a/test/device/output.jl +++ b/test/device/output.jl @@ -32,21 +32,20 @@ @test msg == "Hello World!Goodbye World!\n" end - #= TODO - @testset "Interpolated string" begin - inner_str = "to the" - function kernel(oc) - @rocprintln oc "Hello $inner_str World!" - nothing - end - - iob = IOBuffer() - oc = OutputContext(iob) - @roc kernel(oc) - sleep(1) - @test String(take!(iob)) == "Hello to the World!\n" - end - =# + # TODO + # @testset "Interpolated string" begin + # inner_str = "to the" + # function kernel(oc) + # @rocprintln oc "Hello $inner_str World!" + # nothing + # end + + # iob = IOBuffer() + # oc = OutputContext(iob) + # @roc kernel(oc) + # sleep(1) + # @test String(take!(iob)) == "Hello to the World!\n" + # end end @testset "@rocprintf" begin diff --git a/test/device_tests.jl b/test/device_tests.jl index 7c4279be..11bb5b2d 100644 --- a/test/device_tests.jl +++ b/test/device_tests.jl @@ -11,10 +11,12 @@ include("device/array.jl") include("device/vadd.jl") include("device/memory.jl") include("device/indexing.jl") -include("device/math.jl") include("device/wavefront.jl") include("device/synchronization.jl") include("device/execution_control.jl") include("device/exceptions.jl") +# TODO https://github.com/JuliaGPU/AMDGPU.jl/issues/546 +include("device/math.jl") + end diff --git a/test/hip_tests.jl b/test/hip_core_tests.jl similarity index 73% rename from test/hip_tests.jl rename to test/hip_core_tests.jl index 36862f63..1e41243f 100644 --- a/test/hip_tests.jl +++ b/test/hip_core_tests.jl @@ -1,4 +1,4 @@ -@testitem "hip" begin +@testitem "hip - core" begin using Test using LinearAlgebra @@ -10,26 +10,6 @@ import AMDGPU: @allowscalar Random.seed!(1) AMDGPU.allowscalar(false) -if AMDGPU.functional(:rocblas) - include("rocarray/blas.jl") -end -if AMDGPU.functional(:MIOpen) - include("dnn/miopen.jl") -end -if AMDGPU.functional(:rocsolver) - include("rocarray/solver.jl") -end -if AMDGPU.functional(:rocsparse) - include("rocsparse/rocsparse.jl") -end -if AMDGPU.functional(:rocrand) - include("rocarray/random.jl") -end -# TODO rocFFT tests crash Windows due to access violation -if Sys.islinux() && AMDGPU.functional(:rocfft) - include("rocarray/fft.jl") -end - @testset "AMDGPU.@elapsed" begin xgpu = AMDGPU.rand(Float32, 100) t = AMDGPU.@elapsed xgpu .+= 1 @@ -52,5 +32,14 @@ if length(AMDGPU.devices()) > 1 end end +if AMDGPU.functional(:rocblas) + include("rocarray/blas.jl") +end +if AMDGPU.functional(:MIOpen) + include("dnn/miopen.jl") +end +if AMDGPU.functional(:rocrand) + include("rocarray/random.jl") +end end diff --git a/test/hip_extra_tests.jl b/test/hip_extra_tests.jl new file mode 100644 index 00000000..221669ad --- /dev/null +++ b/test/hip_extra_tests.jl @@ -0,0 +1,24 @@ +@testitem "hip - extra" begin + +using Test +using LinearAlgebra +using Random + +using AMDGPU: HIP, Runtime, Device, Mem +import AMDGPU: @allowscalar + +Random.seed!(1) +AMDGPU.allowscalar(false) + +if AMDGPU.functional(:rocsolver) + include("rocarray/solver.jl") +end +if AMDGPU.functional(:rocsparse) + include("rocsparse/rocsparse.jl") +end +# TODO rocFFT tests crash Windows due to access violation +if Sys.islinux() && AMDGPU.functional(:rocfft) + include("rocarray/fft.jl") +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index 6220615d..dc742522 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -93,27 +93,30 @@ const TARGET_TESTS = isempty(ARGS) ? TEST_NAMES : ARGS # Run tests in parallel. np = set_jobs ? jobs : (Sys.CPU_THREADS ÷ 2) -# Limit to 4 workers, otherwise unfortunate things happen (fences timeout). -np = clamp(np, 1, 4) +# Limit to 2 workers, otherwise unfortunate things happen. +np = clamp(np, 1, 2) @info "Running tests with $np workers." @info "Testing using device $(AMDGPU.device())." InteractiveUtils.versioninfo() AMDGPU.versioninfo() +CI = parse(Bool, get(ENV, "CI", "false")) +runtests(AMDGPU; nworkers=np, nworker_threads=1, testitem_timeout=60 * 30) do ti + for tt in TARGET_TESTS + startswith(ti.name, tt) && return true + end + return false +end + if "core" in TARGET_TESTS && Sys.islinux() @info "Testing `Hostcalls` on the main thread." @testset "Hostcalls" begin include("device/hostcall.jl") - include("device/output.jl") - end -end - -CI = parse(Bool, get(ENV, "CI", "false")) -runtests(AMDGPU; nworkers=np, nworker_threads=4, testitem_timeout=60 * 30) do ti - for tt in TARGET_TESTS - startswith(ti.name, tt) && return true + # TODO 1.11 fails + if VERSION < v"1.11-" + include("device/output.jl") + end end - return false end From bd050c2c8c63c247d02352a18560f4b03a72eea0 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 28 Nov 2023 19:02:36 +0200 Subject: [PATCH 2/2] Add TODO --- src/fft/fft.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fft/fft.jl b/src/fft/fft.jl index 9ae971d9..13d4dc13 100644 --- a/src/fft/fft.jl +++ b/src/fft/fft.jl @@ -208,6 +208,7 @@ function unsafe_execute!( plan::cROCFFTPlan{T,K,false,N}, X::ROCArray{T,N}, Y::ROCArray{T}, ) where {T,N,K} X = copy(X) # since input array can also be modified + # TODO on 1.11 we need to manually cast `pointer(X)` to `Ptr{Cvoid}`. rocfft_execute(plan, [pointer(X),], [pointer(Y),], plan.execution_info) end