-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Miscompilation with nested loops and control flow #672
Comments
Some more simplification:
|
And again: using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
workitems = CartesianIndices((10, 1, 1))
blocks = CartesianIndices((1,1,1))
bI = AMDGPU.blockIdx().x
tI = AMDGPU.threadIdx().x
bI2 = @inbounds blocks[bI]
wI2 = @inbounds workitems[tI]
@inbounds begin
stride = size(workitems, 3)
gidx = bI2.I[3]
i3 = (gidx - 1) * stride + wI2.I[3]
end
if tI <= 10
wI2 = @inbounds workitems[tI]
sum = zero(eltype(tensor))
for _ = 1:Nx + 2
for _ = -Ny:Ny
sum += 1.0
end
end
@inbounds tensor[tI, 1, i3] = sum
end
return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
println("ka_direct:", tensor) |
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
workitems = CartesianIndices((10, 1, 1))
tI = AMDGPU.threadIdx().x
wI2 = @inbounds workitems[tI]
@inbounds begin
i3 = wI2.I[3]
end
if tI <= 10
wI2 = @inbounds workitems[tI]
sum = zero(eltype(tensor))
for _ = -Nx:Nx
for _ = -Ny:Ny
sum += 1.0
end
end
@inbounds tensor[tI, 1, i3] = sum
end
return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
AMDGPU.@device_code dir="amd_dump" begin
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
end |
The LLVM IR is becoming manageable: ; @ /home/vchuravy/src/KernelAbstractions/issue517/repr.jl:6 within `gpu_kernel_xx!`
define amdgpu_kernel void @_Z14gpu_kernel_xx_14ROCDeviceArrayI7Float64Ll3ELl1EE5Int64S1_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [3 x i64], i8 addrspace(1)*, i64 } %0, i64 signext %1, i64 signext %2) local_unnamed_addr #2 !dbg !43 {
conversion:
%.fca.0.0.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 0, 0
%.fca.0.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 0, 1
%.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 1
%3 = call i32 @llvm.amdgcn.workitem.id.x(), !dbg !47, !range !62
%.lhs.trunc = trunc i32 %3 to i16, !dbg !63
%4 = udiv i16 %.lhs.trunc, 10, !dbg !63
%5 = icmp ugt i32 %3, 9, !dbg !83
br i1 %5, label %L296, label %pass11, !dbg !89
L179: ; preds = %L179.preheader, %L211
%value_phi15 = phi i64 [ %8, %L211 ], [ %18, %L179.preheader ]
%value_phi16 = phi double [ %value_phi24, %L211 ], [ 0.000000e+00, %L179.preheader ]
br i1 %.not50.not, label %L211, label %L198, !dbg !90
L198: ; preds = %L198, %L179
%value_phi20 = phi double [ %6, %L198 ], [ %value_phi16, %L179 ]
%value_phi21 = phi i64 [ %7, %L198 ], [ %20, %L179 ]
%6 = fadd double %value_phi20, 1.000000e+00, !dbg !91
%.not51 = icmp eq i64 %value_phi21, %value_phi17, !dbg !95
%7 = add i64 %value_phi21, 1, !dbg !97
br i1 %.not51, label %L211, label %L198, !dbg !100
L211: ; preds = %L198, %L179
%value_phi24 = phi double [ %value_phi16, %L179 ], [ %6, %L198 ]
%.not52 = icmp eq i64 %value_phi15, %value_phi, !dbg !101
%8 = add i64 %value_phi15, 1, !dbg !102
br i1 %.not52, label %L222, label %L179, !dbg !103
L222: ; preds = %pass11, %L211
%value_phi27 = phi double [ 0.000000e+00, %pass11 ], [ %value_phi24, %L211 ]
%9 = call i64 @llvm.smax.i64(i64 %.fca.0.0.extract, i64 0), !dbg !104
%10 = call i64 @llvm.smax.i64(i64 %.fca.0.1.extract, i64 0), !dbg !104
%11 = mul i64 %9, %10, !dbg !128
%12 = zext i16 %4 to i64, !dbg !136
%13 = mul i64 %11, %12, !dbg !141
%14 = zext i32 %3 to i64, !dbg !142
%15 = add i64 %13, %14, !dbg !144
%16 = bitcast i8 addrspace(1)* %.fca.1.extract to double addrspace(1)*, !dbg !145
%17 = getelementptr inbounds double, double addrspace(1)* %16, i64 %15, !dbg !145
store double %value_phi27, double addrspace(1)* %17, align 8, !dbg !145, !tbaa !157
br label %L296, !dbg !160
L296: ; preds = %L222, %conversion
ret void, !dbg !161
pass11: ; preds = %conversion
%18 = sub i64 0, %1, !dbg !162
%.not = icmp sgt i64 %18, %1, !dbg !164
%19 = sext i1 %.not to i64, !dbg !168
%value_phi = xor i64 %19, %1, !dbg !168
%.not48.not = icmp slt i64 %value_phi, %18, !dbg !174
br i1 %.not48.not, label %L222, label %L179.preheader, !dbg !163
L179.preheader: ; preds = %pass11
%20 = sub i64 0, %2
%.not49 = icmp sgt i64 %20, %2
%21 = sext i1 %.not49 to i64
%value_phi17 = xor i64 %21, %2
%.not50.not = icmp slt i64 %value_phi17, %20
br label %L179, !dbg !90
} |
using AMDGPU
function gpu_kernel_xx!(x, Nx::Int64, Ny::Int64)
i = AMDGPU.threadIdx().x
i ≤ 10 || return
workitems = CartesianIndices((10, 1))
@inbounds workitems[i] # accessing `workitems` triggers miscompilation
y = 0f0
for _ in -Nx:Nx
for _ in -Ny:Ny
y += 1f0
end
end
@inbounds x[i] = y
return
end
Nx, Ny = 1, 1
x = AMDGPU.zeros(Float32, 10)
@roc groupsize=length(x) gridsize=1 gpu_kernel_xx!(x, Nx, Ny)
println("ka_direct:", x)
# expected answer [9.0, 9.0, 9.0, ...]
AMDGPU.@device_code dir="devcode" @roc launch=false gpu_kernel_xx!(x, Nx, Ny) define amdgpu_kernel void @_Z14gpu_kernel_xx_14ROCDeviceArrayI7Float32Ll1ELl1EE5Int64S1_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x i64], i8 addrspace(1)*, i64 } %0, i64 signext %1, i64 signext %2) local_unnamed_addr #1 {
conversion:
%.fca.1.extract = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %0, 1
%3 = call i32 @llvm.amdgcn.workitem.id.x(), !range !7
%4 = icmp ugt i32 %3, 9
br i1 %4, label %common.ret, label %pass
L86: ; preds = %L86.preheader, %L118
%value_phi3 = phi i64 [ %7, %L118 ], [ %11, %L86.preheader ]
%value_phi4 = phi float [ %value_phi12, %L118 ], [ 0.000000e+00, %L86.preheader ]
br i1 %.not3.not, label %L118, label %L105
L105: ; preds = %L105, %L86
%value_phi8 = phi float [ %5, %L105 ], [ %value_phi4, %L86 ]
%value_phi9 = phi i64 [ %6, %L105 ], [ %13, %L86 ]
%5 = fadd float %value_phi8, 1.000000e+00
%.not4 = icmp eq i64 %value_phi9, %value_phi5
%6 = add i64 %value_phi9, 1
br i1 %.not4, label %L118, label %L105
L118: ; preds = %L105, %L86
%value_phi12 = phi float [ %value_phi4, %L86 ], [ %5, %L105 ]
%.not5 = icmp eq i64 %value_phi3, %value_phi
%7 = add i64 %value_phi3, 1
br i1 %.not5, label %L150, label %L86
L150: ; preds = %pass, %L118
%value_phi15 = phi float [ 0.000000e+00, %pass ], [ %value_phi12, %L118 ]
%8 = bitcast i8 addrspace(1)* %.fca.1.extract to float addrspace(1)*
%9 = zext i32 %3 to i64
%10 = getelementptr inbounds float, float addrspace(1)* %8, i64 %9
store float %value_phi15, float addrspace(1)* %10, align 4, !tbaa !8
br label %common.ret
common.ret: ; preds = %L150, %conversion
ret void
pass: ; preds = %conversion
%11 = sub i64 0, %1
%.not = icmp sgt i64 %11, %1
%12 = sext i1 %.not to i64
%value_phi = xor i64 %12, %1
%.not1.not = icmp slt i64 %value_phi, %11
br i1 %.not1.not, label %L150, label %L86.preheader
L86.preheader: ; preds = %pass
%13 = sub i64 0, %2
%.not2 = icmp sgt i64 %13, %2
%14 = sext i1 %.not2 to i64
%value_phi5 = xor i64 %14, %2
%.not3.not = icmp slt i64 %value_phi5, %13
br label %L86
} |
@vchuravy so this is a bug with ROCm LLVM? |
Yeah pretty much as far as I can tell this is very cursed.
Yeah that's what I figured out as well, funnily the code is gone in the optimized IR and the code is basically identically Identical LLVM modules lead to a very subtle codegen difference... https://godbolt.org/z/1z3b3fEdn |
My reproducer ended up looking like:
|
@pxl-th do you have time to bisect this? Otherwise I can try and take a look. |
If you have some infrastructure setup for this... |
There is the |
No worries, I will bisect it. |
After removing all the KernelAbstractions parts from JuliaGPU/KernelAbstractions.jl#517
The
assume
is there to remove an exceptional branchThe text was updated successfully, but these errors were encountered: