You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hoping somebody who understands HIP/ROCm better than I do can help me understand what's going on here.
Using the version you get when you use "add AMDGPU", I get a core dump instantly.
By going into src/discovery/discovery.jl and moving
global libMIOpen_path = get_library(lib_prefix * "MIOpen"; rocm_path)
up to the top (it needs to come before libhsa gets loaded; placing it even one line lower brings the core dumps back):
try
global libMIOpen_path = get_library(lib_prefix * "MIOpen"; rocm_path)
global libhsaruntime = if Sys.islinux()
get_library("libhsa-runtime64"; rocm_path, ext="so.1")
else
""
end
# Linker.
lld_path = get_ld_lld(rocm_path; from_artifact=false,
artifact_library=:LLD_jll, artifact_field=:lld_path)
lld_artifact = false
if isempty(lld_path)
lld_path = get_ld_lld(rocm_path; from_artifact=true,
artifact_library=:LLD_jll, artifact_field=:lld_path)
lld_artifact = true
end
global lld_path = lld_path
global lld_artifact = lld_artifact
# HIP.
global libhip = get_library(
Sys.islinux() ? "libamdhip64" : "amdhip64"; rocm_path)
# Check if opaque pointers are enabled and turn off artifacts.
llvm_args = get(ENV, "JULIA_LLVM_ARGS", "")
enabled_opaque_pointers = occursin("-opaque-pointers", llvm_args)
from_artifact = (
# Detect HIP version, which will influence what device libraries to use.
(isempty(libhip) || Base.thisminor(_hip_runtime_version()) > v"5.4")
&& !enabled_opaque_pointers)
# If ROCm 5.5+ - use artifact device libraries.
global libdevice_libs = get_device_libs(from_artifact; rocm_path)
# HIP-based libraries.
global librocblas = get_library(lib_prefix * "rocblas"; rocm_path)
global librocsparse = get_library(lib_prefix * "rocsparse"; rocm_path)
global librocsolver = get_library(lib_prefix * "rocsolver"; rocm_path)
global librocrand = get_library(lib_prefix * "rocrand"; rocm_path)
global librocfft = get_library(lib_prefix * "rocfft"; rocm_path)
catch err
@error """ROCm discovery failed!
Discovered ROCm path: $rocm_path.
Use `ROCM_PATH` env variable to specify ROCm directory.
""" exception=(err, catch_backtrace())
end
the core dumps stop and everything seems to work normally.
Anybody have any ideas? Thanks for any help.
commit:
commit 4385ed941d5bf1b4818d8cd34ca090b8827d1ca4 (HEAD -> master, tag: v1.1.0, origin/master, origin/HEAD)
Author: Anton Smirnov <[email protected]>
Date: Sat Nov 9 14:50:57 2024 +0200
Bump to 1.1.0
julia> using AMDGPU
[Detaching after vfork from child process 19143]
[New Thread 0x7fffa34006c0 (LWP 19144)]
[New Thread 0x7ffea28006c0 (LWP 19145)]
[Thread 0x7ffea28006c0 (LWP 19145) exited]
julia: /usr/src/debug/hip-runtime/clr-rocm-6.2.2/hipamd/src/hip_code_object.cpp:1152: hip::FatBinaryInfo** hip::StatCO::addFatBinary(const void*, bool): Assertion `err == hipSuccess' failed.
Thread 1 "julia" received signal SIGABRT, Aborted.
0x00007ffff7e383f4 in ?? () from /usr/lib/libc.so.6
(gdb) bt
#0 0x00007ffff7e383f4 in ?? () from /usr/lib/libc.so.6
#1 0x00007ffff7ddf120 in raise () from /usr/lib/libc.so.6
#2 0x00007ffff7dc64c3 in abort () from /usr/lib/libc.so.6
#3 0x00007ffff7dc63df in ?? () from /usr/lib/libc.so.6
#4 0x00007ffff7dd7177 in __assert_fail () from /usr/lib/libc.so.6
#5 0x00007fffae250955 in ?? () from /opt/rocm/lib/libamdhip64.so
#6 0x00007ffe8cd6b91d in ?? () from /opt/rocm/lib/libMIOpen.so
#7 0x00007ffff7fcb5b7 in ?? () from /lib64/ld-linux-x86-64.so.2
#8 0x00007ffff7fcb6ad in ?? () from /lib64/ld-linux-x86-64.so.2
#9 0x00007ffff7fc85c2 in _dl_catch_exception () from /lib64/ld-linux-x86-64.so.2
#10 0x00007ffff7fd24fc in ?? () from /lib64/ld-linux-x86-64.so.2
#11 0x00007ffff7fc8523 in _dl_catch_exception () from /lib64/ld-linux-x86-64.so.2
#12 0x00007ffff7fd2904 in ?? () from /lib64/ld-linux-x86-64.so.2
#13 0x00007ffff7e31f14 in ?? () from /usr/lib/libc.so.6
#14 0x00007ffff7fc8523 in _dl_catch_exception () from /lib64/ld-linux-x86-64.so.2
#15 0x00007ffff7fc8679 in ?? () from /lib64/ld-linux-x86-64.so.2
#16 0x00007ffff7e319f3 in ?? () from /usr/lib/libc.so.6
#17 0x00007ffff7e31fcf in dlopen () from /usr/lib/libc.so.6
#18 0x00007ffff6c723f7 in ijl_dlopen (filename=<optimized out>, flags=<optimized out>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/dlload.c:200
#19 0x00007ffff6c724f6 in ijl_load_dynamic_library (modname=0x7fffeb9b4298 "libMIOpen", flags=4, throw_err=0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/dlload.c:365
#20 0x00007fffe1251aaa in julia_#dlopen#3_49859 () at libdl.jl:120
#21 0x00007fffaf628cdb in dlopen () at libdl.jl:119
#22 julia_find_library_7656 () at libdl.jl:209
#23 0x00007fffaf612f1c in find_library () at libdl.jl:217
#24 find_library () at libdl.jl:217
--------------------------------------------------------------------------------
#25 japi1_find_rocm_library_5949 () at /home/neil/.julia/dev/AMDGPU/src/discovery/utils.jl:109
#26 0x00007fffaf626f57 in #get_library#3 () at /home/neil/.julia/dev/AMDGPU/src/discovery/discovery.jl:16
#27 get_library () at /home/neil/.julia/dev/AMDGPU/src/discovery/discovery.jl:15
#28 julia___init___6051 () at /home/neil/.julia/dev/AMDGPU/src/discovery/discovery.jl:111
#29 0x00007fffaf612859 in jfptr___init___6052 () from /home/neil/.julia/compiled/v1.11/AMDGPU/arpZD_5tx5N.so
#30 0x00007ffff6c8d132 in jl_apply (nargs=1, args=0x7fffffff9c08) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#31 jl_module_run_initializer (m=0x7fffaf68b560 <jl_system_image_data+333280>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:76
#32 0x00007fffe1927a49 in julia_run_module_init_69426 () at loading.jl:1336
#33 0x00007fffe1b4541b in julia_register_restored_modules_69409 () at loading.jl:1324
#34 0x00007fffe29eb390 in julia_#_include_from_serialized#1066_69268 () at loading.jl:1213
#35 0x00007fffe2300613 in _include_from_serialized () at loading.jl:1169
#36 _include_from_serialized () at loading.jl:1169
#37 julia_#_require_search_from_serialized#1077_69714 () at loading.jl:1985
#38 0x00007fffe213e38c in julia__require_search_from_serialized_44532 () at loading.jl:1908
#39 0x00007fffe2ab083c in jfptr.require_search_from_serialized_44533 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#40 0x00007fffe12b8b12 in julia__require_69981 () at loading.jl:2450
#41 0x00007fffe17ab2fa in julia___require_prelocked_69876 () at loading.jl:2315
#42 0x00007fffe1cf8933 in jfptr___require_prelocked_69877.1 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#43 0x00007ffff6c5f97a in jl_apply (nargs=3, args=0x7fffffffcde0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#44 jl_f__call_in_world (F=<optimized out>, args=0x7fffffffcdd8, nargs=4) at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:894
#45 0x00007fffe1ccd235 in #invoke_in_world#3 () at essentials.jl:1089
#46 invoke_in_world () at essentials.jl:1086
#47 julia__require_prelocked_69874 () at loading.jl:2302
#48 0x00007fffe21260e9 in macro expansion () at loading.jl:2241
#49 macro expansion () at lock.jl:273
#50 julia___require_69813 () at loading.jl:2198
#51 0x00007fffe26d29b3 in jfptr___require_69814.1 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#52 0x00007ffff6c5f97a in jl_apply (nargs=3, args=0x7fffffffd3d0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#53 jl_f__call_in_world (F=<optimized out>, args=0x7fffffffd3c8, nargs=4) at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:894
#54 0x00007fffe2925e2f in #invoke_in_world#3 () at essentials.jl:1089
#55 invoke_in_world () at essentials.jl:1086
#56 julia_require_69802 () at loading.jl:2191
#57 0x00007fffe1ff71c3 in jfptr_require_69803 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#58 0x00007ffff6c8cd54 in jl_apply (nargs=3, args=0x7fffffffd440) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#59 call_require (var=0x7fffef3a0c28, mod=0x7fffe5f9b870 <jl_system_image_data+50367280>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:486
#60 eval_import_path (where=where@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, from=from@entry=0x0, args=0x7fffed1ec8b0, name=name@entry=0x7fffffffd510, keyword=keyword@entry=0x7ffff6e29346 "using")
at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:523
#61 0x00007ffff6c8ed21 in jl_toplevel_eval_flex (m=m@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, e=<optimized out>, fast=fast@entry=1, expanded=expanded@entry=0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:759
#62 0x00007ffff6c8e92a in jl_toplevel_eval_flex (m=m@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, e=e@entry=0x7fffed1ee3f0, fast=fast@entry=1, expanded=expanded@entry=0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:886
#63 0x00007ffff6c8f40c in ijl_toplevel_eval (m=m@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, v=v@entry=0x7fffed1ee3f0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:952
#64 0x00007ffff6c8f996 in ijl_toplevel_eval_in (m=0x7fffe5f9b870 <jl_system_image_data+50367280>, ex=0x7fffed1ee3f0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:994
#65 0x00007fffd0e536d8 in eval () at boot.jl:430
#66 japi1_eval_user_input_9990 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:245
#67 0x00007fffd0e73b02 in julia_repl_backend_loop_10025 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:342
--Type <RET> for more, q to quit, c to continue without paging--c
#68 0x00007fffd0e6d26d in japi1_#start_repl_backend#59_10022 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:327
#69 0x00007fffd0ce871f in japi1_start_repl_backend_10632 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:324
#70 0x00007fffd0deb1ab in julia_#run_repl#72_10096 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:483
#71 0x00007fffd0d749ed in julia_run_repl_10087 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:469
#72 0x00007fffd0e1d063 in jfptr_run_repl_10088 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/compiled/v1.11/REPL/u0gqU_GYsA8.so
#73 0x00007fffd0cac40a in julia_#1139_14648 () at client.jl:446
#74 0x00007fffd0e6ead8 in jfptr_YY.1139_14649 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/compiled/v1.11/REPL/u0gqU_GYsA8.so
#75 0x00007ffff6c5f82a in jl_apply (nargs=2, args=0x7fffffffe580) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#76 jl_f__call_latest (F=<optimized out>, args=0x7fffffffe580, nargs=2) at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:875
#77 0x00007fffe1544387 in #invokelatest#2 () at essentials.jl:1055
#78 invokelatest () at essentials.jl:1052
#79 julia_run_main_repl_72104 () at client.jl:430
#80 0x00007fffe20498d5 in repl_main () at client.jl:567
#81 julia__start_72143 () at client.jl:541
#82 0x00007fffe1b194a4 in jfptr.start_72144 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#83 0x00007ffff6cc4ca6 in jl_apply (nargs=1, args=0x7fffffffe8e8) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#84 true_main (argc=<optimized out>, argv=<optimized out>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/jlapi.c:900
#85 0x00007ffff6cc573f in jl_repl_entrypoint (argc=<optimized out>, argv=0x7fffffffecc8) at /cache/build/builder-demeter6-6/julialang/julia-master/src/jlapi.c:1059
#86 0x0000000000401089 in main (argc=<optimized out>, argv=<optimized out>) at /cache/build/builder-demeter6-6/julialang/julia-master/cli/loader_exe.c:58
rocminfo:
=====================
HSA System Attributes
=====================
Runtime Version: 1.1
Runtime Ext Version: 1.6
System Timestamp Freq.: 1000.000000MHz
Sig. Max Wait Duration: 18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model: LARGE
System Endianness: LITTLE
Mwaitx: DISABLED
DMAbuf Support: YES
==========
HSA Agents
==========
*******
Agent 1
*******
Name: Intel(R) Core(TM) i3-6100 CPU @ 3.70GHz
Uuid: CPU-XX
Marketing Name: Intel(R) Core(TM) i3-6100 CPU @ 3.70GHz
Vendor Name: CPU
Feature: None specified
Profile: FULL_PROFILE
Float Round Mode: NEAR
Max Queue Number: 0(0x0)
Queue Min Size: 0(0x0)
Queue Max Size: 0(0x0)
Queue Type: MULTI
Node: 0
Device Type: CPU
Cache Info:
L1: 32768(0x8000) KB
Chip ID: 0(0x0)
ASIC Revision: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 3700
BDFID: 0
Internal Node ID: 0
Compute Unit: 4
SIMDs per CU: 0
Shader Engines: 0
Shader Arrs. per Eng.: 0
WatchPts on Addr. Ranges:1
Memory Properties:
Features: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 16318556(0xf9005c) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 2
Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED
Size: 16318556(0xf9005c) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 3
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 16318556(0xf9005c) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
ISA Info:
*******
Agent 2
*******
Name: gfx1102
Uuid: GPU-XX
Marketing Name: AMD Radeon RX 7600 XT
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 1
Device Type: GPU
Cache Info:
L1: 32(0x20) KB
L2: 2048(0x800) KB
Chip ID: 29824(0x7480)
ASIC Revision: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 2539
BDFID: 768
Internal Node ID: 1
Compute Unit: 32
SIMDs per CU: 2
Shader Engines: 2
Shader Arrs. per Eng.: 2
WatchPts on Addr. Ranges:4
Coherent Host Access: FALSE
Memory Properties:
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 32(0x20)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 32(0x20)
Max Work-item Per CU: 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 4294967295(0xffffffff)
y 4294967295(0xffffffff)
z 4294967295(0xffffffff)
Max fbarriers/Workgrp: 32
Packet Processor uCode:: 372
SDMA engine uCode:: 21
IOMMU Support:: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 16760832(0xffc000) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 16760832(0xffc000) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 3
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Recommended Granule:0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx1102
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 4294967295(0xffffffff)
y 4294967295(0xffffffff)
z 4294967295(0xffffffff)
FBarrier Max Size: 32
*** Done ***
The text was updated successfully, but these errors were encountered:
Hoping somebody who understands HIP/ROCm better than I do can help me understand what's going on here.
Using the version you get when you use "add AMDGPU", I get a core dump instantly.
By going into src/discovery/discovery.jl and moving
up to the top (it needs to come before libhsa gets loaded; placing it even one line lower brings the core dumps back):
the core dumps stop and everything seems to work normally.
Anybody have any ideas? Thanks for any help.
commit:
AMDGPU.versioninfo()
GDB backtrace:
rocminfo:
The text was updated successfully, but these errors were encountered: