Ferrite-FEM · Abdelrahman912 · Jan 11, 2024 · Jan 11, 2024 · Feb 24, 2024 · Feb 24, 2024
diff --git a/Project.toml b/Project.toml
@@ -3,6 +3,10 @@ uuid = "c061ca5d-56c9-439f-9c0e-210fe06d3992"
 version = "0.3.14"
 
 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/docs/Manifest.toml b/docs/Manifest.toml
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,5 +1,7 @@
 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 Ferrite = "c061ca5d-56c9-439f-9c0e-210fe06d3992"

diff --git a/docs/src/literate-tutorials/gpu_qp_heat_equation.jl b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -0,0 +1,260 @@
+using Ferrite, CUDA
+using StaticArrays
+using SparseArrays
+using Adapt
+using BenchmarkTools
+using Test
+using NVTX
+
+
+
+## Problem setup. Everything is built with Float32 to match the GPU assembly below.
+left = Tensor{1,2,Float32}((0.0f0, 0.0f0)) # bottom-left corner of the grid
+right = Tensor{1,2,Float32}((100.0f0, 100.0f0)) # top-right corner of the grid
+
+## 100 x 100 quadrilateral cells spanning the rectangle between `left` and `right`
+grid = generate_grid(Quadrilateral, (100, 100), left, right)
+
+## Cell coloring; the indices are narrowed to Int32 to reduce the number of registers
+colors = map(c -> Int32.(c), create_coloring(grid))
+
+## Bilinear Lagrange interpolation with a Float32 quadrature rule and cell values
+ip = Lagrange{RefQuadrilateral, 1}()
+qr = QuadratureRule{RefQuadrilateral}(Float32, 2)
+cellvalues = CellValues(Float32, qr, ip)
+
+## A single field `:u` interpolated with `ip`
+dh = DofHandler(grid)
+add!(dh, :u, ip)
+close!(dh);
+
+
+# Standard (CPU) element assembly.
+"""
+    assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
+
+Accumulate the element contributions into `Ke` and `fe` and return `(Ke, fe)`.
+
+For every quadrature point of `cellvalues`, `fe[a]` is incremented by
+`shape_value * dΩ` and `Ke[a, b]` by `(∇N_a ⋅ ∇N_b) * dΩ`. Both buffers are only
+added to, never reset, so the caller is responsible for zeroing them beforehand.
+"""
+function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
+    nbase = getnbasefunctions(cellvalues)
+    for qp in 1:getnquadpoints(cellvalues)
+        ## Integration weight of this quadrature point
+        dΩ = getdetJdV(cellvalues, qp)
+        for a in 1:nbase
+            ## Test function value and gradient
+            Na = shape_value(cellvalues, qp, a)
+            ∇Na = shape_gradient(cellvalues, qp, a)
+            fe[a] += Na * dΩ
+            for b in 1:nbase
+                ## Trial function gradient
+                ∇Nb = shape_gradient(cellvalues, qp, b)
+                Ke[a, b] += (∇Na ⋅ ∇Nb) * dΩ
+            end
+        end
+    end
+    return Ke, fe
+end
+
+
+"""
+    create_buffers(cellvalues, dh)
+
+Allocate everything the CPU assembly loop needs and return it as the named tuple
+`(; f, K, assembler, Ke, fe)`: the global vector `f` and matrix `K`, an assembler
+started on them, and the element-local `Ke` / `fe` sized by the number of base
+functions of `cellvalues`.
+"""
+function create_buffers(cellvalues, dh)
+    ## Element-local buffers
+    nbase = getnbasefunctions(cellvalues)
+    Ke = zeros(nbase, nbase)
+    fe = zeros(nbase)
+    ## Global system and its assembler
+    K = allocate_matrix(dh)
+    f = zeros(ndofs(dh))
+    assembler = start_assemble(K, f)
+    return (f = f, K = K, assembler = assembler, Ke = Ke, fe = fe)
+end
+
+
+# Standard global assembly
+
+"""
+    assemble_global!(cellvalues, dh::DofHandler, qp_iter::Val{QPiter})
+
+Assemble the global matrix and vector on the CPU and return `(K, f)`.
+
+The buffers come from `create_buffers`. With `Val(false)` each cell is handled by
+`reinit!` followed by `assemble_element_std!`. With `Val(true)` the cell coordinates
+are handed to `assemble_element_qpiter!`, which is not defined in the visible part
+of this file (NOTE(review): confirm it exists before enabling that path).
+"""
+function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where {QPiter}
+    buffers = create_buffers(cellvalues, dh)
+    Ke, fe = buffers.Ke, buffers.fe
+    ## Loop over all cells
+    for cell in CellIterator(dh)
+        ## Element buffers are accumulated into, so reset them for every cell
+        fill!(Ke, 0)
+        fill!(fe, 0)
+        if QPiter
+            assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell))
+        else
+            reinit!(cellvalues, cell)
+            assemble_element_std!(Ke, fe, cellvalues)
+        end
+        ## Scatter the element contribution into the global system
+        assemble!(buffers.assembler, celldofs(cell), Ke, fe)
+    end
+    return buffers.K, buffers.f
+end
+
+
+
+"""
+    assemble_element_gpu!(assembler, cv, dh, n_cells_colored, eles_colored)
+
+Kernel body for the colored GPU assembly: every thread handles one entry of
+`eles_colored` and always returns `nothing`.
+
+The thread's linear index is built from `threadIdx`, `blockIdx` and `blockDim`.
+Threads whose index exceeds `n_cells_colored` return immediately. The local matrix
+and vector have the literal sizes 4x4 and 4, so the loops over
+`getnbasefunctions(cv)` require at most four base functions per cell.
+"""
+#=NVTX.@annotate=# function assemble_element_gpu!(assembler,cv,dh,n_cells_colored, eles_colored)
+    ## Position of this thread within the current color (1-based)
+    color_pos = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
+    if color_pos > n_cells_colored
+        return nothing
+    end
+
+    ## Global cell index and its coordinates
+    cell_id = eles_colored[color_pos]
+    coords = getcoordinates(dh.grid, cell_id)
+    n_basefuncs = getnbasefunctions(cv)
+
+    ## Note: sizing these with n_basefuncs instead of the literal 4 throws an error,
+    ## because that kind of dynamism is not supported on the GPU.
+    ke = MMatrix{4,4,Float32}(undef)
+    fe = MVector{4,Float32}(undef)
+    fill!(ke, 0.0f0)
+    fill!(fe, 0.0f0)
+
+    ## Loop over quadrature points
+    for qv in Ferrite.QuadratureValuesIterator(cv, coords)
+        ## Integration weight of this quadrature point
+        dΩ = getdetJdV(qv)
+        for a in 1:n_basefuncs
+            ## Test function value and gradient
+            Na = shape_value(qv, a)
+            ∇Na = shape_gradient(qv, a)
+            fe[a] += Na * dΩ
+            for b in 1:n_basefuncs
+                ## Trial function gradient
+                ∇Nb = shape_gradient(qv, b)
+                ke[a, b] += (∇Na ⋅ ∇Nb) * dΩ
+            end
+        end
+    end
+
+    ## Hand over immutable copies: passing the mutable ke/fe throws an error
+    assemble!(assembler, celldofs(dh, cell_id), SMatrix(ke), SVector(fe))
+
+    return nothing
+end
+
+
+
+# `Adapt.@adapt_structure T` defines an `Adapt.adapt_structure` method for `T` that adapts
+# each field and rebuilds the object. It is declared here for Ferrite's GPU-side grid,
+# dof handler and assembler types (presumably what the host objects are converted to
+# before a kernel launch; TODO confirm against the Ferrite GPU sources).
+Adapt.@adapt_structure Ferrite.GPUGrid
+Adapt.@adapt_structure Ferrite.GPUDofHandler
+Adapt.@adapt_structure Ferrite.GPUAssemblerSparsityPattern
+
+"""
+    assemble_global_gpu_color(cellvalues, dh, colors)
+
+Assemble the global system on the GPU, one kernel launch per color, and return
+`(Kgpu, fgpu)`.
+
+`Kgpu` is a `CuSparseMatrixCSC` built from a `SparseMatrixCSC{Float32, Int32}`
+pattern and `fgpu` a zero-initialised CUDA vector of length `ndofs(dh)`. The dof
+handler, assembler and cell values are adapted once up front. Each color is uploaded
+once and the kernel is launched with exactly the arguments it was compiled for.
+Empty colors are skipped.
+"""
+#=NVTX.@annotate=# function assemble_global_gpu_color(cellvalues,dh,colors)
+    K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh)
+    Kgpu = CUSPARSE.CuSparseMatrixCSC(K)
+    fgpu = CUDA.zeros(ndofs(dh))
+    assembler = start_assemble(Kgpu, fgpu)
+    ## Adapt the host-side structures once; they are shared by all color launches
+    dh_gpu = Adapt.adapt_structure(CuArray, dh)
+    assembler_gpu = Adapt.adapt_structure(CUDA.KernelAdaptor(), assembler)
+    cellvalues_gpu = Adapt.adapt_structure(CuArray, cellvalues)
+    for color in colors
+        ## Nothing to launch for an empty color (and cld(0, 0) below would throw)
+        isempty(color) && continue
+        n_cells = Int32(length(color))
+        ## Upload the color once and reuse it for compilation and launch
+        color_gpu = cu(color)
+        kernel = @cuda launch=false assemble_element_gpu!(
+            assembler_gpu, cellvalues_gpu, dh_gpu, n_cells, color_gpu
+        )
+        ## @show CUDA.registers(kernel)
+        config = launch_configuration(kernel.fun)
+        threads = min(length(color), config.threads)
+        blocks = cld(length(color), threads)
+        ## Launch with the adapted cell values, i.e. what the kernel was compiled with
+        kernel(assembler_gpu, cellvalues_gpu, dh_gpu, n_cells, color_gpu; threads, blocks)
+    end
+    return Kgpu,fgpu
+end
+
+
+# An alternative way to launch the kernel, using a macro.
+"""
+    assemble_global_gpu_color_macro(cellvalues, dh, colors)
+
+Same result shape as the explicit launcher, `(Kgpu, fgpu)`, but the adaption and
+kernel launch are delegated to `@run_gpu`, which is defined outside the visible part
+of this file.
+"""
+function assemble_global_gpu_color_macro(cellvalues,dh,colors)
+    ## Device-side system: sparse Float32/Int32 matrix and zero right-hand side
+    Kgpu = CUSPARSE.CuSparseMatrixCSC(allocate_matrix(SparseMatrixCSC{Float32, Int32}, dh))
+    fgpu = CUDA.zeros(ndofs(dh))
+    assembler = start_assemble(Kgpu, fgpu)
+
+    ## Kernel adaption and launch are handled by the macro
+    @run_gpu(assemble_element_gpu!, assembler, cellvalues, dh, colors)
+    return Kgpu, fgpu
+end
+
+
+
+
+
+## CPU reference: the standard assembly path (no quadrature-point iterator)
+stassy(cv,dh) = assemble_global!(cv,dh,Val(false))
+
+## qpassy(cv,dh) = assemble_global!(cv,dh,Val(true))
+
+## Benchmark the GPU assembly. Every global is interpolated with `$` so that the
+## benchmark does not reference untyped globals; `@btime` comes from BenchmarkTools.
+Kgpu, fgpu = @btime CUDA.@sync assemble_global_gpu_color($cellvalues, $dh, $colors);
+## Kgpu, fgpu = CUDA.@profile assemble_global_gpu_color(cellvalues, dh, colors)
+##
+## Profiling with Nsight Compute: start Julia with `ncu --mode=launch julia`, then open
+## Nsight Compute and attach the profiler to the Julia instance.
+## Ref: https://cuda.juliagpu.org/v2.2/development/profiling/#NVIDIA-Nsight-Compute
+## Profiling with Nsight Systems: `nsys profile --trace=nvtx julia rmse_kernel_v1.jl`
+
+## mKgpu, mfgpu = assemble_global_gpu_color_macro(cellvalues, dh, colors)
+
+norm(Kgpu)
+
+## Kstd, Fstd = @btime stassy($cellvalues, $dh);
+Kstd, Fstd = stassy(cellvalues, dh);
+norm(Kstd)
+
+## Compare the GPU assembly against the CPU reference on ten randomly sized grids.
+@testset "GPU Heat Equation" begin
+
+    for _ in 1:10
+        ## Bottom-left corner in physical coordinates: random Float32 between -100 and -1
+        bl_x = rand(Float32) * (-99) - 1
+        bl_y = rand(Float32) * (-99) - 1
+
+        ## Top-right corner in physical coordinates: random Float32 between 0 and 100
+        tr_x = rand(Float32) * 100
+        tr_y = rand(Float32) * 100
+
+        n_x = rand(1:100)   # number of cells in x direction
+        n_y = rand(1:100)   # number of cells in y direction
+
+        left = Tensor{1,2,Float32}((bl_x,bl_y)) # bottom-left corner of the grid
+        right = Tensor{1,2,Float32}((tr_x,tr_y)) # top-right corner of the grid
+
+        grid = generate_grid(Quadrilateral, (n_x, n_y),left,right)
+
+        ## Int32 indices to reduce the number of registers
+        colors = create_coloring(grid) .|> (x -> Int32.(x))
+
+        ip = Lagrange{RefQuadrilateral, 1}() # bilinear Lagrange interpolation
+
+        ## Same constructor form as the top-level setup of this script
+        qr = QuadratureRule{RefQuadrilateral}(Float32, 2)
+
+        cellvalues = CellValues(Float32,qr, ip)
+
+        dh = DofHandler(grid)
+        add!(dh, :u, ip)
+        close!(dh);
+
+        ## The CPU version:
+        Kstd , Fstd =  stassy(cellvalues,dh);
+
+        ## The GPU version
+        Kgpu, fgpu =  assemble_global_gpu_color(cellvalues,dh,colors)
+
+        ## NOTE(review): this is a purely absolute tolerance on Float32 results; it may be
+        ## too tight for strongly stretched random grids. Consider `rtol` (TODO confirm).
+        @test norm(Kstd) ≈ norm(Kgpu) atol=1e-4
+    end
+end