Ferrite-FEM · Abdelrahman912 · Jan 11, 2024 · Jan 11, 2024 · Feb 24, 2024 · Feb 24, 2024
diff --git a/Project.toml b/Project.toml
@@ -16,11 +16,14 @@ Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
 [weakdeps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b"
 
 [extensions]
 FerriteBlockArrays = "BlockArrays"
+FerriteGPU = ["CUDA", "Adapt"]
 FerriteMetis = "Metis"
 
 [compat]
@@ -34,6 +37,7 @@ Preferences = "1"
 Reexport = "1"
 StaticArrays = "1"
 Tensors = "1.14"
+TimerOutputs = "0.5.25"
 WriteVTK = "1.13"
 julia = "1.10"
 

diff --git a/docs/Manifest.toml b/docs/Manifest.toml
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,5 +1,7 @@
 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"

diff --git a/docs/src/literate-tutorials/gpu_qp_heat_equation.jl b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -0,0 +1,182 @@
+using Ferrite
+using StaticArrays
+using SparseArrays
+using CUDA
+
+
+left = Tensor{1, 2, Float32}((0, -0)) # lower-left corner of the grid.
+
+right = Tensor{1, 2, Float32}((1000.0, 1000.0)) # upper-right corner of the grid.
+
+
+## 1000 × 1000 quadrilateral cells spanning `left` to `right`.
+grid = generate_grid(Quadrilateral, (1000, 1000), left, right)
+
+
+ip = Lagrange{RefQuadrilateral, 2}() # second-order (biquadratic) Lagrange interpolation on the reference quadrilateral
+
+
+## Quadrature rule and cell values both use Float32.
+qr = QuadratureRule{RefQuadrilateral}(Float32, 3)
+
+
+cellvalues = CellValues(Float32, qr, ip)
+
+
+## Single scalar field `:u` interpolated with `ip`.
+dh = DofHandler(grid)
+
+add!(dh, :u, ip)
+
+close!(dh);
+
+
+dh |> get_grid
+
+# Standard assembly of the element.
+## Accumulate the element matrix `Ke` and element vector `fe` from `cellvalues`,
+## which must already be reinitialized for the current cell.
+## Neither buffer is zeroed here; contributions are added in place. Returns `(Ke, fe)`.
+function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
+    nbf = getnbasefunctions(cellvalues)
+    nqp = getnquadpoints(cellvalues)
+    for qp in 1:nqp
+        ## Integration weight at this quadrature point
+        w = getdetJdV(cellvalues, qp)
+        for a in 1:nbf
+            ## Test function value and gradient
+            Na = shape_value(cellvalues, qp, a)
+            ∇Na = shape_gradient(cellvalues, qp, a)
+            fe[a] += Na * w
+            for b in 1:nbf
+                ## Trial function gradient
+                ∇Nb = shape_gradient(cellvalues, qp, b)
+                Ke[a, b] += (∇Na ⋅ ∇Nb) * w
+            end
+        end
+    end
+    return Ke, fe
+end
+
+
+## Allocate everything the standard CPU assembly needs: the global matrix `K`
+## and vector `f`, an assembler over both, and the local buffers `Ke` / `fe`.
+## Returns a named tuple with fields `f, K, assembler, Ke, fe`.
+function create_buffers(cellvalues, dh)
+    ## Global quantities
+    K = allocate_matrix(dh)
+    f = zeros(ndofs(dh))
+    assembler = start_assemble(K, f)
+    ## Local (per-element) quantities
+    nbf = getnbasefunctions(cellvalues)
+    Ke = zeros(nbf, nbf)
+    fe = zeros(nbf)
+    return (f = f, K = K, assembler = assembler, Ke = Ke, fe = fe)
+end
+
+
+# Standard global assembly
+
+
+## Assemble the global matrix and vector on the CPU and return `(K, f)`.
+## `Val(false)` selects the standard path (`reinit!` + `assemble_element_std!`);
+## `Val(true)` calls `assemble_element_qpiter!` with the cell coordinates.
+## NOTE(review): `assemble_element_qpiter!` is not defined in this tutorial as shown.
+function assemble_global!(cellvalues, dh::DofHandler, qp_iter::Val{QPiter}) where {QPiter}
+    buffers = create_buffers(cellvalues, dh)
+    Ke = buffers.Ke
+    fe = buffers.fe
+    ## Loop over all cells
+    for cell in CellIterator(dh)
+        ## Reset the local buffers before accumulating this cell
+        fill!(Ke, 0)
+        fill!(fe, 0)
+        if QPiter
+            assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell))
+        else
+            ## Reinitialize cellvalues for this cell, then compute its contribution
+            reinit!(cellvalues, cell)
+            assemble_element_std!(Ke, fe, cellvalues)
+        end
+        ## Scatter Ke and fe into K and f
+        assemble!(buffers.assembler, celldofs(cell), Ke, fe)
+    end
+    return buffers.K, buffers.f
+end
+
+
+## gpu version of element assembly
+
+
+## Accumulate the element matrix `Ke` and element vector `fe` for `cell`,
+## iterating over quadrature values built from `cv` and the cell coordinates.
+## Contributions are added in place; returns `nothing`.
+function assemble_element!(Ke, fe, cv, cell)
+    nbf = getnbasefunctions(cv)
+    for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell))
+        ## Integration weight at this quadrature point
+        w = getdetJdV(qv)
+        for i in 1:nbf
+            ## Value and gradient of shape function i
+            Ni = shape_value(qv, i)
+            ∇Ni = shape_gradient(qv, i)
+            fe[i] += Ni * w
+            for j in 1:nbf
+                ## Gradient of shape function j
+                ∇Nj = shape_gradient(qv, j)
+                Ke[i, j] += (∇Nj ⋅ ∇Ni) * w
+            end
+        end
+    end
+    return nothing
+end
+
+
+# gpu version of global assembly
+## Kernel body for the global assembly: iterate the cells provided by
+## `CellIterator(dh, n_basefuncs)`, compute each local contribution with
+## `assemble_element!`, and add it into `Kgpu` / `fgpu`. Returns `nothing`.
+function assemble_gpu!(Kgpu, fgpu, cv, dh)
+    nbf = convert(Int32, getnbasefunctions(cv))
+    assembler = start_assemble(Kgpu, fgpu; fillzero = false) ## has to be always false
+    for cell in CellIterator(dh, nbf)
+        ## Local buffers supplied by the iterator for this cell
+        ke_local = cellke(cell)
+        fe_local = cellfe(cell)
+        assemble_element!(ke_local, fe_local, cv, cell)
+        assemble!(assembler, celldofs(cell), ke_local, fe_local)
+    end
+    return nothing
+end
+
+
+n_basefuncs = getnbasefunctions(cellvalues)
+
+## Allocate the CPU matrix (Float32 values, Int32 indices)
+K = allocate_matrix(SparseMatrixCSC{Float32, Int32}, dh);
+
+#K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh);
+f = zeros(ndofs(dh));
+
+# Allocate GPU matrix
+## commented out to pass the test
+## Kgpu = CUSPARSE.CuSparseMatrixCSC(K);
+## fgpu = CUDA.zeros(ndofs(dh));
+
+n_cells = dh |> get_grid |> getncells
+
+# Kernel configuration
+## GPU kernel ##
+## commented out to pass the test
+## First initialize the kernel with the required configuration.
+## gpu_kernel = init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, (Kgpu, fgpu, cellvalues, dh))
+## Then launch the kernel, either with `gpu_kernel |> launch!` or `gpu_kernel()`
+## gpu_kernel()
+
+## CPU kernel ##
+## cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh));
+## cpu_kernel()
+
+## Standard (CPU, non-quadrature-iterator) assembly used as the reference.
+stassy(cv, dh) = assemble_global!(cv, dh, Val(false))
+
+## NOTE(review): both kernel launches above are commented out, so `K` has only
+## been allocated, not assembled, when its norm is taken here.
+norm(K)
+## commented out to pass the test
+## norm(Kgpu)
+Kstd, Fstd = stassy(cellvalues, dh);
+norm(Kstd)
+
+
+## GPU benchmarking, remove when not needed ##
+## function bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+##     Kgpu = CUSPARSE.CuSparseMatrixCSC(K);
+##     fgpu = CUDA.zeros(ndofs(dh));
+##     gpu_kernel = init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, (Kgpu, fgpu, cellvalues, dh))
+##     gpu_kernel()
+## end
+
+## CUDA.@time bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+## CUDA.@profile trace = true bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
diff --git a/ext/FerriteGPU.jl b/ext/FerriteGPU.jl
@@ -0,0 +1,20 @@
+module FerriteGPU
+# This module represents an extension of Ferrite.jl that uses a GPU backend for assembly, namely CUDA.jl
+
+using Ferrite
+using CUDA
+using Adapt
+using Base:
+    @propagate_inbounds
+using SparseArrays:
+    AbstractSparseArray
+using StaticArrays:
+    SVector, MVector
+
+
+# Implementation files for the extension, included in this order.
+include("GPU/gpu_assembler.jl")
+include("GPU/CUDAKernelLauncher.jl")
+include("GPU/cuda_iterator.jl")
+include("GPU/adapt.jl")
+
+end
diff --git a/ext/GPU/CUDAKernelLauncher.jl b/ext/GPU/CUDAKernelLauncher.jl
@@ -0,0 +1,139 @@
+## This file implements the launch of GPU kernels on the CUDA backend ##
+
+"""
+    Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+
+Create a lazily-launched kernel for the CUDA backend. Nothing is compiled or
+executed here; the returned object only records what to run.
+
+# Arguments
+- `::Type{BackendCUDA}`: Selects the CUDA backend.
+- `n_cells::Ti`: Number of cells in the problem.
+- `n_basefuncs::Ti`: Number of shape functions per cell (same integer type as `n_cells`).
+- `kernel::Function`: The kernel function to execute.
+- `args::Tuple`: Arguments passed to `kernel` at launch.
+
+# Returns
+- A `LazyKernel` built from the arguments above and tagged with `BackendCUDA`.
+
+# Errors
+Throws an `ArgumentError` when `CUDA.functional()` is `false`.
+"""
+function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    ## Fail early when no usable CUDA setup is available
+    CUDA.functional() || throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
+    return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
+end
+
+"""
+    Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
+
+Launch a CUDA kernel encapsulated in a `LazyKernel` object and wait for it to finish.
+
+The block size is the smallest of the number of cells, the value suggested by
+`launch_configuration`, and 256. If the shared memory computed by
+`_calculate_shared_memory` passes `_can_use_dynshmem`, the kernel is launched with
+that amount of dynamic shared memory. Otherwise the `AbstractDofHandler` in the
+kernel arguments is replaced by a `LocalsGPUDofHandler` backed by global-memory
+arrays for the element matrices and vectors, and the kernel is launched with those
+arguments instead.
+
+# Arguments
+- `kernel::LazyKernel`: The kernel to be launched, along with its configuration.
+
+# Returns
+- `nothing`, on every path. A kernel with no cells (`n_cells <= 0`) is a no-op.
+
+# Errors
+Throws an `ErrorException` (from `_to_localdh`) if the global-memory fallback is
+needed and the kernel arguments contain no `AbstractDofHandler`.
+"""
+function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
+    n_cells = kernel.n_cells
+    n_basefuncs = kernel.n_basefuncs
+    ker = kernel.kernel
+    args = kernel.args
+
+    ## Nothing to do for an empty problem; this also avoids `threads == 0`,
+    ## which would make `cld(n_cells, threads)` throw a `DivideError`.
+    n_cells > 0 || return nothing
+
+    ## Compile without launching so a launch configuration can be queried.
+    ## A separate name is used so the `kernel` argument is not rebound.
+    compiled = @cuda launch = false ker(args...)
+    config = launch_configuration(compiled.fun)
+    threads = convert(Ti, min(n_cells, config.threads, 256))
+    shared_mem = _calculate_shared_memory(threads, n_basefuncs)
+    blocks = _calculate_nblocks(threads, n_cells)
+
+    ## use dynamic shared memory if possible
+    if _can_use_dynshmem(shared_mem)
+        CUDA.@sync compiled(args...; threads, blocks, shmem = shared_mem)
+        return nothing
+    end
+
+    ## otherwise use global memory: one (Ke, fe) slot per launched thread
+    nes = blocks * threads
+    kes = CUDA.zeros(Float32, nes, n_basefuncs, n_basefuncs)
+    fes = CUDA.zeros(Float32, nes, n_basefuncs)
+    local_args = _to_localdh(args, kes, fes)
+    CUDA.@sync @cuda blocks = blocks threads = threads ker(local_args...)
+    return nothing
+end
+
+"""
+    _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
+
+Return a copy of `args` in which the first `AbstractDofHandler` is replaced by a
+`LocalsGPUDofHandler` wrapping that handler together with `kes` and `fes`.
+
+# Arguments
+- `args::Tuple`: Kernel arguments.
+- `kes::AbstractArray`: Storage for element stiffness matrices.
+- `fes::AbstractArray`: Storage for element force vectors.
+
+# Returns
+- `Tuple`: Same length and order as `args`; only the dof-handler entry differs.
+
+# Errors
+Throws an `ErrorException` if no `AbstractDofHandler` is found in `args`.
+"""
+function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
+    dh_pos = findfirst(a -> a isa Ferrite.AbstractDofHandler, args)
+    if dh_pos === nothing
+        throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
+    end
+    wrapped = LocalsGPUDofHandler(args[dh_pos], kes, fes)
+    ## Rebuild the tuple, swapping in the wrapped handler at its original position
+    return ntuple(i -> i == dh_pos ? wrapped : args[i], length(args))
+end
+
+"""
+    _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
+
+Compute the number of bytes needed to hold, for every thread of a block, one
+`n_basefuncs × n_basefuncs` matrix and one length-`n_basefuncs` vector of `Float32`.
+
+# Arguments
+- `threads::Integer`: Number of threads per block.
+- `n_basefuncs::Integer`: Number of basis functions per cell.
+
+# Returns
+- `Integer`: Amount of shared memory in bytes.
+"""
+function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
+    ## Bytes for one length-`n_basefuncs` Float32 vector per thread
+    vector_bytes = sizeof(Float32) * (threads) * (n_basefuncs)
+    ## The matrices need `n_basefuncs` such vectors per thread
+    matrix_bytes = vector_bytes * n_basefuncs
+    return matrix_bytes + vector_bytes
+end
+
+"""
+    _can_use_dynshmem(required_shmem::Integer)
+
+Check whether `required_shmem` bytes is strictly below the current device's
+`CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK` attribute.
+
+# Arguments
+- `required_shmem::Integer`: Required shared memory size in bytes.
+
+# Returns
+- `Bool`: `true` if the request is below the per-block limit; `false` otherwise.
+"""
+function _can_use_dynshmem(required_shmem::Integer)
+    shmem_limit = CUDA.attribute(device(), CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
+    return shmem_limit > required_shmem
+end
+
+"""
+    _calculate_nblocks(threads::Integer, n_cells::Integer)
+
+Compute how many blocks to launch: enough blocks of `threads` threads to cover
+`n_cells`, capped at twice the multiprocessor count of the current device.
+
+# Arguments
+- `threads::Integer`: Number of threads per block.
+- `n_cells::Integer`: Total number of cells to process.
+
+# Returns
+- `Integer`: Number of blocks to launch.
+"""
+function _calculate_nblocks(threads::Integer, n_cells::Integer)
+    sm_count = CUDA.attribute(device(), CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
+    block_cap = 2 * sm_count
+    needed_blocks = cld(n_cells, threads)
+    return needed_blocks < block_cap ? needed_blocks : block_cap
+end