add working imp for global gpu mem

Ferrite-FEM · Nov 18, 2024 · c7f4b0f · c7f4b0f
1 parent bc8ec95
commit c7f4b0f
Show file tree

Hide file tree

Showing 6 changed files with 78 additions and 24 deletions.
diff --git a/docs/src/literate-tutorials/gpu_qp_heat_equation.jl b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -2,15 +2,14 @@ using Ferrite
 using StaticArrays
 using SparseArrays
 using CUDA
-using TimerOutputs
 
 
 left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid.
 
-right = Tensor{1, 2, Float32}((1000.0, 1000.0)) # define the right top corner of the grid.
+right = Tensor{1, 2, Float32}((100.0, 100.0)) # define the right top corner of the grid.
 
 
-grid = generate_grid(Quadrilateral, (1000, 1000), left, right)
+grid = generate_grid(Quadrilateral, (100, 100), left, right)
 
 
 ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. Bilinear lagrange)

diff --git a/ext/GPU/CUDAKernelLauncher.jl b/ext/GPU/CUDAKernelLauncher.jl
@@ -30,18 +30,17 @@ function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
     _can_use_dynshmem(shared_mem) && return kernel(args...; threads, blocks, shmem = shared_mem)
 
     ## otherwise use global memory
-    device = device()
-    warp_size = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE)
-    nes = blocks * warp_size # Allocate only based on the number of active threads
-    kes = CUDA.zeros(Float32, nes * n_basefuncs * n_basefuncs)
-    fes = CUDA.zeros(Float32, nes * n_basefuncs)
+    nes = blocks * threads
+    kes = CUDA.zeros(Float32, nes, n_basefuncs, n_basefuncs)
+    fes = CUDA.zeros(Float32, nes, n_basefuncs)
     args = _to_localdh(args, kes, fes)
-    return kernel(args...; threads, blocks)
+    @cuda blocks = blocks threads = threads ker(args...)
+    return nothing
 end
 
 
 function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
-    dh_index = findfirst(x -> x isa AbstractDofHandler, args)
+    dh_index = findfirst(x -> x isa Ferrite.AbstractDofHandler, args)
     dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
     arr = args |> collect
     local_dh = LocalsGPUDofHandler(arr[dh_index], kes, fes)
@@ -54,9 +53,9 @@ function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
 end
 
 
-function _can_use_dynshmem(required_shmem::Int32)
+function _can_use_dynshmem(required_shmem::Integer)
     dev = device()
-    MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN) #size of dynamic shared memory
+    MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) #size of dynamic shared memory
     return required_shmem < MAX_DYN_SHMEM
 end
 

diff --git a/ext/GPU/adapt.jl b/ext/GPU/adapt.jl
@@ -70,6 +70,14 @@ function Adapt.adapt_structure(to, dh::DofHandler)
 end
 
 
+function Adapt.adapt_structure(to, dh::LocalsGPUDofHandler)
+    dh_ = Adapt.adapt_structure(to, dh |> dofhandler)
+    Kes = Adapt.adapt_structure(to, dh |> localkes)
+    fes = Adapt.adapt_structure(to, dh |> localfes)
+    return LocalsGPUDofHandler(dh_, Kes, fes)
+end
+
+
 function Adapt.adapt_structure(to, assembler::GPUAssemblerSparsityPattern)
     K = Adapt.adapt_structure(to, assembler.K)
     f = Adapt.adapt_structure(to, assembler.f)

diff --git a/ext/GPU/cuda_iterator.jl b/ext/GPU/cuda_iterator.jl
@@ -1,13 +1,16 @@
 ##### GPUCellIterator #####
 
+
+abstract type AbstractCUDACellIterator <: Ferrite.AbstractKernelCellIterator end
+
 """
     CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem}
 
 Create `CUDACellIterator` object for each thread with local id `thread_id` in order to iterate over some elements in the grid
 on the GPU and these elements are associated with the thread based on a stride = `blockDim().x * gridDim().x`.
 The elements of the iterator are `GPUCellCache` objects.
 """
-struct CUDACellIterator{DH <: Ferrite.AbstractGPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, KDynamicSharedMem, FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
+struct CUDACellIterator{DH <: Ferrite.GPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, KDynamicSharedMem, FDynamicSharedMem} <: AbstractCUDACellIterator
     dh::DH # TODO: subdofhandlers are not supported yet.
     grid::GRID
     n_cells::Int32
@@ -29,7 +32,8 @@ Arguments:
 Returns:
 - A `CUDACellIterator` object.
 """
-function Ferrite.CellIterator(dh::Ferrite.AbstractGPUDofHandler, n_basefuncs::Int32)
+function Ferrite.CellIterator(dh::Ferrite.GPUDofHandler, n_basefuncs::Int32)
+    ## cell iterator that uses dynamic shared memory
     grid = get_grid(dh)
     n_cells = grid |> getncells |> Int32
     bd = blockDim().x
@@ -39,35 +43,61 @@ function Ferrite.CellIterator(dh::Ferrite.AbstractGPUDofHandler, n_basefuncs::In
     return CUDACellIterator(dh, grid, n_cells, ke_shared, fe_shared, local_thread_id)
 end
 
+
 """
-    ncells(iterator::CUDACellIterator)
+    ncells(iterator::AbstractCUDACellIterator)
 
 Return the total number of cells in the grid that the iterator is iterating over.
 
 Arguments:
-- `iterator`: The `CUDACellIterator` object.
+- `iterator`: The subtype of `AbstractCUDACellIterator` type.
 
 Returns:
 - The total number of cells as `Int32`.
 """
-ncells(iterator::CUDACellIterator) = iterator.n_cells
+ncells(iterator::AbstractCUDACellIterator) = iterator.n_cells
 
 
-function Base.iterate(iterator::CUDACellIterator)
+function Base.iterate(iterator::AbstractCUDACellIterator)
     i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x # global thread id
     i <= iterator.n_cells || return nothing
     return (_makecache(iterator, i), i)
 end
 
 
-function Base.iterate(iterator::CUDACellIterator, state)
+function Base.iterate(iterator::AbstractCUDACellIterator, state)
     stride = blockDim().x * gridDim().x
     i = state + stride # next strided element id
     i <= iterator.n_cells || return nothing
     return (_makecache(iterator, i), i)
 end
 
 
+## Cell iterator that uses global memory ##
+
+struct CUDAGlobalCellIterator{DH <: Ferrite.GPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, MAT, VEC} <: AbstractCUDACellIterator
+    dh::DH # TODO: subdofhandlers are not supported yet.
+    grid::GRID
+    n_cells::Int32
+    ke::MAT # reference to the global memory for the stiffness matrix
+    fe::VEC  # reference to the global memory for the force vector
+    thread_id::Int32 # local thread id
+end
+
+function Ferrite.CellIterator(dh_::Ferrite.LocalsGPUDofHandler, n_basefuncs::Int32)
+    ## cell iterator that uses global memory
+    dh = dh_ |> dofhandler
+    grid = get_grid(dh)
+    n_cells = grid |> getncells |> Int32
+    bd = blockDim().x
+    local_thread_id = threadIdx().x
+    global_thread_id = (blockIdx().x - Int32(1)) * bd + local_thread_id
+    ke = cellke(dh_, global_thread_id)
+    fe = cellfe(dh_, global_thread_id)
+    return CUDAGlobalCellIterator(dh, grid, n_cells, ke, fe, local_thread_id)
+end
+
+
 ##### GPUCellCache #####
 
 """
@@ -96,6 +126,18 @@ end
 
 
 function _makecache(iterator::CUDACellIterator, e::Int32)
+    ke_fun = () -> (@view iterator.block_ke[iterator.thread_id, :, :])
+    fe_fun = () -> (@view iterator.block_fe[iterator.thread_id, :])
+    return _makecache(iterator, e, ke_fun, fe_fun)
+end
+
+function _makecache(iterator::CUDAGlobalCellIterator, e::Int32)
+    ke_fun = () -> iterator.ke
+    fe_fun = () -> iterator.fe
+    return _makecache(iterator, e, ke_fun, fe_fun)
+end
+
+function _makecache(iterator::AbstractCUDACellIterator, e::Int32, ke_func::Function, fe_func::Function)
     dh = iterator.dh
     grid = iterator.grid
     cellid = e
@@ -117,7 +159,7 @@ function _makecache(iterator::CUDACellIterator, e::Int32)
     coords = SVector(x...)
 
     # Return the GPUCellCache containing the cell's data.
-    return GPUCellCache(coords, dofs, cellid, nodes, (@view iterator.block_ke[iterator.thread_id, :, :]), (@view iterator.block_fe[iterator.thread_id, :, :]))
+    return GPUCellCache(coords, dofs, cellid, nodes, ke_func(), fe_func())
 end
 
 """

diff --git a/src/GPU/GPUDofHandler.jl b/src/GPU/GPUDofHandler.jl
@@ -1,7 +1,7 @@
 # This file defines the `GPUDofHandler` type, which is a degree of freedom handler that is stored on the GPU.
 # Therefore most of the functions are same as the ones defined in dof_handler.jl, but executable on the GPU.
 
-abstract type AbstractGPUDofHandler <: AbstractDofHandler  end
+abstract type AbstractGPUDofHandler <: Ferrite.AbstractDofHandler  end
 
 struct GPUDofHandler{CDOFS <: AbstractArray{<:Number, 1}, VEC_INT <: AbstractArray{Int32, 1}, GRID <: AbstractGrid} <: AbstractGPUDofHandler
     cell_dofs::CDOFS
@@ -35,5 +35,7 @@ end
 
 ## Accessors ##
 dofhandler(dh::LocalsGPUDofHandler) = dh.dh
-cellke(dh::LocalsGPUDofHandler, e::Int32) = @view dh.Kes[e, :, :]
-cellfe(dh::LocalsGPUDofHandler, e::Int32) = @view dh.fes[e, :]
+localkes(dh::LocalsGPUDofHandler) = dh.Kes
+localfes(dh::LocalsGPUDofHandler) = dh.fes
+cellke(dh::LocalsGPUDofHandler, i::Int32) = @view dh.Kes[i, :, :]
+cellfe(dh::LocalsGPUDofHandler, i::Int32) = @view dh.fes[i, :]
diff --git a/src/exports.jl b/src/exports.jl
@@ -201,4 +201,8 @@ export
     getweights,
     getbackend,
     BackendCUDA,
-    BackendCPU
+    BackendCPU,
+    LocalsGPUDofHandler,
+    dofhandler,
+    localfes,
+    localkes