Skip to content

Commit

Permalink
add working imp for global gpu mem
Browse files Browse the repository at this point in the history
  • Loading branch information
Abdelrahman912 committed Nov 18, 2024
1 parent bc8ec95 commit c7f4b0f
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 24 deletions.
5 changes: 2 additions & 3 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@ using Ferrite
using StaticArrays
using SparseArrays
using CUDA
using TimerOutputs


left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid.

right = Tensor{1, 2, Float32}((1000.0, 1000.0)) # define the right top corner of the grid.
right = Tensor{1, 2, Float32}((100.0, 100.0)) # define the right top corner of the grid.


grid = generate_grid(Quadrilateral, (1000, 1000), left, right)
grid = generate_grid(Quadrilateral, (100, 100), left, right)


ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. Bilinear lagrange)
Expand Down
17 changes: 8 additions & 9 deletions ext/GPU/CUDAKernelLauncher.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,17 @@ function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
_can_use_dynshmem(shared_mem) && return kernel(args...; threads, blocks, shmem = shared_mem)

## otherwise use global memory
device = device()
warp_size = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE)
nes = blocks * warp_size # Allocate only based on the number of active threads
kes = CUDA.zeros(Float32, nes * n_basefuncs * n_basefuncs)
fes = CUDA.zeros(Float32, nes * n_basefuncs)
nes = blocks * threads
kes = CUDA.zeros(Float32, nes, n_basefuncs, n_basefuncs)
fes = CUDA.zeros(Float32, nes, n_basefuncs)
args = _to_localdh(args, kes, fes)
return kernel(args...; threads, blocks)
@cuda blocks = blocks threads = threads ker(args...)
return nothing
end


function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
dh_index = findfirst(x -> x isa AbstractDofHandler, args)
dh_index = findfirst(x -> x isa Ferrite.AbstractDofHandler, args)
dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
arr = args |> collect
local_dh = LocalsGPUDofHandler(arr[dh_index], kes, fes)
Expand All @@ -54,9 +53,9 @@ function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
end


function _can_use_dynshmem(required_shmem::Int32)
function _can_use_dynshmem(required_shmem::Integer)
dev = device()
MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN) #size of dynamic shared memory
MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) #size of dynamic shared memory
return required_shmem < MAX_DYN_SHMEM
end

Expand Down
8 changes: 8 additions & 0 deletions ext/GPU/adapt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,14 @@ function Adapt.adapt_structure(to, dh::DofHandler)
end


function Adapt.adapt_structure(to, dh::LocalsGPUDofHandler)
dh_ = Adapt.adapt_structure(to, dh |> dofhandler)
Kes = Adapt.adapt_structure(to, dh |> localkes)
fes = Adapt.adapt_structure(to, dh |> localfes)
return LocalsGPUDofHandler(dh_, Kes, fes)
end


function Adapt.adapt_structure(to, assembler::GPUAssemblerSparsityPattern)
K = Adapt.adapt_structure(to, assembler.K)
f = Adapt.adapt_structure(to, assembler.f)
Expand Down
58 changes: 50 additions & 8 deletions ext/GPU/cuda_iterator.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
##### GPUCellIterator #####


abstract type AbstractCUDACellIterator <: Ferrite.AbstractKernelCellIterator end

"""
CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem}
Create `CUDACellIterator` object for each thread with local id `thread_id` in order to iterate over some elements in the grid
on the GPU and these elements are associated with the thread based on a stride = `blockDim().x * gridDim().x`.
The elements of the iterator are `GPUCellCache` objects.
"""
struct CUDACellIterator{DH <: Ferrite.AbstractGPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, KDynamicSharedMem, FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
struct CUDACellIterator{DH <: Ferrite.GPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, KDynamicSharedMem, FDynamicSharedMem} <: AbstractCUDACellIterator
dh::DH # TODO: subdofhandlers are not supported yet.
grid::GRID
n_cells::Int32
Expand All @@ -29,7 +32,8 @@ Arguments:
Returns:
- A `CUDACellIterator` object.
"""
function Ferrite.CellIterator(dh::Ferrite.AbstractGPUDofHandler, n_basefuncs::Int32)
function Ferrite.CellIterator(dh::Ferrite.GPUDofHandler, n_basefuncs::Int32)
## cell iterator that uses dynamic shared memory
grid = get_grid(dh)
n_cells = grid |> getncells |> Int32
bd = blockDim().x
Expand All @@ -39,35 +43,61 @@ function Ferrite.CellIterator(dh::Ferrite.AbstractGPUDofHandler, n_basefuncs::In
return CUDACellIterator(dh, grid, n_cells, ke_shared, fe_shared, local_thread_id)
end


"""
ncells(iterator::CUDACellIterator)
ncells(iterator::AbstractCUDACellIterator)
Return the total number of cells in the grid that the iterator is iterating over.
Arguments:
- `iterator`: The `CUDACellIterator` object.
- `iterator`: The subtype of `AbstractCUDACellIterator` type.
Returns:
- The total number of cells as `Int32`.
"""
ncells(iterator::CUDACellIterator) = iterator.n_cells
ncells(iterator::AbstractCUDACellIterator) = iterator.n_cells


function Base.iterate(iterator::CUDACellIterator)
function Base.iterate(iterator::AbstractCUDACellIterator)
i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x # global thread id
i <= iterator.n_cells || return nothing
return (_makecache(iterator, i), i)
end


function Base.iterate(iterator::CUDACellIterator, state)
function Base.iterate(iterator::AbstractCUDACellIterator, state)
stride = blockDim().x * gridDim().x
i = state + stride # next strided element id
i <= iterator.n_cells || return nothing
return (_makecache(iterator, i), i)
end


## Cell iterator that uses global memory ##

struct CUDAGlobalCellIterator{DH <: Ferrite.GPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, MAT, VEC} <: AbstractCUDACellIterator
dh::DH # TODO: subdofhandlers are not supported yet.
grid::GRID
n_cells::Int32
ke::MAT # reference to the global memory for the stiffness matrix
fe::VEC # reference to the global memory for the force vector
thread_id::Int32 # local thread id
end

function Ferrite.CellIterator(dh_::Ferrite.LocalsGPUDofHandler, n_basefuncs::Int32)
## cell iterator that uses global memory
dh = dh_ |> dofhandler
grid = get_grid(dh)
n_cells = grid |> getncells |> Int32
bd = blockDim().x
local_thread_id = threadIdx().x
global_thread_id = (blockIdx().x - Int32(1)) * bd + local_thread_id
ke = cellke(dh_, global_thread_id)
fe = cellfe(dh_, global_thread_id)
return CUDAGlobalCellIterator(dh, grid, n_cells, ke, fe, local_thread_id)
end


##### GPUCellCache #####

"""
Expand Down Expand Up @@ -96,6 +126,18 @@ end


function _makecache(iterator::CUDACellIterator, e::Int32)
ke_fun = () -> (@view iterator.block_ke[iterator.thread_id, :, :])
fe_fun = () -> (@view iterator.block_fe[iterator.thread_id, :])
return _makecache(iterator, e, ke_fun, fe_fun)
end

function _makecache(iterator::CUDAGlobalCellIterator, e::Int32)
ke_fun = () -> iterator.ke
fe_fun = () -> iterator.fe
return _makecache(iterator, e, ke_fun, fe_fun)
end

function _makecache(iterator::AbstractCUDACellIterator, e::Int32, ke_func::Function, fe_func::Function)
dh = iterator.dh
grid = iterator.grid
cellid = e
Expand All @@ -117,7 +159,7 @@ function _makecache(iterator::CUDACellIterator, e::Int32)
coords = SVector(x...)

# Return the GPUCellCache containing the cell's data.
return GPUCellCache(coords, dofs, cellid, nodes, (@view iterator.block_ke[iterator.thread_id, :, :]), (@view iterator.block_fe[iterator.thread_id, :, :]))
return GPUCellCache(coords, dofs, cellid, nodes, ke_func(), fe_func())
end

"""
Expand Down
8 changes: 5 additions & 3 deletions src/GPU/GPUDofHandler.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file defines the `GPUDofHandler` type, which is a degree of freedom handler that is stored on the GPU.
# Therefore most of the functions are same as the ones defined in dof_handler.jl, but executable on the GPU.

abstract type AbstractGPUDofHandler <: AbstractDofHandler end
abstract type AbstractGPUDofHandler <: Ferrite.AbstractDofHandler end

struct GPUDofHandler{CDOFS <: AbstractArray{<:Number, 1}, VEC_INT <: AbstractArray{Int32, 1}, GRID <: AbstractGrid} <: AbstractGPUDofHandler
cell_dofs::CDOFS
Expand Down Expand Up @@ -35,5 +35,7 @@ end

## Accessors ##
dofhandler(dh::LocalsGPUDofHandler) = dh.dh
cellke(dh::LocalsGPUDofHandler, e::Int32) = @view dh.Kes[e, :, :]
cellfe(dh::LocalsGPUDofHandler, e::Int32) = @view dh.fes[e, :]
localkes(dh::LocalsGPUDofHandler) = dh.Kes
localfes(dh::LocalsGPUDofHandler) = dh.fes
cellke(dh::LocalsGPUDofHandler, i::Int32) = @view dh.Kes[i, :, :]
cellfe(dh::LocalsGPUDofHandler, i::Int32) = @view dh.fes[i, :]
6 changes: 5 additions & 1 deletion src/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,4 +201,8 @@ export
getweights,
getbackend,
BackendCUDA,
BackendCPU
BackendCPU,
LocalsGPUDofHandler,
dofhandler,
localfes,
localkes

0 comments on commit c7f4b0f

Please sign in to comment.