Commit ee1f77c: init cpu multi threading
Abdelrahman912 committed Nov 4, 2024
1 parent a356d8d commit ee1f77c

Showing 10 changed files with 249 additions and 56 deletions.
29 changes: 20 additions & 9 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -10,15 +10,15 @@ using CUDA

left = Tensor{1,2,Float32}((0,-0)) # define the left bottom corner of the grid.

right = Tensor{1,2,Float32}((100.0,100.0)) # define the right top corner of the grid.
right = Tensor{1,2,Float32}((10.0,10.0)) # define the right top corner of the grid.


grid = generate_grid(Quadrilateral, (100, 100),left,right)

grid = generate_grid(Quadrilateral, (5, 5),left,right)


ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. Bilinear lagrange)

colors = create_coloring(grid)


qr = QuadratureRule{RefQuadrilateral}(Float32,2)
@@ -34,7 +34,7 @@ add!(dh, :u, ip)

close!(dh);


dh |> get_grid

# Standard assembly of the element.
function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
@@ -138,8 +138,10 @@ end

n_basefuncs = getnbasefunctions(cellvalues)

# Allocate CPU matrix
K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh);
## Allocate CPU matrix
## K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh);
K = allocate_matrix(SparseMatrixCSC{Float64, Int64},dh);
f = zeros(ndofs(dh));

# Allocate GPU matrix
## commented to pass the test
@@ -150,13 +152,22 @@ n_cells = dh |> get_grid |> getncells

# Kernel configuration
## commented to pass the test
## init_gpu_kernel(BackendCUDA,n_cells,n_basefuncs,assemble_gpu!, (Kgpu,fgpu, cellvalues, dh)) |> launch!

##init_kernel(BackendCUDA,n_cells,n_basefuncs,assemble_gpu!, (Kgpu,fgpu, cellvalues, dh)) |> launch!
cpu_kernel = init_kernel(BackendCPU,n_cells,n_basefuncs,assemble_gpu!, (K,f, cellvalues, dh));
cpu_kernel()

stassy(cv,dh) = assemble_global!(cv,dh,Val(false))


norm(K)
## commented to pass the test
## norm(Kgpu)
Kstd , Fstd = stassy(cellvalues,dh);
norm(Kstd)



for i in 1:10
    Threads.@threads for j in 1:4
        @show i, j
    end
end
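Taken together, the CPU path this tutorial now exercises boils down to the following sketch (same names as above; the GPU calls stay commented out until the tests allow them):

K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh)
f = zeros(ndofs(dh))
cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh))
cpu_kernel()                          # launches the colored, multi-threaded CPU assembly
Kstd, Fstd = stassy(cellvalues, dh)   # standard sequential assembly for comparison
norm(K), norm(Kstd)                   # compare the two norms to sanity-check the kernel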
13 changes: 4 additions & 9 deletions ext/GPU/CUDAKernelLauncher.jl
@@ -1,8 +1,8 @@


function Ferrite.init_gpu_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<: Integer}
function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<: Integer}
if CUDA.functional()
return GPUKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
else
throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
end
@@ -17,13 +17,7 @@ Launch a CUDA kernel with the given configuration.
Arguments:
- `kernel_config`: The `CUDAKernelLauncher` object containing the higher-level fields for the kernel configuration.
"""
function Ferrite.launch!(kernel::GPUKernel{Ti}) where Ti
backend = kernel |> getbackend
_launch_kernel!(backend, kernel)
end


function _launch_kernel!(::Type{BackendCUDA}, kernel::GPUKernel{Ti}) where Ti
function Ferrite.launch!(kernel::LazyKernel{Ti,BackendCUDA}) where Ti
n_cells = kernel.n_cells
n_basefuncs = kernel.n_basefuncs
ker = kernel.kernel
@@ -36,6 +30,7 @@ function _launch_kernel!(::Type{BackendCUDA}, kernel::GPUKernel{Ti}) where Ti
kernel(args...; threads, blocks, shmem=shared_mem)
end


function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
return sizeof(Float32) * (threads) * ( n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs
end
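A quick worked example of the sizing formula above, with assumed numbers (256 threads per block and 4 base functions, i.e. a bilinear quadrilateral): each thread gets a 4×4 Float32 element matrix plus a length-4 Float32 element vector in dynamic shared memory.

_calculate_shared_memory(256, 4)
# = 4 * 256 * 4 * 4  +  4 * 256 * 4
# = 16384 + 4096
# = 20480 bytes of dynamic shared memory per block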
4 changes: 2 additions & 2 deletions ext/GPU/cuda_iterator.jl
@@ -7,7 +7,7 @@ Create `CUDACellIterator` object for each thread with local id `thread_id` in or
on the GPU and these elements are associated with the thread based on a stride = `blockDim().x * gridDim().x`.
The elements of the iterator are `GPUCellCache` objects.
"""
struct CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractGPUCellIterator
struct CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
dh::DH # TODO: subdofhandlers are not supported yet.
grid::GRID
n_cells::Int32
@@ -85,7 +85,7 @@ Arguments:
- `ke`: View into shared memory for the cell's stiffness matrix.
- `fe`: View into shared memory for the cell's force vector.
"""
struct GPUCellCache{DOFS <: AbstractVector{Int32},NN,NODES <: SVector{NN,Int32},X, COORDS<: SVector{X},KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractGPUCellCache
struct GPUCellCache{DOFS <: AbstractVector{Int32},NN,NODES <: SVector{NN,Int32},X, COORDS<: SVector{X},KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellCache
coords::COORDS
dofs::DOFS
cellid::Int32
6 changes: 4 additions & 2 deletions src/Ferrite.jl
@@ -169,10 +169,12 @@ include("deprecations.jl")
include("docs.jl")

# GPU support
include("GPU/GPUKernelLauncher.jl")
include("GPU/coloring_dof.jl")
include("GPU/KernelLauncher.jl")
include("GPU/CPUKernelLauncher.jl")
include("GPU/gpu_grid.jl")
include("GPU/GPUDofHandler.jl")
include("GPU/gpu_iterator.jl")
include("GPU/parallel_iterator.jl")


end # module
51 changes: 51 additions & 0 deletions src/GPU/CPUKernelLauncher.jl
@@ -0,0 +1,51 @@
function init_kernel(::Type{BackendCPU}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<:Integer}
    return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCPU)
end

function launch!(kernel::LazyKernel{Ti,BackendCPU}) where Ti
    ker = kernel.kernel
    args = kernel.args
    ## Naive workaround for the CellValues issue: on the GPU we use the static
    ## version of CellValues because it is immutable, so to keep a unified
    ## parallel-kernel interface we (for now) search the args for any CellValues
    ## and convert it to the static version without changing the kernel routine.
    cell_index = findfirst(x -> x isa CellValues, args)
    (cell_index === nothing) || (args = _update_cell_args(args, cell_index))
    args, color_dh = _to_colordh(args) # wrap the dofhandler in a ColoringDofHandler
    no_colors = ncolors(color_dh)
    nthreads = Threads.nthreads()
    for i in 1:no_colors
        current_color!(color_dh, i)
        @show "color" i, current_color(color_dh)
        Threads.@threads for j in 1:nthreads
            ker(args...)
        end
    end
end
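The coloring is what keeps the threaded assembly race-free: cells that share degrees of freedom never end up in the same color, so concurrent scatter-adds into the global K and f cannot collide. Note also that launch! does not split the cells itself: each of the nthreads tasks calls the same ker(args...), and the kernel body is expected to work only through the cells of the currently active color, striding by the number of threads (presumably via the iterator in src/GPU/parallel_iterator.jl, added in this commit). A purely illustrative sketch of that per-task pattern, using the ColoringDofHandler accessors rather than the actual iterator API, where `t` is a hypothetical task index between 1 and nthreads:

cells = eles_in_color(color_dh, current_color(color_dh))
for idx in t:Threads.nthreads():length(cells)
    cell = cells[idx]
    ## assemble the local Ke and fe for `cell`, then add them into the global K and f
end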


function _to_colordh(args::Tuple)
    dh_index = findfirst(x -> x isa AbstractDofHandler, args)
    dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
    arr = args |> collect
    color_dh = init_colordh(arr[dh_index])
    arr[dh_index] = color_dh
    return Tuple(arr), color_dh
end

function _update_cell_args(args::Tuple, index::Int)
    ## tuples are immutable, so convert to an array to update the value,
    ## then convert back to a tuple
    arr = args |> collect
    arr[index] = _to_static_cellvalues(arr[index])
    return Tuple(arr)
end


function _to_static_cellvalues(cv::CellValues)
    fv = StaticInterpolationValues(cv.fun_values)
    gm = StaticInterpolationValues(cv.geo_mapping)
    weights = ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))
    return Ferrite.StaticCellValues(fv, gm, weights)
end
21 changes: 12 additions & 9 deletions src/GPU/GPUKernelLauncher.jl → src/GPU/KernelLauncher.jl
@@ -5,8 +5,8 @@ and backends, serving as a foundation for GPU-accelerated computations.
=#

### Abstract Types ###
abstract type AbstractGPUKernel end
abstract type AbstractGPUBackend end
abstract type AbstractKernel end
abstract type AbstractBackend end


### Functions ###
@@ -28,7 +28,7 @@ kernel function, and additional arguments.
This function needs to be implemented for each specific backend. Calling this function
without a concrete implementation will raise an error.
"""
function init_gpu_kernel(backend::AbstractGPUBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
function init_kernel(backend::AbstractBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
throw(ErrorException("A concrete implementation of init_gpu_kernel is required"))
end

@@ -45,15 +45,15 @@ mechanism for running GPU-accelerated computations across different GPU backends
This function must be implemented for specific GPU kernels. If not implemented,
an error will be thrown.
"""
function launch!(kernel::AbstractGPUKernel)
function launch!(kernel::AbstractKernel)
throw(ErrorException("A concrete implementation of launch! is required"))
end


### Concrete Types ###

"""
GPUKernel{Ti}(n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple, backend::Type{<:AbstractGPUBackend})
LazyKernel{Ti}(n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple, backend::Type{<:AbstractBackend})
Represents a high-level interface to a GPU backend for configuring and launching GPU kernels.
It stores the necessary parameters for kernel execution, such as the number of cells,
@@ -69,13 +69,15 @@ number of base functions, the kernel function, and any additional arguments.
# Type Parameters
- `Ti`: An integer type representing the number type used for `n_cells` and `n_basefuncs`.
"""
struct GPUKernel{Ti} <: AbstractGPUKernel
struct LazyKernel{Ti,BKD<:AbstractBackend} <: AbstractKernel
n_cells::Ti # Number of cells
n_basefuncs::Ti # Number of base functions
kernel::Function # Kernel function to execute
args::Tuple # Arguments for the kernel function
backend::Type{<:AbstractGPUBackend} # GPU backend
backend::Type{BKD} # GPU backend
end

(ker::LazyKernel)() = launch!(ker)
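This functor definition is what lets the tutorial launch a configured kernel by simply calling it:

cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh))
cpu_kernel()   # equivalent to launch!(cpu_kernel)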

"""
getbackend(kernel::GPUKernel) -> Type{<:AbstractGPUBackend}
@@ -88,7 +90,7 @@ Returns the backend associated with the given `GPUKernel`.
# Returns
The backend type associated with the kernel.
"""
getbackend(kernel::GPUKernel) = kernel.backend
getbackend(kernel::LazyKernel) = kernel.backend


### GPU Backend ###
@@ -99,4 +101,5 @@
Represents the CUDA backend for GPU acceleration. This type serves as a concrete
implementation of `AbstractGPUBackend` for executing GPU computations using CUDA.
"""
struct BackendCUDA <: AbstractGPUBackend end
struct BackendCUDA <: AbstractBackend end
struct BackendCPU <: AbstractBackend end
18 changes: 18 additions & 0 deletions src/GPU/coloring_dof.jl
@@ -0,0 +1,18 @@
mutable struct ColoringDofHandler{Ti<:Integer,VECS<:Vector{Vector{Ti}},DH<:AbstractDofHandler}
    dh::DH
    colors::VECS
    current_color::Ti
end

function init_colordh(dh::AbstractDofHandler)
    grid = get_grid(dh)
    colors = create_coloring(grid)
    ColoringDofHandler(dh, colors, 0)
end

dofhandler(cd::ColoringDofHandler) = cd.dh
colors(cd::ColoringDofHandler) = cd.colors
eles_in_color(cd::ColoringDofHandler, color::Ti) where {Ti<:Integer} = cd.colors[color]
current_color(cd::ColoringDofHandler) = cd.current_color
current_color!(cd::ColoringDofHandler, color::Ti) where {Ti<:Integer} = (cd.current_color = color)
ncolors(cd::ColoringDofHandler) = cd |> colors |> length
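A minimal usage sketch of the new wrapper, mirroring how the CPU launch! above drives it:

color_dh = init_colordh(dh)             # wrap the DofHandler and color its grid
for c in 1:ncolors(color_dh)
    current_color!(color_dh, c)         # mark color `c` as active
    cells = eles_in_color(color_dh, c)  # cell ids that can be assembled concurrently
end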
22 changes: 0 additions & 22 deletions src/GPU/gpu_iterator.jl

This file was deleted.
