Commit ee1f77c: init cpu multi threading
Abdelrahman912 committed Nov 4, 2024
1 parent a356d8d commit ee1f77c

Showing 10 changed files with 249 additions and 56 deletions.
29 changes: 20 additions & 9 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -10,15 +10,15 @@ using CUDA

left = Tensor{1,2,Float32}((0,-0)) # define the left bottom corner of the grid.

right = Tensor{1,2,Float32}((100.0,100.0)) # define the right top corner of the grid.
right = Tensor{1,2,Float32}((10.0,10.0)) # define the right top corner of the grid.


grid = generate_grid(Quadrilateral, (100, 100),left,right)

grid = generate_grid(Quadrilateral, (5, 5),left,right)


ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. Bilinear lagrange)

colors = create_coloring(grid)


qr = QuadratureRule{RefQuadrilateral}(Float32,2)
@@ -34,7 +34,7 @@ add!(dh, :u, ip)

close!(dh);


dh |> get_grid

# Standard assembly of the element.
function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
@@ -138,8 +138,10 @@ end

n_basefuncs = getnbasefunctions(cellvalues)

# Allocate CPU matrix
K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh);
## Allocate CPU matrix
## K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh);
K = allocate_matrix(SparseMatrixCSC{Float64, Int64},dh);
f = zeros(ndofs(dh));

# Allocate GPU matrix
## commented to pass the test
@@ -150,13 +152,22 @@ n_cells = dh |> get_grid |> getncells

# Kernel configuration
## commented to pass the test
## init_gpu_kernel(BackendCUDA,n_cells,n_basefuncs,assemble_gpu!, (Kgpu,fgpu, cellvalues, dh)) |> launch!

##init_kernel(BackendCUDA,n_cells,n_basefuncs,assemble_gpu!, (Kgpu,fgpu, cellvalues, dh)) |> launch!
cpu_kernel = init_kernel(BackendCPU,n_cells,n_basefuncs,assemble_gpu!, (K,f, cellvalues, dh));
cpu_kernel()

stassy(cv,dh) = assemble_global!(cv,dh,Val(false))


norm(K)
## commented to pass the test
## norm(Kgpu)
Kstd , Fstd = stassy(cellvalues,dh);
norm(Kstd)



for i in 1:10
    Threads.@threads for j in 1:4
        @show i, j
    end
end
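Taken together, the CPU path this tutorial now exercises boils down to the following sketch (same names as above; the GPU calls stay commented out until the tests allow them):

K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh)
f = zeros(ndofs(dh))
cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh))
cpu_kernel()                          # launches the colored, multi-threaded CPU assembly
Kstd, Fstd = stassy(cellvalues, dh)   # standard sequential assembly for comparison
norm(K), norm(Kstd)                   # compare the two norms to sanity-check the kernel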
13 changes: 4 additions & 9 deletions ext/GPU/CUDAKernelLauncher.jl
@@ -1,8 +1,8 @@


function Ferrite.init_gpu_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<: Integer}
function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<: Integer}
if CUDA.functional()
return GPUKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
else
throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
end
@@ -17,13 +17,7 @@ Launch a CUDA kernel with the given configuration.
Arguments:
- `kernel_config`: The `CUDAKernelLauncher` object containing the higher-level fields for the kernel configuration.
"""
function Ferrite.launch!(kernel::GPUKernel{Ti}) where Ti
backend = kernel |> getbackend
_launch_kernel!(backend, kernel)
end


function _launch_kernel!(::Type{BackendCUDA}, kernel::GPUKernel{Ti}) where Ti
function Ferrite.launch!(kernel::LazyKernel{Ti,BackendCUDA}) where Ti
n_cells = kernel.n_cells
n_basefuncs = kernel.n_basefuncs
ker = kernel.kernel
@@ -36,6 +30,7 @@ function _launch_kernel!(::Type{BackendCUDA}, kernel::GPUKernel{Ti}) where Ti
kernel(args...; threads, blocks, shmem=shared_mem)
end


function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
return sizeof(Float32) * (threads) * ( n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs
end
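A quick worked example of the sizing formula above, with assumed numbers (256 threads per block and 4 base functions, i.e. a bilinear quadrilateral): each thread gets a 4×4 Float32 element matrix plus a length-4 Float32 element vector in dynamic shared memory.

_calculate_shared_memory(256, 4)
# = 4 * 256 * 4 * 4  +  4 * 256 * 4
# = 16384 + 4096
# = 20480 bytes of dynamic shared memory per block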
4 changes: 2 additions & 2 deletions ext/GPU/cuda_iterator.jl
@@ -7,7 +7,7 @@ Create `CUDACellIterator` object for each thread with local id `thread_id` in or
on the GPU and these elements are associated with the thread based on a stride = `blockDim().x * gridDim().x`.
The elements of the iterator are `GPUCellCache` objects.
"""
struct CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractGPUCellIterator
struct CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
dh::DH # TODO: subdofhandlers are not supported yet.
grid::GRID
n_cells::Int32
@@ -85,7 +85,7 @@ Arguments:
- `ke`: View into shared memory for the cell's stiffness matrix.
- `fe`: View into shared memory for the cell's force vector.
"""
struct GPUCellCache{DOFS <: AbstractVector{Int32},NN,NODES <: SVector{NN,Int32},X, COORDS<: SVector{X},KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractGPUCellCache
struct GPUCellCache{DOFS <: AbstractVector{Int32},NN,NODES <: SVector{NN,Int32},X, COORDS<: SVector{X},KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellCache
coords::COORDS
dofs::DOFS
cellid::Int32
6 changes: 4 additions & 2 deletions src/Ferrite.jl
@@ -169,10 +169,12 @@ include("deprecations.jl")
include("docs.jl")

# GPU support
include("GPU/GPUKernelLauncher.jl")
include("GPU/coloring_dof.jl")
include("GPU/KernelLauncher.jl")
include("GPU/CPUKernelLauncher.jl")
include("GPU/gpu_grid.jl")
include("GPU/GPUDofHandler.jl")
include("GPU/gpu_iterator.jl")
include("GPU/parallel_iterator.jl")


end # module
51 changes: 51 additions & 0 deletions src/GPU/CPUKernelLauncher.jl
@@ -0,0 +1,51 @@
function init_kernel(::Type{BackendCPU}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<:Integer}
    return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCPU)
end

function launch!(kernel::LazyKernel{Ti,BackendCPU}) where Ti
    ker = kernel.kernel
    args = kernel.args
    ## Naive workaround for the CellValues issue: on the GPU we use the static
    ## version of CellValues because it is immutable, so to keep a unified
    ## parallel-kernel interface we (for now) search the args for any CellValues
    ## and convert it to the static version without changing the kernel routine.
    cell_index = findfirst(x -> x isa CellValues, args)
    (cell_index === nothing) || (args = _update_cell_args(args, cell_index))
    args, color_dh = _to_colordh(args) # wrap the dofhandler in a ColoringDofHandler
    no_colors = ncolors(color_dh)
    nthreads = Threads.nthreads()
    for i in 1:no_colors
        current_color!(color_dh, i)
        @show "color" i, current_color(color_dh)
        Threads.@threads for j in 1:nthreads
            ker(args...)
        end
    end
end
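The coloring is what keeps the threaded assembly race-free: cells that share degrees of freedom never end up in the same color, so concurrent scatter-adds into the global K and f cannot collide. Note also that launch! does not split the cells itself: each of the nthreads tasks calls the same ker(args...), and the kernel body is expected to work only through the cells of the currently active color, striding by the number of threads (presumably via the iterator in src/GPU/parallel_iterator.jl, added in this commit). A purely illustrative sketch of that per-task pattern, using the ColoringDofHandler accessors rather than the actual iterator API, where `t` is a hypothetical task index between 1 and nthreads:

cells = eles_in_color(color_dh, current_color(color_dh))
for idx in t:Threads.nthreads():length(cells)
    cell = cells[idx]
    ## assemble the local Ke and fe for `cell`, then add them into the global K and f
end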


function _to_colordh(args::Tuple)
    dh_index = findfirst(x -> x isa AbstractDofHandler, args)
    dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
    arr = args |> collect
    color_dh = init_colordh(arr[dh_index])
    arr[dh_index] = color_dh
    return Tuple(arr), color_dh
end

function _update_cell_args(args::Tuple, index::Int)
    ## tuples are immutable, so convert to an array to update the value,
    ## then convert back to a tuple
    arr = args |> collect
    arr[index] = _to_static_cellvalues(arr[index])
    return Tuple(arr)
end


function _to_static_cellvalues(cv::CellValues)
    fv = StaticInterpolationValues(cv.fun_values)
    gm = StaticInterpolationValues(cv.geo_mapping)
    weights = ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))
    return Ferrite.StaticCellValues(fv, gm, weights)
end
21 changes: 12 additions & 9 deletions src/GPU/GPUKernelLauncher.jl → src/GPU/KernelLauncher.jl
@@ -5,8 +5,8 @@ and backends, serving as a foundation for GPU-accelerated computations.
=#

### Abstract Types ###
abstract type AbstractGPUKernel end
abstract type AbstractGPUBackend end
abstract type AbstractKernel end
abstract type AbstractBackend end


### Functions ###
@@ -28,7 +28,7 @@ kernel function, and additional arguments.
This function needs to be implemented for each specific backend. Calling this function
without a concrete implementation will raise an error.
"""
function init_gpu_kernel(backend::AbstractGPUBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
function init_kernel(backend::AbstractBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
throw(ErrorException("A concrete implementation of init_gpu_kernel is required"))
end

@@ -45,15 +45,15 @@ mechanism for running GPU-accelerated computations across different GPU backends
This function must be implemented for specific GPU kernels. If not implemented,
an error will be thrown.
"""
function launch!(kernel::AbstractGPUKernel)
function launch!(kernel::AbstractKernel)
throw(ErrorException("A concrete implementation of launch! is required"))
end


### Concrete Types ###

"""
GPUKernel{Ti}(n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple, backend::Type{<:AbstractGPUBackend})
LazyKernel{Ti}(n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple, backend::Type{<:AbstractBackend})
Represents a high-level interface to a GPU backend for configuring and launching GPU kernels.
It stores the necessary parameters for kernel execution, such as the number of cells,
@@ -69,13 +69,15 @@ number of base functions, the kernel function, and any additional arguments.
# Type Parameters
- `Ti`: An integer type representing the number type used for `n_cells` and `n_basefuncs`.
"""
struct GPUKernel{Ti} <: AbstractGPUKernel
struct LazyKernel{Ti,BKD<:AbstractBackend} <: AbstractKernel
n_cells::Ti # Number of cells
n_basefuncs::Ti # Number of base functions
kernel::Function # Kernel function to execute
args::Tuple # Arguments for the kernel function
backend::Type{<:AbstractGPUBackend} # GPU backend
backend::Type{BKD} # GPU backend
end

(ker::LazyKernel)() = launch!(ker)
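This functor definition is what lets the tutorial launch a configured kernel by simply calling it:

cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh))
cpu_kernel()   # equivalent to launch!(cpu_kernel)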

"""
getbackend(kernel::GPUKernel) -> Type{<:AbstractGPUBackend}
@@ -88,7 +90,7 @@ Returns the backend associated with the given `GPUKernel`.
# Returns
The backend type associated with the kernel.
"""
getbackend(kernel::GPUKernel) = kernel.backend
getbackend(kernel::LazyKernel) = kernel.backend


### GPU Backend ###
@@ -99,4 +101,5 @@
Represents the CUDA backend for GPU acceleration. This type serves as a concrete
implementation of `AbstractGPUBackend` for executing GPU computations using CUDA.
"""
struct BackendCUDA <: AbstractGPUBackend end
struct BackendCUDA <: AbstractBackend end
struct BackendCPU <: AbstractBackend end
18 changes: 18 additions & 0 deletions src/GPU/coloring_dof.jl
@@ -0,0 +1,18 @@
mutable struct ColoringDofHandler{Ti<:Integer,VECS<:Vector{Vector{Ti}},DH<:AbstractDofHandler}
    dh::DH
    colors::VECS
    current_color::Ti
end

function init_colordh(dh::AbstractDofHandler)
    grid = get_grid(dh)
    colors = create_coloring(grid)
    ColoringDofHandler(dh, colors, 0)
end

dofhandler(cd::ColoringDofHandler) = cd.dh
colors(cd::ColoringDofHandler) = cd.colors
eles_in_color(cd::ColoringDofHandler, color::Ti) where {Ti<:Integer} = cd.colors[color]
current_color(cd::ColoringDofHandler) = cd.current_color
current_color!(cd::ColoringDofHandler, color::Ti) where {Ti<:Integer} = (cd.current_color = color)
ncolors(cd::ColoringDofHandler) = cd |> colors |> length
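A minimal usage sketch of the new wrapper, mirroring how the CPU launch! above drives it:

color_dh = init_colordh(dh)             # wrap the DofHandler and color its grid
for c in 1:ncolors(color_dh)
    current_color!(color_dh, c)         # mark color `c` as active
    cells = eles_in_color(color_dh, c)  # cell ids that can be assembled concurrently
end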
22 changes: 0 additions & 22 deletions src/GPU/gpu_iterator.jl

This file was deleted.
