Cuda heat example w quaditer #913
base: master
@@ -0,0 +1,254 @@
using Ferrite, CUDA
using StaticArrays
using Adapt

left = Tensor{1,2,Float64}((0, -0))         # left bottom corner of the grid
right = Tensor{1,2,Float64}((400.0, 400.0)) # right top corner of the grid
grid = generate_grid(Quadrilateral, (100, 100), left, right);

ip = Lagrange{RefQuadrilateral, 1}() # interpolation function (i.e. bilinear Lagrange)

# Define the numerical integration rule
# (i.e. integrating over a quadrilateral with two quadrature points per direction).
qr = QuadratureRule{RefQuadrilateral}(2)
cellvalues = CellValues(qr, ip);
static_cellvalues = Ferrite.StaticCellValues(cellvalues, Val(true));
size(static_cellvalues.fv.Nξ, 1)
# Notes about cell values regarding the GPU:
# 1. fun_values & geo_mapping in CellValues are not isbits objects, so they cannot be put on the GPU.
# 2. fv & gm in StaticCellValues are isbits objects, so they can be put on the GPU.
# 3. StaticCellValues can be made a bitstype by removing the x property from it.
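## Editorial sketch (not part of this PR): the three notes above can be checked directly with
## `isbits`; the field names `fun_values`, `fv`, and `x` are the ones referenced in the notes.
isbits(cellvalues.fun_values) # expected false (note 1)
isbits(static_cellvalues.fv)  # expected true  (note 2)
isbits(static_cellvalues)     # expected false as long as the coordinates x are stored (note 3)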

dh = DofHandler(grid)
add!(dh, :u, ip)
close!(dh);

# Standard assembly of the element.
function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
    n_basefuncs = getnbasefunctions(cellvalues)

    # Loop over quadrature points
    for q_point in 1:getnquadpoints(cellvalues)
        # Get the quadrature weight
        dΩ = getdetJdV(cellvalues, q_point)
        # Loop over test shape functions
        for i in 1:n_basefuncs
            δu = shape_value(cellvalues, q_point, i)
            ∇δu = shape_gradient(cellvalues, q_point, i)
            # Add contribution to fe
            fe[i] += δu * dΩ
            # Loop over trial shape functions
            for j in 1:n_basefuncs
                ∇u = shape_gradient(cellvalues, q_point, j)
                # Add contribution to Ke
                Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
            end
        end
    end
    return Ke, fe
end

# Element assembly using static cell values (PR #883).
function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues)
    n_basefuncs = getnbasefunctions(cellvalues)
    ## Loop over quadrature points
    for qv in Ferrite.QuadratureValuesIterator(cellvalues)
        ## Get the quadrature weight
        dΩ = getdetJdV(qv)
        ## Loop over test shape functions
        for i in 1:n_basefuncs
            δu = shape_value(qv, i)
            ∇δu = shape_gradient(qv, i)
            ## Add contribution to fe
            fe[i] += δu * dΩ
            ## Loop over trial shape functions
            for j in 1:n_basefuncs
                ∇u = shape_gradient(qv, j)
                ## Add contribution to Ke
                Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
            end
        end
    end
    return Ke, fe
end

function create_buffers(cellvalues, dh)
    f = zeros(ndofs(dh))
    K = create_sparsity_pattern(dh)
    assembler = start_assemble(K, f)
    ## Local quantities
    n_basefuncs = getnbasefunctions(cellvalues)
    Ke = zeros(n_basefuncs, n_basefuncs)
    fe = zeros(n_basefuncs)
    return (; f, K, assembler, Ke, fe)
end

# Standard global assembly.
function assemble_global!(cellvalues, dh::DofHandler, qp_iter::Val{QPiter}) where {QPiter}
    (; f, K, assembler, Ke, fe) = create_buffers(cellvalues, dh)
    # Loop over all cells
    for cell in CellIterator(dh)
        fill!(Ke, 0)
        fill!(fe, 0)
        if QPiter
            reinit!(cellvalues, getcoordinates(cell))
            assemble_element_qpiter!(Ke, fe, cellvalues)
        else
            # Reinitialize cellvalues for this cell
            reinit!(cellvalues, cell)
            # Compute element contribution
            assemble_element_std!(Ke, fe, cellvalues)
        end
        # Assemble Ke and fe into K and f
        assemble!(assembler, celldofs(cell), Ke, fe)
    end
    return K, f
end

# Helper function to get all the coordinates from the grid.
function get_all_coordinates(grid::Ferrite.AbstractGrid{dim}) where {dim}
    coords = Vector{Vec{2,Float32}}()
    n_cells = length(grid.cells)
    for i = 1:n_cells
        append!(coords, getcoordinates(grid, i))
    end
    coords
end

struct GPUGrid{sdim,V<:Vec{sdim,Float32},COORDS<:AbstractArray{V,1}} <: Ferrite.AbstractGrid{sdim}
    all_coords::COORDS
    n_cells::Int32
end

function GPUGrid(grid::Grid{sdim}) where sdim
    all_coords = cu(get_all_coordinates(grid))
    n_cells = Int32(length(grid.cells))
    GPUGrid(all_coords, n_cells)
end

struct GPUDofHandler{CDOFS<:AbstractArray{<:Number,1},GRID<:GPUGrid} <: Ferrite.AbstractDofHandler
    cell_dofs::CDOFS
    grid::GRID
end

function GPUDofHandler(dh::DofHandler)
    GPUDofHandler(cu(Int32.(dh.cell_dofs)), GPUGrid(dh.grid))
end

function Adapt.adapt_structure(to, grid::GPUGrid)
    all_coords = Adapt.adapt_structure(to, grid.all_coords)
    n_cells = Adapt.adapt_structure(to, grid.n_cells)
    GPUGrid(all_coords, n_cells)
end

function Adapt.adapt_structure(to, dh::GPUDofHandler)
    cell_dofs = Adapt.adapt_structure(to, dh.cell_dofs)
    grid = Adapt.adapt_structure(to, dh.grid)
    GPUDofHandler(cell_dofs, grid)
end

function Adapt.adapt_structure(to, cv::Ferrite.StaticCellValues)
    fv = Adapt.adapt_structure(to, cv.fv)
    gm = Adapt.adapt_structure(to, cv.gm)
    x = Adapt.adapt_structure(to, cu(cv.x))
    weights = Adapt.adapt_structure(to, cv.weights)
    Ferrite.StaticCellValues(fv, gm, x, weights)
end

# Manual check for the first cell and first quadrature point: build the Jacobian from the
# geometric shape-function gradients and the cell coordinates, then map a reference gradient.
gm = static_cellvalues.gm
x = get_all_coordinates(grid)
J = gm.dNdξ[1,1] ⊗ x[1] + gm.dNdξ[2,1] ⊗ x[2] + gm.dNdξ[3,1] ⊗ x[3] + gm.dNdξ[4,1] ⊗ x[4]
det(J)
inv_J = inv(J)
inv_J ⋅ static_cellvalues.fv.dNdξ[1,1]

function getjacobian(gm::Ferrite.StaticInterpolationValues, x, qr)
    n_basefuncs = size(gm.Nξ, 1)
    J = gm.dNdξ[1,qr] ⊗ x[1]
    for i = 2:n_basefuncs
        J += gm.dNdξ[i,qr] ⊗ x[i]
    end
    return J
end

function assemble_element_gpu!(Kgpu, cv::Ferrite.StaticCellValues, dh::GPUDofHandler)
    tx = threadIdx().x
    bx = blockIdx().x
    bd = blockDim().x
    e = tx + (bx - 1) * bd # e is the current element index.
    n_cells = dh.grid.n_cells
    e ≤ n_cells || return nothing
    n_qr = length(cv.weights)
    n_basefuncs = size(cv.fv.Nξ, 1)
    dofs = dh.cell_dofs
    x = dh.grid.all_coords
    for qr = 1:n_qr # loop over quadrature points # TODO: propagate_inbounds
        si = (e - 1) * n_basefuncs
        #J = gm.dNdξ[1,qr] ⊗ x[si+1] + gm.dNdξ[2,qr] ⊗ x[si+2] + gm.dNdξ[3,qr] ⊗ x[si+3] + gm.dNdξ[4,qr] ⊗ x[si+4]
        cell_x = @view x[si+1:si+size(cv.gm.Nξ, 1)]
        J = getjacobian(cv.gm, cell_x, qr)
        inv_J = inv(J)
        #@cushow det(J)
        @inbounds dΩ = det(J) * cv.weights[qr]
        for i = 1:n_basefuncs
            @inbounds ∇δu = inv_J ⋅ cv.fv.dNdξ[i,qr]
            for j = 1:n_basefuncs
                @inbounds ∇u = inv_J ⋅ cv.fv.dNdξ[j,qr]
                @inbounds ig = dofs[(e-1)*n_basefuncs + i]
                @inbounds jg = dofs[(e-1)*n_basefuncs + j]
                # Atomic, because many threads might write to the same memory address at the same time.
                CUDA.@atomic Kgpu[ig, jg] += (∇δu ⋅ ∇u) * dΩ
            end
        end
    end
    return nothing
end

Kgpu = CUDA.zeros(dh.ndofs.x, dh.ndofs.x)
gpu_dh = GPUDofHandler(dh)

Review comment: Maybe that's the plan anyway, but wouldn't it be nicer to write

    function Adapt.adapt_structure(to, dh::DofHandler)
        return adapt_structure(GPUDofHandler(cu(Int32.(dh.cell_dofs)), GPUGrid(dh.grid)))
    end

and then just use the "normal" structs from a user perspective, which are automatically converted when needed?

Reply: I am still a bit split about this, because it contains a performance pitfall. If we have repeated assembly, the dof handler will be converted and copied to the GPU for each assembly instead of only once.

Reply: Am I missing something, or wouldn't the adapt call also happen for each assembly kernel launch for the …

Reply: This should not happen, because we do not need to adapt the GPUDofHandler (it is already a GPU data structure).
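A minimal sketch of the trade-off discussed in this thread (editorial, not part of the PR), assuming the GPUDofHandler constructor above and the assemble_global_gpu! launcher defined later in this file: converting the dof handler once and reusing the GPU structure avoids re-uploading it on every assembly, which is the pitfall of adapting a CPU DofHandler inside every kernel launch.

    gpu_dh = GPUDofHandler(dh)       # one-time CPU -> GPU conversion
    for _ in 1:10                    # repeated assemblies reuse the same GPU data
        fill!(Kgpu, 0)               # reset the global matrix on the device
        assemble_global_gpu!(Kgpu)   # no per-assembly conversion of the dof handler
    end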

function assemble_global_gpu!(Kgpu)
    kernel = @cuda launch=false assemble_element_gpu!(Kgpu, static_cellvalues, gpu_dh)
    config = launch_configuration(kernel.fun)
    threads = min(length(grid.cells), config.threads)
    blocks = cld(length(grid.cells), threads)
    kernel(Kgpu, static_cellvalues, gpu_dh; threads=threads, blocks=blocks)
end

stassy(cv, dh) = assemble_global!(cv, dh, Val(false))

qpassy(cv, dh) = assemble_global!(cv, dh, Val(true))

using BenchmarkTools
using LinearAlgebra

assemble_global_gpu!(Kgpu)

Kgpu
norm(Kgpu)

Kstd, Fstd = stassy(cellvalues, dh);

norm(Kstd)

cvs = Ferrite.StaticCellValues(cellvalues, Val(true))

Kqp, Fqp = qpassy(cvs, dh);

norm(Kqp)
Review comment: Maybe this is still a work in progress, but the mix of functions and global variables makes it kind of confusing to read the code.
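One possible refactor along the lines of this comment (an editorial sketch, not part of the PR): pass the cell values, the GPU dof handler, and the cell count to the launcher explicitly, so the kernel launch no longer captures global variables.

    function assemble_global_gpu!(Kgpu, cv::Ferrite.StaticCellValues, gpu_dh::GPUDofHandler, n_cells::Int)
        kernel = @cuda launch=false assemble_element_gpu!(Kgpu, cv, gpu_dh)
        config = launch_configuration(kernel.fun)
        threads = min(n_cells, config.threads)
        blocks = cld(n_cells, threads)
        kernel(Kgpu, cv, gpu_dh; threads=threads, blocks=blocks)
    end

    # Usage: assemble_global_gpu!(Kgpu, static_cellvalues, gpu_dh, length(grid.cells))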