Skip to content

Commit

Permalink
Rework the package, and add POCL-based CPU CI (#210)
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt authored Aug 9, 2024
1 parent 488704f commit 30113b6
Show file tree
Hide file tree
Showing 68 changed files with 4,171 additions and 3,010 deletions.
8 changes: 5 additions & 3 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
steps:
- label: "Julia 1.6 - CUDA"
- label: "CUDA"
plugins:
- JuliaCI/julia#v1:
version: "1.6"
version: "1.10"
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/
command: "mkdir -p /etc/OpenCL/vendors && echo libnvidia-opencl.so.1 > /etc/OpenCL/vendors/nvidia.icd"
env:
JULIA_OPENCL_BACKEND: "CUDA"
OCL_ICD_FILENAMES: "libnvidia-opencl.so.1"
timeout_in_minutes: 60
37 changes: 37 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: CI

on:
push:
branches: [master]
tags: ["*"]
pull_request:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
version: ['1.10', 'pre']
os: ['ubuntu-latest', 'macOS-latest']
arch: [x64]
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: julia-actions/cache@v2
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: lcov.info
12 changes: 3 additions & 9 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
name = "OpenCL"
uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
version = "0.9.0"
version = "0.10.0"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[compat]
OpenCL_jll = "2022.9.23"
julia = "1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
OpenCL_jll = "2024.5.8"
julia = "1.10"
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ b = rand(Float32, 50_000)

device, ctx, queue = cl.create_compute_context()

a_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
b_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
c_buff = cl.Buffer(Float32, ctx, :w, length(a))
a_buff = cl.Buffer(Float32, ctx, length(a), (:r, :copy), hostbuf=a)
b_buff = cl.Buffer(Float32, ctx, length(b), (:r, :copy), hostbuf=b)
c_buff = cl.Buffer(Float32, ctx, length(a), :w)

p = cl.Program(ctx, source=sum_kernel) |> cl.build!
k = cl.Kernel(p, "sum")
Expand Down Expand Up @@ -146,7 +146,7 @@ Here's a rough translation between the OpenCL API in C to this Julia version. Op
| `clGetKernelInfo` | `cl.info(kernel, :symbol)` | Kernel info: `:name`, `:num_args`, `:reference_count`, `:program`, `:attributes` |
| `clEnqueueNDRangeKernel` | `cl.enqueue_kernel(queue, kernel, global_work_size)`, `cl.enqueue_kernel(queue, kernel, global_work_size, local_work_size; global_work_offset, wait_on)` | |
| `clSetKernelArg` | `cl.set_arg!(kernel, idx, arg)` | `idx` starts at 1 |
| `clCreateUserEvent` | `cl.UserEvent(ctx; retain)` | |
| `clCreateUserEvent` | `cl.UserEvent(ctx; retain)` | |
| `clGetEventInfo` | `cl.info(event, :symbol)` | Event info: `:context`, `:command_queue`, `:reference_count`, `:command_type`, `:status`, `:profile_start`, `:profile_end`, `:profile_queued`, `:profile_submit`, `:profile_duration` |
| `clWaitForEvents` | `cl.wait(event)`, `cl.wait(events)` | |
| `clEnqueueMarkerWithWaitList` | `cl.enqueue_marker_with_wait_list(queue, wait_for)` | |
Expand Down
6 changes: 3 additions & 3 deletions examples/demo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ device, ctx, queue = cl.create_compute_context()

# create opencl buffer objects
# copies to the device initiated when the kernel function is called
a_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
b_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
c_buff = cl.Buffer(Float32, ctx, :w, length(a))
a_buff = cl.Buffer(Float32, ctx, length(a), (:r, :copy); hostbuf=a)
b_buff = cl.Buffer(Float32, ctx, length(b), (:r, :copy); hostbuf=b)
c_buff = cl.Buffer(Float32, ctx, length(a), :w)

# build the program and construct a kernel object
p = cl.Program(ctx, source=sum_kernel_src) |> cl.build!
Expand Down
14 changes: 7 additions & 7 deletions examples/hands_on_opencl/ex04/vadd_chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ h_g = rand(Float32, LENGTH)
# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}

# Create the input (a, b, e, g) arrays in device memory and copy data from host
d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
d_e = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_e)
d_g = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_g)
d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
d_e = cl.Buffer(Float32, ctx, length(h_e), (:r, :copy), hostbuf=h_e)
d_g = cl.Buffer(Float32, ctx, length(h_g), (:r, :copy), hostbuf=h_g)
# Create the output (c, d, f) array in device memory
d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
d_d = cl.Buffer(Float32, ctx, :w, LENGTH)
d_f = cl.Buffer(Float32, ctx, :w, LENGTH)
# argument order is (eltype, context, length, flags): LENGTH is the element
# count and :w is the access-flag symbol
d_c = cl.Buffer(Float32, ctx, LENGTH, :w)
d_d = cl.Buffer(Float32, ctx, LENGTH, :w)
d_f = cl.Buffer(Float32, ctx, LENGTH, :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")
Expand Down
8 changes: 4 additions & 4 deletions examples/hands_on_opencl/ex05/vadd_abc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ h_a = rand(Float32, LENGTH)
h_b = rand(Float32, LENGTH)
h_c = rand(Float32, LENGTH)

d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
d_c = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_c)
d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
d_c = cl.Buffer(Float32, ctx, length(h_c), (:r, :copy), hostbuf=h_c)

# create the output (r) buffer in device memory
d_r = cl.Buffer(Float32, ctx, :w, LENGTH)
d_r = cl.Buffer(Float32, ctx, LENGTH, :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")
Expand Down
6 changes: 3 additions & 3 deletions examples/hands_on_opencl/ex06/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ ctx = cl.create_some_context()
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)

prg = cl.Program(ctx, source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")
Expand Down
6 changes: 3 additions & 3 deletions examples/hands_on_opencl/ex07/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ ctx = cl.create_some_context()
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... Naive
Expand Down
6 changes: 3 additions & 3 deletions examples/hands_on_opencl/ex08/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ ctx = cl.create_some_context()
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... Naive
Expand Down
2 changes: 1 addition & 1 deletion examples/hands_on_opencl/ex09/pi_ocl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ h_psum = Vector{Float32}(undef, nwork_groups)
println("$nwork_groups work groups of size $work_group_size.")
println("$nsteps integration steps")

d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
d_partial_sums = cl.Buffer(Float32, ctx, length(h_psum), :w)

# start timer
rtime = time()
Expand Down
2 changes: 1 addition & 1 deletion examples/hands_on_opencl/exA/pi_vocl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ h_psum = Vector{Float32}(undef, nwork_groups)
println("$nwork_groups work groups of size $work_group_size.")
println("$nsteps integration steps")

d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
d_partial_sums = cl.Buffer(Float32, ctx, length(h_psum), :w)

# start timer
rtime = time()
Expand Down
4 changes: 2 additions & 2 deletions examples/notebooks/Transpose.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@
" (\"block\", enqueue_block_kernel, block_kernel))\n",
" for s in array_sizes \n",
" src = rand(Float32, (s, s))\n",
" a_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)\n",
" a_t_buf = cl.Buffer(Float32, ctx, :w, length(src))\n",
" a_buf = cl.Buffer(Float32, ctx, length(src), (:r, :copy), hostbuf=src)\n",
" a_t_buf = cl.Buffer(Float32, ctx, length(src), :w)\n",
" \n",
" # warm up....\n",
" for i in 1:4\n",
Expand Down
14 changes: 7 additions & 7 deletions examples/notebooks/julia_set_fractal.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -295,16 +295,16 @@
"julia_source = \"\n",
"\n",
"__kernel void julia(__global float2 *q,\n",
" __global ushort *output, \n",
" __global ushort *output,\n",
" ushort const maxiter)\n",
"{\n",
" int gid = get_global_id(0);\n",
" float nreal = 0;\n",
" float real = q[gid].x;\n",
" float imag = q[gid].y;\n",
" \n",
"\n",
" output[gid] = 0;\n",
" \n",
"\n",
" for(int curiter = 0; curiter < maxiter; curiter++) {\n",
" if (real*real + imag*imag > 4.0f) {\n",
" output[gid] = curiter;\n",
Expand Down Expand Up @@ -341,15 +341,15 @@
"\n",
" out = Array{UInt16}(size(q))\n",
"\n",
" q_buff = cl.Buffer(Complex64, ctx, (:r, :copy), hostbuf=q)\n",
" o_buff = cl.Buffer(UInt16, ctx, :w, length(out))\n",
" q_buff = cl.Buffer(Complex64, ctx, length(q), (:r, :copy), hostbuf=q)\n",
" o_buff = cl.Buffer(UInt16, ctx, length(out), :w)\n",
"\n",
" prg = cl.Program(ctx, source=julia_source) |> cl.build!\n",
" k = cl.Kernel(prg, \"julia\")\n",
" \n",
"\n",
" queue(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n",
" cl.copy!(queue, out, o_buff)\n",
" \n",
"\n",
" return out\n",
"end\n"
]
Expand Down
12 changes: 6 additions & 6 deletions examples/notebooks/mandelbrot_fractal.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"source": [
"mandel_source = \"\n",
"__kernel void mandelbrot(__global float2 *q,\n",
" __global ushort *output, \n",
" __global ushort *output,\n",
" ushort const maxiter)\n",
"{\n",
" int gid = get_global_id(0);\n",
Expand Down Expand Up @@ -98,18 +98,18 @@
"\n",
" out = Array{UInt16}(size(q))\n",
"\n",
" q_buff = cl.Buffer(Complex64, ctx, (:r, :copy), hostbuf=q)\n",
" o_buff = cl.Buffer(UInt16, ctx, :w, length(out))\n",
" q_buff = cl.Buffer(Complex64, ctx, length(q), (:r, :copy), hostbuf=q)\n",
" o_buff = cl.Buffer(UInt16, ctx, length(out), :w)\n",
"\n",
" prg = cl.Program(ctx, source=mandel_source) |> cl.build!\n",
" \n",
"\n",
" k = cl.Kernel(prg, \"mandelbrot\")\n",
" #cl.call(queue, k, length(out), nothing, q_buff, o_buff, uint16(maxiter))\n",
" queue(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n",
"\n",
"\n",
" cl.copy!(queue, out, o_buff)\n",
" \n",
"\n",
" return out\n",
"end"
]
Expand Down Expand Up @@ -150,7 +150,7 @@
" y2 = 1.0\n",
" x1 = -1.5\n",
" x2 = 0.5\n",
" \n",
"\n",
" q = Array{Complex64}(h, w)\n",
" for x in 1:w\n",
" for y in 1:h\n",
Expand Down
6 changes: 3 additions & 3 deletions examples/performance.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ function cl_performance(ndatapts::Integer, nworkers::Integer)
ctx = cl.Context(device)
queue = cl.CmdQueue(ctx, :profile)

a_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
b_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
c_buf = cl.Buffer(Float32, ctx, :w, length(a))
a_buf = cl.Buffer(Float32, ctx, length(a), (:r, :copy), hostbuf=a)
b_buf = cl.Buffer(Float32, ctx, length(b), (:r, :copy), hostbuf=b)
c_buf = cl.Buffer(Float32, ctx, length(a), :w)

prg = cl.Program(ctx, source=bench_kernel) |> cl.build!
kern = cl.Kernel(prg, "sum")
Expand Down
Loading

0 comments on commit 30113b6

Please sign in to comment.